In [77]:
import sklearn
from sklearn.datasets import load_files
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import zipfile
import re

In [78]:
# Download VADER's required data
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [79]:
# Example sentences
texts = [
    "I love this product, it's amazing!",
    "This is terrible, I hate it.",
    "Meh, it was okay but not great."
]

# Analyze each text
for text in texts:
    scores = analyzer.polarity_scores(text)
    print(f"Text: {text}")
    print(f"Scores: {scores}")

Text: I love this product, it's amazing!
Scores: {'neg': 0.0, 'neu': 0.266, 'pos': 0.734, 'compound': 0.8516}
Text: This is terrible, I hate it.
Scores: {'neg': 0.694, 'neu': 0.306, 'pos': 0.0, 'compound': -0.7783}
Text: Meh, it was okay but not great.
Scores: {'neg': 0.506, 'neu': 0.362, 'pos': 0.131, 'compound': -0.6299}


In [80]:
# Load the CSV file
file_path = 'election_day_tweets_data/election_day_tweets.csv'  # Replace with the actual file name
elections_2016 = pd.read_csv(file_path)

elections_2016.sort_values(by='created_at').info()


<class 'pandas.core.frame.DataFrame'>
Index: 397629 entries, 162840 to 92778
Data columns (total 34 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   text                           397629 non-null  object 
 1   created_at                     397629 non-null  object 
 2   geo                            2564 non-null    object 
 3   lang                           397629 non-null  object 
 4   place                          30832 non-null   object 
 5   coordinates                    2564 non-null    object 
 6   user.favourites_count          397629 non-null  int64  
 7   user.statuses_count            397629 non-null  int64  
 8   user.description               353403 non-null  object 
 9   user.location                  302677 non-null  object 
 10  user.id                        397629 non-null  int64  
 11  user.created_at                397629 non-null  object 
 12  user.verified                  

In [83]:
# Function to clean text
def clean_text(text):
    if isinstance(text, str):  # Ensure input is a string
        # Remove special characters, numbers, and extra whitespace
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
        text = re.sub(r'\s+', ' ', text)        # Replace multiple spaces with a single space
        text = text.strip()                     # Remove leading/trailing spaces
    return text

In [84]:
elections_2016['text'] = elections_2016['text'].str.lower()  # Convert to lowercase

# Apply the cleaning function to the 'text' column
elections_2016['cleaned_text'] = elections_2016['text'].apply(clean_text)
#print(elections_2016[['cleaned_text']].head())

# Define specific words to search for
specific_words = ['hillary', 'clinton']

# Combine the words into a single regular expression pattern
pattern = '|'.join(rf'\b{word}\b' for word in specific_words)  # Matches whole words only

# Filter rows where the cleaned text contains any of the specific words
hillary_tweets = elections_2016[elections_2016['cleaned_text'].str.contains(pattern, flags=re.IGNORECASE, na=False)]
# Filter tweets related to Trump
trump_tweets = elections_2016[elections_2016['cleaned_text'].str.contains('trump', case=False, na=False)]

# Save the filtered results
filtered_elections_2016.to_csv('filtered_words2.csv', index=False)

# Display example Hillary Tweets
print("Example Hillary Tweets:")
print(hillary_tweets[['cleaned_text']].head())  # Show the first 5 tweets related to Hillary

# Display example Trump Tweets
print("\nExample Trump Tweets:")
print(trump_tweets[['cleaned_text']].head())  # Show the first 5 tweets related to Trump


Example Hillary Tweets:


KeyError: "['sentiment', 'compound_score'] not in index"

In [None]:
#elections_2016['text'] = elections_2016['text'].str.lower()

#elections_2016['text'] = elections_2016['text'].astype(str)  # Convert 'text' column to string data type

#elections_2016['tokens'] = elections_2016['text'].apply(nltk.word_tokenize)  # Tokenization

#elections_2016 = elections_2016.head(100)  # This selects the first 100 rows


# Remove stopwords
#stopwords = nltk.corpus.stopwords.words('english')
#elections_2016['tokens'] = elections_2016['tokens'].apply(lambda x: [word for word in x if word not in stopwords])
#elections_2016['text']


In [None]:

# Analyze sentiment for each row
elections_2016['sentiment_scores'] = elections_2016['cleaned_text'].apply(lambda x: analyzer.polarity_scores(x))
elections_2016['compound_score'] = elections_2016['sentiment_scores'].apply(lambda x: x['compound'])
elections_2016['sentiment'] = elections_2016['compound_score'].apply(
    lambda x: "Positive" if x > 0.05 else "Negative" if x < -0.05 else "Neutral"
)

# Save the results
elections_2016.to_csv('election_sentiment_results.csv', index=False)

In [None]:
# Example Positive Tweets
positive_tweets = hillary_tweets[hillary_tweets['sentiment'] == 'Positive']
print("Example Positive Hillary Tweets:")
print(positive_tweets[['cleaned_text', 'sentiment', 'compound_score']].head())  # Show the first 5 positive tweets

# Example Negative Tweets
negative_tweets = hillary_tweets[hillary_tweets['sentiment'] == 'Negative']
print("\nExample Hillary Negative Tweets:")
print(negative_tweets[['cleaned_text', 'sentiment', 'compound_score']].head())  # Show the first 5 negative tweets

# Example Neutral Tweets
neutral_tweets = hillary_tweets[hillary_tweets['sentiment'] == 'Neutral']
print("\nExample Hillary Neutral Tweets:")
print(neutral_tweets[['cleaned_text', 'sentiment', 'compound_score']].head())  # Show the first 5 neutral tweets


In [None]:
# Example Positive Tweets
positive_tweets = trump_tweets[trump_tweets['sentiment'] == 'Positive']
print("Example Positive Trump Tweets:")
print(positive_tweets[['cleaned_text', 'sentiment', 'compound_score']].head())  # Show the first 5 positive tweets

# Example Negative Tweets
negative_tweets = trump_tweets[trump_tweets['sentiment'] == 'Negative']
print("\nExample Trump Negative Tweets:")
print(negative_tweets[['cleaned_text', 'sentiment', 'compound_score']].head())  # Show the first 5 negative tweets

# Example Neutral Tweets
neutral_tweets = trump_tweets[trump_tweets['sentiment'] == 'Neutral']
print("\nExample Trump Neutral Tweets:")
print(neutral_tweets[['cleaned_text', 'sentiment', 'compound_score']].head())  # Show the first 5 neutral tweets
