In [1]:
pip install -q transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sklearn
from sklearn.datasets import load_files
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import zipfile
import re
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [3]:
# Load the CSV file
file_path = 'election_day_tweets_data/election_day_tweets.csv' 
elections_2016 = pd.read_csv(file_path)

elections_2016.sort_values(by='created_at').info()


<class 'pandas.core.frame.DataFrame'>
Index: 397629 entries, 162840 to 92778
Data columns (total 34 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   text                           397629 non-null  object 
 1   created_at                     397629 non-null  object 
 2   geo                            2564 non-null    object 
 3   lang                           397629 non-null  object 
 4   place                          30832 non-null   object 
 5   coordinates                    2564 non-null    object 
 6   user.favourites_count          397629 non-null  int64  
 7   user.statuses_count            397629 non-null  int64  
 8   user.description               353403 non-null  object 
 9   user.location                  302677 non-null  object 
 10  user.id                        397629 non-null  int64  
 11  user.created_at                397629 non-null  object 
 12  user.verified                  

In [4]:
# Function to clean text
def clean_text(text):
    if isinstance(text, str):  # Ensure input is a string
        # Remove special characters, numbers, and extra whitespace
        # Remove mentions
        text = re.sub(r"@\w+", "", text)
        # Remove hashtags (but keep the words if desired)
        text = re.sub(r"#", "", text)  # Remove only the '#' symbol
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
        text = re.sub(r'\s+', ' ', text)        # Replace multiple spaces with a single space
        text = text.strip()                     # Remove leading/trailing spaces
    return text

In [6]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [8]:
elections_2016['text'] = elections_2016['text'].str.lower()  # Convert to lowercase
print(elections_2016['text'].head())
# Apply the cleaning function to the 'text' column
elections_2016['cleaned_text'] = elections_2016['text'].apply(preprocess)
print(elections_2016[['cleaned_text']].head())
# Define specific words to search for
specific_words = ['hillary', 'clinton']

# Combine the words into a single regular expression pattern
pattern = '|'.join(rf'\b{word}\b' for word in specific_words)  # Matches whole words only

# Filter rows where the cleaned text contains any of the specific words
trump_tweets = elections_2016[elections_2016['cleaned_text'].str.contains('trump', case=False, na=False)].copy()
hillary_tweets = elections_2016[elections_2016['cleaned_text'].str.contains(pattern, flags=re.IGNORECASE, na=False)].copy()


# Save the filtered results
#filtered_elections_2016.to_csv('filtered_words2.csv', index=False)

# Display example Hillary Tweets
print("Example Hillary Tweets:")
print(hillary_tweets[['cleaned_text']].head())  # Show the first 5 tweets related to Hillary

# Display example Trump Tweets
print("\nExample Trump Tweets:")
print(trump_tweets[['cleaned_text']].head())  # Show the first 5 tweets related to Trump


0    .@lawrence @hillaryclinton two first  @senschu...
1    my @latimesopinion op-ed on historic #californ...
2    #senate wisconsin senate preview: johnson vs. ...
3    if rubio wins and #trump loses in #florida... ...
4    #senate wisconsin senate preview: johnson vs. ...
Name: text, dtype: object
                                        cleaned_text
0  .@lawrence @user two first  @user tomorrow. @u...
1  my @user op-ed on historic #california #senate...
2  #senate wisconsin senate preview: johnson vs. ...
3  if rubio wins and #trump loses in #florida... ...
4  #senate wisconsin senate preview: johnson vs. ...
Example Hillary Tweets:
                                         cleaned_text
26  even if #hillary wins, nothing will change unl...
46  agree, the after effect may humble gop even mo...
57  politico says democrats are on the brink of ta...
60  #senate franken hits the road for clinton in m...
62  #senate franken hits the road for clinton in m...

Example Trump Tweets:
          

In [9]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def compute_sentiment(text):
    if not isinstance(text, str) or text.strip() =="":
        return None
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # Assign labels to scores
    sentiment = {
        "negative": scores[0],
        "neutral": scores[1],
        "positive": scores[2],
    }
    return sentiment

In [None]:
trump_tweets['sentiment'] = trump_tweets['cleaned_text'].apply(compute_sentiment)
hillary_tweets['sentiment'] = hillary_tweets['cleaned_text'].apply(compute_sentiment)

print("\nExample Hillary Tweets with Sentiment:")
print(hillary_tweets[['cleaned_text', 'sentiment']].head())
print("\nExample Trump Tweets with Sentiment:")
print(trump_tweets[['cleaned_text', 'sentiment']].head())


In [None]:
encoded_input = tokenizer(elections_2016['cleaned_text'], return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)