In [25]:
#pip install -q transformers

In [26]:
import sklearn
from sklearn.datasets import load_files
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import zipfile
import re
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [27]:
# Load the CSV file
file_path = 'election_day_tweets_data/election_day_tweets.csv' 
elections_2016 = pd.read_csv(file_path)

elections_2016.sort_values(by='created_at').info()


<class 'pandas.core.frame.DataFrame'>
Index: 397629 entries, 162840 to 92778
Data columns (total 34 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   text                           397629 non-null  object 
 1   created_at                     397629 non-null  object 
 2   geo                            2564 non-null    object 
 3   lang                           397629 non-null  object 
 4   place                          30832 non-null   object 
 5   coordinates                    2564 non-null    object 
 6   user.favourites_count          397629 non-null  int64  
 7   user.statuses_count            397629 non-null  int64  
 8   user.description               353403 non-null  object 
 9   user.location                  302677 non-null  object 
 10  user.id                        397629 non-null  int64  
 11  user.created_at                397629 non-null  object 
 12  user.verified                  

In [28]:
# Function to clean text
def clean_text(text):
    if isinstance(text, str):  # Ensure input is a string
        # Remove special characters, numbers, and extra whitespace
        # Remove mentions
        text = re.sub(r"@\w+", "", text)
        # Remove hashtags (but keep the words if desired)
        text = re.sub(r"#", "", text)  # Remove only the '#' symbol
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
        text = re.sub(r'\s+', ' ', text)        # Replace multiple spaces with a single space
        text = text.strip()                     # Remove leading/trailing spaces
    return text

In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        #t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = re.sub(r"#", "", t)
        t = re.sub(r"#", "", t)  # Remove only the '#' symbol
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [30]:
elections_2016['text'] = elections_2016['text'].str.lower()  # Convert to lowercase
print(elections_2016['text'].head())
# Apply the cleaning function to the 'text' column
elections_2016['cleaned_text'] = elections_2016['text'].apply(preprocess)
print(elections_2016[['cleaned_text']].head())
# Define specific words to search for
specific_words = ['hillary', 'clinton']

# Combine the words into a single regular expression pattern
pattern = '|'.join(rf'\b{word}\b' for word in specific_words)  # Matches whole words only

# Filter rows where the cleaned text contains any of the specific words
trump_tweets = elections_2016[elections_2016['cleaned_text'].str.contains('trump', case=False, na=False)].copy()
hillary_tweets = elections_2016[elections_2016['cleaned_text'].str.contains(pattern, flags=re.IGNORECASE, na=False)].copy()


# Save the filtered results
#filtered_elections_2016.to_csv('filtered_words2.csv', index=False)

# Display example Hillary Tweets
print("Example Hillary Tweets:")
print(hillary_tweets[['cleaned_text']].head())  # Show the first 5 tweets related to Hillary

# Display example Trump Tweets
print("\nExample Trump Tweets:")
print(trump_tweets[['cleaned_text']].head())  # Show the first 5 tweets related to Trump


0      .@lawrence @hillaryclinton two first  @senschumer tomorrow. @thelastword #brooklyn  therealamerica #vote #democrats #nastywomenvote #senate
1    my @latimesopinion op-ed on historic #california #senate race. first time an elected woman senator succeeds another.\nhttps://t.co/cbjqtk0q1v
2                                        #senate wisconsin senate preview: johnson vs. feingold, the sequel https://t.co/xhq4p0v4el @senronjohnson
3           if rubio wins and #trump loses in #florida... #hillaryclinton #senate #republicanprimary #senaterace #miami... https://t.co/zienecvnmo
4                                #senate wisconsin senate preview: johnson vs. feingold, the sequel https://t.co/vsd6arfme5 senronjohnson nta•news
Name: text, dtype: object
                                                                                                                        cleaned_text
0                    .@lawrence @user two first  @user tomorrow. @user brooklyn  therealamerica vote democ

In [31]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
def compute_sentiment(text):
    if not isinstance(text, str) or text.strip() =="":
        return None
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # Assign labels to scores
    sentiment = {
        "negative": scores[0],
        "neutral": scores[1],
        "positive": scores[2],
    }
    return sentiment

In [33]:
trump_tweets_sample = trump_tweets.sample(10)  # Random 10 samples
hillary_tweets_sample = hillary_tweets.sample(10)
trump_tweets_sample['sentiment'] = trump_tweets_sample['cleaned_text'].apply(compute_sentiment)
hillary_tweets_sample['sentiment'] = hillary_tweets_sample['cleaned_text'].apply(compute_sentiment)

# Ensure full text is displayed
pd.set_option('display.max_colwidth', None)

print("\nExample Hillary Tweets with Sentiment:")
print(hillary_tweets_sample[['cleaned_text', 'sentiment']].head(10))
print("\nExample Trump Tweets with Sentiment:")
print(trump_tweets_sample[['cleaned_text', 'sentiment']].head(10))



Example Hillary Tweets with Sentiment:
                                                                                                                                         cleaned_text  \
251224                                                                  election2016 votehillary walks in like beyoncé "where's hillary's name " http   
186751     obama telling people to vote 4 hillary because she's a girl is not a good enough reason, it actually goes against feminism... election2016   
361864                                                 election2016 america votes: now a nation decides between hillary clinton and donald trump http   
39386    same ppl saying hillary gonna win are now the same ones saying if trump wins congress won't let him do anything! clearly we can't doubt him!   
61649     understanding that trump has to go through congress but just the fact he wants to do those things also not saying hillary is any better lol   
260708                                    

In [34]:
import re
import spacy

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

# Function to preprocess
def preprocess_tweet(tweet):
    # Remove URLs, mentions, hashtags
    tweet = re.sub(r"http\S+|@\S+|#\S+", "", tweet)
    tweet = re.sub(r"[^a-zA-Z\s]", "", tweet)  # Remove non-alphanumeric
    tweet = tweet.lower().strip()
    return tweet

# Example
tweet = "Check out @HillaryClinton's speech here: https://example.com #Election2016"
clean_tweet = preprocess_tweet(tweet)
print(clean_tweet)

# Named Entity Recognition to find entities
doc = nlp(clean_tweet)
entities = [ent.text for ent in doc.ents]
print(entities)


check out  speech here
[]


In [35]:
# Extract the dependency parse for "Trump"
for token in doc:
    if token.text.lower() == "trump":
        print(f"Token: {token.text}, Head: {token.head.text}, Children: {[child.text for child in token.children]}")


In [42]:
from transformers import pipeline

# Load a zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define candidate labels
labels = ["positive", "neutral", "negative"]

# Example tweet with target entity
tweet = "I Love Clinton. I hate Trump"
result = classifier(tweet, candidate_labels=labels)

print(result)
# Output: {'sequence': ..., 'labels': ['positive', 'neutral', 'negative'], 'scores': [0.7, 0.2, 0.1]}


{'sequence': 'I Love Clinton. I hate Trump', 'labels': ['positive', 'negative', 'neutral'], 'scores': [0.6290384531021118, 0.24697545170783997, 0.1239861473441124]}


In [43]:
import pandas as pd

print(tweet)
# Example data
data = {"entity": ["Trump", "Clinton", "Trump"], "sentiment": ["positive", "negative", "neutral"]}
df = pd.DataFrame(data)

# Aggregate sentiment counts
sentiment_summary = df.groupby("entity")["sentiment"].value_counts().unstack(fill_value=0)
print(sentiment_summary)


I Love Clinton. I hate Trump
sentiment  negative  neutral  positive
entity                                
Clinton           1        0         0
Trump             0        1         1


In [38]:
#trump_tweets['sentiment'] = trump_tweets['cleaned_text'].apply(compute_sentiment)
#hillary_tweets['sentiment'] = hillary_tweets['cleaned_text'].apply(compute_sentiment)

print("\nExample Hillary Tweets with Sentiment:")
print(hillary_tweets[['cleaned_text', 'sentiment']].head())
print("\nExample Trump Tweets with Sentiment:")
print(trump_tweets[['cleaned_text', 'sentiment']].head())



Example Hillary Tweets with Sentiment:


KeyError: "['sentiment'] not in index"

In [None]:
encoded_input = tokenizer(elections_2016['cleaned_text'], return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)