Imports and load models

In [7]:
#pip install transformers[sentencepiece]

import sklearn
from sklearn.datasets import load_files
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import zipfile
import spacy
import re
import torch
from transformers import pipeline
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from transformers import pipeline

# Checkpoint identifier for the aspect-based sentiment analysis (ABSA) model;
# the tokenizer and the classifier are loaded from the same identifier.
absa_model_path = "yangheng/deberta-v3-base-absa-v1.1"
absa_tokenizer = AutoTokenizer.from_pretrained(absa_model_path)
absa_model = AutoModelForSequenceClassification.from_pretrained(absa_model_path)

# Whole-sentence sentiment model, wrapped in a transformers pipeline.
# The same checkpoint identifier is used for both the model and its tokenizer.
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = pipeline(
    "sentiment-analysis",
    model=sentiment_model_path,
    tokenizer=sentiment_model_path,
)



Example of how the code works


In [9]:
# Demo sentence mentioning two aspects ("food" and "service").
sentence = "We had a great experience at the restaurant, food was delicious, but " \
  "the service was kinda bad"
print(f"Sentence: {sentence}")
print()

# Score every aspect with the ABSA model. A single loop replaces the two
# copy-pasted stanzas that differed only in the aspect string.
for aspect in ("food", "service"):
    inputs = absa_tokenizer(f"[CLS] {sentence} [SEP] {aspect} [SEP]", return_tensors="pt")
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = absa_model(**inputs)
    probs = F.softmax(outputs.logits, dim=1)
    probs = probs.detach().numpy()[0]  # first (only) row of the batch
    print(f"Sentiment of aspect '{aspect}' is:")
    # Logit indices 0/1/2 are labelled negative/neutral/positive here.
    for prob, label in zip(probs, ["negative", "neutral", "positive"]):
        print(f"Label {label}: {prob}")
    print()

# Overall sentiment of the whole sentence (single label + score).
sentiment = sentiment_model([sentence])[0]
print(f"Overall sentiment: {sentiment['label']} with score {sentiment['score']}")

Sentence: We had a great experience at the restaurant, food was delicious, but the service was kinda bad

Sentiment of aspect 'food' is:
Label negative: 0.000998911913484335
Label neutral: 0.0018238150514662266
Label positive: 0.997177243232727

Sentiment of aspect 'service' is:
Label negative: 0.9946129322052002
Label neutral: 0.0023699868470430374
Label positive: 0.003017081180587411

Overall sentiment: negative with score 0.7706007361412048


Function for analysing sentiment of different aspects 

In [34]:
# def analyze_aspects_sentiment(sentence, aspects, absa_tokenizer, absa_model):
#     for aspect in aspects:
#         # Tokenize the input
#         inputs = absa_tokenizer(f"[CLS] {sentence} [SEP] {aspect} [SEP]", return_tensors="pt")
        
#         # Get the model outputs
#         outputs = absa_model(**inputs)
        
#         # Apply softmax to obtain probabilities
#         probs = F.softmax(outputs.logits, dim=1)
#         probs = probs.detach().numpy()[0]  # Convert tensor to numpy array
        
#         # Print the sentiment probabilities
#         print(f"Sentiment of aspect '{aspect}' is:")
#         for prob, label in zip(probs, ["negative", "neutral", "positive"]):
#             print(f"  {label.capitalize()}: {prob:.4f}")
#         print()  # Add a newline for better readability
import torch.nn.functional as F

def analyze_aspects_sentiment(sentence, aspects, absa_tokenizer, absa_model, sentiment_model):
    """Print and collect per-aspect sentiment, then print the overall sentiment.

    Args:
        sentence: Text to analyse.
        aspects: Iterable of aspect strings (e.g. "food", "service"). An aspect
            is only scored when it occurs in ``sentence`` as a case-insensitive
            substring; otherwise a notice is printed and the aspect is skipped.
        absa_tokenizer: Callable that turns the prompt string into the keyword
            arguments for ``absa_model`` (called with ``return_tensors="pt"``).
        absa_model: Callable returning an object with a ``logits`` tensor whose
            columns are read as negative / neutral / positive.
        sentiment_model: Callable taking a list of sentences and returning one
            dict per sentence with ``'label'`` and ``'score'`` keys.

    Returns:
        dict mapping each scored aspect to
        ``{"negative": p0, "neutral": p1, "positive": p2}``. The overall
        sentence sentiment is printed but not included in the return value.
    """
    sentiment_results = {}
    # Loop-invariant: lower-case the sentence once for the membership checks.
    sentence_lower = sentence.lower()

    # Analyze sentiment for each aspect (e.g., food, service)
    for aspect in aspects:
        if aspect.lower() not in sentence_lower:
            print(f"Aspect '{aspect}' not found in the sentence.")
            continue  # Skip sentiment analysis for this aspect

        inputs = absa_tokenizer(f"[CLS] {sentence} [SEP] {aspect} [SEP]", return_tensors="pt")
        # Pure inference: disable autograd so no graph is built for the forward pass.
        with torch.no_grad():
            outputs = absa_model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        probs = probs.detach().numpy()[0]  # first (only) row of the batch

        # Format aspect sentiment result
        sentiment_results[aspect] = {
            "negative": probs[0],
            "neutral": probs[1],
            "positive": probs[2]
        }

        # Print the sentiment of the aspect in the requested format
        print(f"Sentiment of aspect '{aspect}' is:")
        for label, prob in sentiment_results[aspect].items():
            print(f"Label {label}: {prob}")
        print()

    # Overall sentiment of the sentence
    sentiment = sentiment_model([sentence])[0]
    print(f"Overall sentiment: {sentiment['label']} with score {sentiment['score']}")

    return sentiment_results

Load dataset

In [11]:
# Read the election-day tweets from the CSV file into a DataFrame
file_path = 'election_day_tweets_data/election_day_tweets.csv'
elections_2016 = pd.read_csv(filepath_or_buffer=file_path)

# Print the column summary of a copy sorted by 'created_at'
# (sort_values returns a new frame; elections_2016 itself keeps its order)
elections_2016.sort_values('created_at').info()

<class 'pandas.core.frame.DataFrame'>
Index: 397629 entries, 162840 to 92778
Data columns (total 34 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   text                           397629 non-null  object 
 1   created_at                     397629 non-null  object 
 2   geo                            2564 non-null    object 
 3   lang                           397629 non-null  object 
 4   place                          30832 non-null   object 
 5   coordinates                    2564 non-null    object 
 6   user.favourites_count          397629 non-null  int64  
 7   user.statuses_count            397629 non-null  int64  
 8   user.description               353403 non-null  object 
 9   user.location                  302677 non-null  object 
 10  user.id                        397629 non-null  int64  
 11  user.created_at                397629 non-null  object 
 12  user.verified                  

Preprocess tweets

In [18]:
# Preprocess text (strip '#' symbols and links, normalise case/whitespace)
def preprocess(text):
    """Clean a tweet token by token.

    The text is split on single spaces and each token is processed as follows:
      1. every '#' character is removed (the hashtag word itself is kept);
      2. anything starting with ``http`` or ``www`` up to the next whitespace
         is removed;
      3. the token is lower-cased and stripped of surrounding whitespace.

    Tokens that become empty (e.g. a bare URL) are kept as empty strings, so
    the number of single-space separators is unchanged. '@' mentions are left
    as-is.

    Args:
        text: The raw tweet text (``str``).

    Returns:
        The cleaned tokens joined back together with single spaces.
    """
    new_text = []
    for t in text.split(" "):
        # Remove only the '#' symbol, keeping the tag text
        t = t.replace("#", "")
        # Remove links (URLs); 'http\S+' already covers 'https...'
        t = re.sub(r'http\S+|www\S+', '', t)
        # Normalise BEFORE appending; previously this ran after the append
        # and therefore never affected the returned text.
        new_text.append(t.lower().strip())

    return " ".join(new_text)

Test on some tweets:

In [32]:
# Pandas display configuration: do not truncate cell text, do not elide rows
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Lower-case the raw tweets (overwrites the 'text' column) and preview them
elections_2016['text'] = elections_2016['text'].str.lower()
print(elections_2016['text'].head())

# Derive the cleaned column by running preprocess() on every tweet
elections_2016['cleaned_text'] = elections_2016['text'].apply(preprocess)
print(elections_2016[['cleaned_text']].head())

# Keywords of interest
specific_words = ['hillary', 'clinton']
our_specific_words = ['hillary', 'clinton', 'trump']

# One alternation per keyword list; \b...\b restricts matches to whole words
pattern = '|'.join([rf'\b{word}\b' for word in specific_words])
pattern2 = '|'.join([rf'\b{word}\b' for word in our_specific_words])

# Independent copies of the rows whose cleaned text mentions the keywords.
# Note: 'trump' is matched as a plain substring, whereas the other two
# filters use the whole-word patterns built above.
trump_tweets = elections_2016.loc[
    elections_2016['cleaned_text'].str.contains('trump', case=False, na=False)
].copy()
hillary_tweets = elections_2016.loc[
    elections_2016['cleaned_text'].str.contains(pattern, flags=re.IGNORECASE, na=False)
].copy()
interesting_tweets = elections_2016.loc[
    elections_2016['cleaned_text'].str.contains(pattern2, flags=re.IGNORECASE, na=False)
].copy()
# Save the filtered results
#filtered_elections_2016.to_csv('filtered_words2.csv', index=False)

# First five Hillary-related tweets
print("Example Hillary Tweets:")
print(hillary_tweets[['cleaned_text']].head())

# First five Trump-related tweets
print("\nExample Trump Tweets:")
print(trump_tweets[['cleaned_text']].head())


0      .@lawrence @hillaryclinton two first  @senschumer tomorrow. @thelastword #brooklyn  therealamerica #vote #democrats #nastywomenvote #senate
1    my @latimesopinion op-ed on historic #california #senate race. first time an elected woman senator succeeds another.\nhttps://t.co/cbjqtk0q1v
2                                        #senate wisconsin senate preview: johnson vs. feingold, the sequel https://t.co/xhq4p0v4el @senronjohnson
3           if rubio wins and #trump loses in #florida... #hillaryclinton #senate #republicanprimary #senaterace #miami... https://t.co/zienecvnmo
4                                #senate wisconsin senate preview: johnson vs. feingold, the sequel https://t.co/vsd6arfme5 senronjohnson nta•news
Name: text, dtype: object
                                                                                                                             cleaned_text
0  .@lawrence @hillaryclinton two first  @senschumer tomorrow. @thelastword brooklyn  therealamerica 

Actual test:

In [36]:
# Keyword lists passed as "aspects" to analyze_aspects_sentiment
trump_keywords = ["trump"]
hillary_keywords = ["hillary", "clinton"]
all_keywords = ["trump", "hillary", "clinton"]

# Fixed seed so that Restart Kernel -> Run All draws the same tweets every
# time; without it each run analyses a different random sample.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42

# Sample tweets for Trump, Hillary, and the combined keyword set
trump_tweets_sample = trump_tweets.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
hillary_tweets_sample = hillary_tweets.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
interesting_tweets_sample = interesting_tweets.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Apply keyword-specific sentiment analysis
# print("Trump tweets and sentiment")
# for _, tweet in trump_tweets_sample.iterrows():  # Use iterrows to loop over rows
#     print(f"Tweet: {tweet['cleaned_text']}")
#     sentiment_results_trump = analyze_aspects_sentiment(
#         tweet['cleaned_text'], trump_keywords, absa_tokenizer, absa_model, sentiment_model
#     )
#     print()

# print("Hillary tweets and sentiment")
# for _, tweet in hillary_tweets_sample.iterrows():  # Use iterrows to loop over rows
#     print(f"Tweet: {tweet['cleaned_text']}")
#     sentiment_results_hillary = analyze_aspects_sentiment(
#         tweet['cleaned_text'], hillary_keywords, absa_tokenizer, absa_model, sentiment_model
#     )
#     print()

# Per-aspect and overall sentiment for every sampled tweet.
# Only the 'cleaned_text' column is needed, so iterate over it directly
# instead of materialising a Series per row with iterrows().
print("Overall tweets and sentiment")
for cleaned_text in interesting_tweets_sample['cleaned_text']:
    print(f"Tweet: {cleaned_text}")
    sentiment_results_tweets = analyze_aspects_sentiment(
        cleaned_text, all_keywords, absa_tokenizer, absa_model, sentiment_model
    )
    print()

Overall tweets and sentiment
Tweet: donald trump elected as 45th us president, defeating democrat nominee hillaryclinton , per us media
Sentiment of aspect 'trump' is:
Label negative: 0.02339293248951435
Label neutral: 0.5201062560081482
Label positive: 0.456500768661499

Sentiment of aspect 'hillary' is:
Label negative: 0.8718752264976501
Label neutral: 0.12266005575656891
Label positive: 0.0054647307842969894

Sentiment of aspect 'clinton' is:
Label negative: 0.864477813243866
Label neutral: 0.12955081462860107
Label positive: 0.0059714107774198055

Overall sentiment: negative with score 0.5205474495887756

Tweet: borthers and sisters, please get out and vote for hillary clinton imwithher2016 
Aspect 'trump' not found in the sentence.
Sentiment of aspect 'hillary' is:
Label negative: 0.0034507703967392445
Label neutral: 0.017090870067477226
Label positive: 0.9794583320617676

Sentiment of aspect 'clinton' is:
Label negative: 0.005371887236833572
Label neutral: 0.059505634009838104
La