In [None]:
import joblib, pickle
import tweepy as tw
from textblob import TextBlob,Word
from TSA_pipeline import *
import snscrape.modules.twitter as sntwitter
import GetOldTweets3 as got
from datetime import datetime
from matplotlib.dates import date2num
from matplotlib import dates as mpl_dates
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,f1_score,precision_score
import numpy as np

### Load Data

In [None]:
# Read both hashtag CSV exports in full; column selection happens in a later cell.
data_DT, data_JB = (
    pd.read_csv(csv_path, lineterminator='\n', parse_dates=True)
    for csv_path in (
        'TSA_datasets/US Elections 2020/hashtag_donaldtrump.csv',
        'TSA_datasets/US Elections 2020/hashtag_joebiden.csv',
    )
)

In [None]:
# Project each raw frame down to the three columns the analysis uses.
tweets_DT, tweets_JB = (
    pd.DataFrame(data=raw_frame, columns=['created_at','tweet','user_followers_count'])
    for raw_frame in (data_DT, data_JB)
)

In [None]:
# Rename 'created_at' -> 'Timestamp' and 'tweet' -> 'text', then re-add a 'tweet'
# column holding a copy of the text.
tweets_DT, tweets_JB = (
    frame
    .rename(columns={'created_at':'Timestamp','tweet':'text'})
    .assign(tweet=lambda renamed: renamed['text'])
    for frame in (tweets_DT, tweets_JB)
)

### Make predictions

In [None]:
def subjectivity_filtering(df):
    """Keep only the rows whose 'subjectivity' score is at least 0.5.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'subjectivity' column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index labels are kept).
        Returning a copy lets callers add columns to the result without pandas
        raising SettingWithCopyWarning.
    """
    print('filtering data using subjectivity detection')
    return df.loc[df['subjectivity'] >= 0.5].copy()

In [None]:
def transform(df):
    """Turn the 'text' column into the feature matrix expected by the saved models.

    The vectorizer and the feature selector are unpickled from
    'transformers/twitter/'; both are only required to expose `.transform`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'text' and 'textblob' columns.

    Returns
    -------
    tuple
        (X_test, y_test): the selected feature matrix and the 'textblob' column.
    """
    print('transforming data')

    # NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
    # The files are opened in `with` blocks so the handles are closed deterministically.
    with open("transformers/twitter/vectorizer.pickle", "rb") as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    # astype(str) guards against non-string entries (e.g. NaN) in the text column
    feature_vector = vectorizer.transform(df['text'].astype(str))

    y_test = df['textblob']

    with open("transformers/twitter/selector.pickle", "rb") as selector_file:
        selector = pickle.load(selector_file)
    X_test = selector.transform(feature_vector)

    return X_test, y_test

In [None]:
def bert_predictions(df):
    """Add a 'bert' column with a sentiment score in [-1, 1] for every row of `df`.

    Each text is classified with 'nlptown/bert-base-multilingual-uncased-sentiment'
    and the index of the highest logit (0..4) is mapped onto -1, -0.5, 0, 0.5, 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of strings. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, returned for consistency with `textblob_predictions`.
    """
    print("Making predictions using bert-based model")
    # instantiate model
    tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
    model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

    # convert bert predictions between -1 and 1 as required for this comparative analysis
    bert_predictions_converter = {
        0:-1,
        1:-0.5,
        2:0,
        3:0.5,
        4:1
    }

    def predict_one(text):
        # Truncate to the 512-token limit so an unusually long input cannot crash the model.
        encoded = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)
        return bert_predictions_converter[int(torch.argmax(model(encoded).logits))]

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        df['bert'] = df['text'].apply(predict_one)
    return df

In [None]:
def textblob_predictions(df):
    """Add TextBlob 'textblob' (polarity) and 'subjectivity' columns to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'text' column of strings. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the two new columns.
    """
    print("Making TextBlob Predictions")
    # Analyse every tweet once and reuse the result for both columns
    # (sentiment[0] is the polarity, sentiment[1] the subjectivity).
    sentiments = [TextBlob(text).sentiment for text in df['text']]
    df['textblob'] = [sentiment[0] for sentiment in sentiments]
    df['subjectivity'] = [sentiment[1] for sentiment in sentiments]
    return df

In [None]:
# Trump tweets: TextBlob scores -> subjectivity filter -> cleaning -> features -> classifiers.
tweets_DT = textblob_predictions(tweets_DT)

# Drop tweets that TextBlob rates as mostly objective and report how many remain.
tweets_DT = subjectivity_filtering(tweets_DT)
print(len(tweets_DT))

tweets_DT = cleaning_data(tweets_DT)

# Feature matrix restricted to the selected features.
X_test, y_test = transform(tweets_DT)

# Classifiers trained on the Twitter dataset.
# (Alternative: the Amazon Reviews models live under 'models/final models/binary class/'
#  as LR.pk1, MNB.pk1, SVM.pk1 and MLP.pk1.)
models = {
    'LR': joblib.load('models/twitter/LR_new.pk1'),
    'MNB': joblib.load('models/twitter/MNB_new.pk1'),
    'SVM': joblib.load('models/twitter/SVM_new.pk1'),
    'MLP': joblib.load('models/twitter/MLP.pk1'),
}
# One 'predictions_<name>' column per classifier.
for model in models:
    predictions = models[model].predict(X_test)
    tweets_DT['predictions_{}'.format(model)] = predictions.tolist()

In [None]:
# make Textblob predictions
# The result is assigned back (as in the Trump cell) so this step does not silently
# depend on textblob_predictions mutating its argument in place.
tweets_JB = textblob_predictions(tweets_JB)

# apply subjectivity filtering
tweets_JB = subjectivity_filtering(tweets_JB)
print(len(tweets_JB))

# clean data
tweets_JB = cleaning_data(tweets_JB)

# transform into feature vector with selected features
X_test, y_test = transform(tweets_JB)

# Evaluation
models = {}
# models trained using the amazon Reviews Dataset
# models['LR'] = joblib.load('models/final models/binary class/LR.pk1')
# models['MNB'] = joblib.load('models/final models/binary class/MNB.pk1')
# models['SVM'] = joblib.load('models/final models/binary class/SVM.pk1')
# models['MLP'] = joblib.load('models/final models/binary class/MLP.pk1')

# models trained using the Twitter Dataset
models['LR'] = joblib.load('models/twitter/LR_new.pk1')
models['MNB'] = joblib.load('models/twitter/MNB_new.pk1')
models['SVM'] = joblib.load('models/twitter/SVM_new.pk1')
models['MLP'] = joblib.load('models/twitter/MLP.pk1')
# store each classifier's labels in its own 'predictions_<name>' column
for model in models.keys():
    predictions = models[model].predict(X_test)
    tweets_JB['predictions_{}'.format(model)] = predictions.tolist()

### Rolling Mean Graph

In [None]:
# Sort by timestamp so each rolling window covers consecutive tweets in time order.
tweets_DT = tweets_DT.sort_values(by='Timestamp',ascending=True)
polarity = {
    'positive':1,
    'negative':-1
}
rollingMean = 10000
rollingMeanTB = 10000
minPeriods = 100
# Numeric (+1 / -1) version of every classifier's label column.
tweets_DT = tweets_DT.assign(
    predictions1_LR=tweets_DT['predictions_LR'].apply(polarity.__getitem__),
    predictions1_MNB=tweets_DT['predictions_MNB'].apply(polarity.__getitem__),
    predictions1_SVM=tweets_DT['predictions_SVM'].apply(polarity.__getitem__),
    predictions1_MLP=tweets_DT['predictions_MLP'].apply(polarity.__getitem__),
)
# Rolling mean of each numeric column; TextBlob uses its own window length.
tweets_DT = tweets_DT.assign(
    rolling_LR=tweets_DT['predictions1_LR'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_MNB=tweets_DT['predictions1_MNB'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_SVM=tweets_DT['predictions1_SVM'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_MLP=tweets_DT['predictions1_MLP'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_TextBlob=tweets_DT['textblob'].rolling(rollingMeanTB,min_periods=minPeriods).mean(),
)

In [None]:
# Sort by timestamp so each rolling window covers consecutive tweets in time order.
tweets_JB = tweets_JB.sort_values(by='Timestamp',ascending=True)
polarity = {
    'positive':1,
    'negative':-1
}
rollingMean = 5000
rollingMeanTB = 5000
minPeriods = 100
# Numeric (+1 / -1) version of every classifier's label column.
tweets_JB = tweets_JB.assign(
    predictions1_LR=tweets_JB['predictions_LR'].apply(polarity.__getitem__),
    predictions1_MNB=tweets_JB['predictions_MNB'].apply(polarity.__getitem__),
    predictions1_SVM=tweets_JB['predictions_SVM'].apply(polarity.__getitem__),
    predictions1_MLP=tweets_JB['predictions_MLP'].apply(polarity.__getitem__),
)
# Rolling mean of each numeric column; TextBlob uses its own window length.
tweets_JB = tweets_JB.assign(
    rolling_LR=tweets_JB['predictions1_LR'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_MNB=tweets_JB['predictions1_MNB'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_SVM=tweets_JB['predictions1_SVM'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_MLP=tweets_JB['predictions1_MLP'].rolling(rollingMean,min_periods=minPeriods).mean(),
    rolling_TextBlob=tweets_JB['textblob'].rolling(rollingMeanTB,min_periods=minPeriods).mean(),
)

In [None]:
# Parse the 'Timestamp' column of both frames into pandas datetimes.
tweets_DT = tweets_DT.assign(Timestamp=pd.to_datetime(tweets_DT['Timestamp']))
tweets_JB = tweets_JB.assign(Timestamp=pd.to_datetime(tweets_JB['Timestamp']))

In [None]:
def plot_rolling_predictions_graphs(df,name):
    """Plot the rolling-mean sentiment of all four classifiers and TextBlob over time.

    `df` needs a 'Timestamp' column and the five 'rolling_*' columns; it is not
    modified. `name` becomes the figure title.
    """
    # Work on a Timestamp-indexed copy so the x axis is the tweet time.
    by_time = df.set_index('Timestamp')
    rolling_columns = ['rolling_LR','rolling_MNB','rolling_SVM','rolling_MLP','rolling_TextBlob']

    plt.figure(figsize=(14,7))
    sns.lineplot(data=by_time[rolling_columns])
    plt.legend(['LR','MNB','SVM','MLP','TextBlob'])
    plt.xlabel("Date",fontsize = 15)
    plt.ylabel("Mean Sentiment",fontsize = 15)
    plt.title(name,fontsize = 20)
    plt.gcf().autofmt_xdate()

In [None]:
# Compare the rolling mean of every classifier (and TextBlob) on the Trump tweets.
plot_rolling_predictions_graphs(tweets_DT,'Classifiers Comparison')

In [None]:
def plot_rolling_predictions_graphs(df,name,col):
    """Plot the rolling-mean MLP sentiment for one candidate with two event markers.

    This three-argument definition replaces the earlier two-argument function of
    the same name once this cell has run.

    `df` needs 'Timestamp' and 'rolling_MLP' columns and is not modified; `name` is
    the figure title and `col` the colour used for the line and both markers.
    """
    by_time = df.set_index('Timestamp')
    # Duplicate the column under the short name 'MLP' before plotting it.
    by_time['MLP'] = by_time['rolling_MLP']

    plt.figure(figsize=(14,7))
    sns.lineplot(data=by_time['MLP'],color=col)
    # Each event is drawn as a zero-width span at midnight of the event date.
    event_markers = (
        (datetime(2020,10,22), "Last Presidential Debate", 0.2),
        (datetime(2020,11,3), "Election Day", 0.6),
    )
    for event_day, event_label, event_alpha in event_markers:
        position = date2num(event_day)
        plt.axvspan(position, position, label=event_label,color=col, alpha=event_alpha)
    plt.legend()
    plt.xlabel("Date",fontsize = 15)
    plt.ylabel("Mean Sentiment",fontsize = 15)
    plt.title(name,fontsize = 20)
    plt.gcf().autofmt_xdate()

In [None]:
# One rolling-MLP chart per candidate; the third argument is the colour used for the chart.
plot_rolling_predictions_graphs(tweets_DT,'Trump','red')
plot_rolling_predictions_graphs(tweets_JB,'Biden','blue')

### Positive / negative tweet examples

In [None]:
# Print up to ten Biden tweets that TextBlob (polarity exactly 1) and all four
# classifiers rate as positive. The filter is evaluated once instead of on every
# iteration, and slicing with [:10] avoids an IndexError when fewer than ten match.
for example in tweets_JB[
    (tweets_JB['textblob']==1)
    & (tweets_JB['predictions1_MLP']==1)
    & (tweets_JB['predictions1_SVM']==1)
    & (tweets_JB['predictions1_MNB']==1)
    & (tweets_JB['predictions1_LR']==1)
].tweet.values[:10]:
    print('\n',example)

### Positive / negative tweet counts per candidate

In [None]:
# Number of Trump tweets the MLP labels positive (1) and negative (-1).
tweets_DT['predictions1_MLP'].value_counts()

In [None]:
# Number of Biden tweets the MLP labels positive (1) and negative (-1).
tweets_JB['predictions1_MLP'].value_counts()

### Tweet volume (per day, per hour) and negative-to-positive ratio graphs

In [None]:
# Number of Tweets per DAY
def format_dataframes(df):
    """Count rows per calendar day.

    `df` needs a datetime 'Timestamp' column. Returns a Series named 'index' whose
    index ('Timestamp') holds 'MM-DD' strings and whose values are row counts.
    """
    # A fresh positional 'index' column gives groupby something to count.
    stamps = pd.DataFrame(data=df,columns=['Timestamp']).reset_index(drop=True).reset_index()
    stamps['Timestamp'] = stamps['Timestamp'].dt.strftime('%m-%d')
    return stamps.groupby(['Timestamp'])['index'].count()

def tweets_per_day(sentiment):
    """Plot the daily number of tweets with the given MLP sentiment for both candidates.

    'Positive' selects tweets labelled 1; any other value selects tweets labelled -1.
    """
    sentiment_value = 1 if sentiment=='Positive' else -1
    plt.figure(figsize=(14,7))
    plt.xlabel("Date",fontsize = 15)
    plt.ylabel("Number of Tweets",fontsize = 15)
    plt.title('Number of Tweets per Day - {}'.format(sentiment),fontsize = 20)
    # Trump in red first, then Biden in blue, so the legend labels line up.
    for candidate_tweets, line_colour in ((tweets_DT,'red'),(tweets_JB,'blue')):
        matching = candidate_tweets[candidate_tweets['predictions1_MLP']==sentiment_value]
        sns.lineplot(data=format_dataframes(matching),color=line_colour)
    plt.legend(['Trump','Biden'])
    plt.gcf().autofmt_xdate()

# Draw the daily-volume chart once per sentiment class.
tweets_per_day('Positive')
tweets_per_day('Negative')

In [None]:
# Number of Tweets per HOUR
def format_dataframes(df):
    """Count rows per hour.

    `df` needs a datetime 'Timestamp' column. Returns a Series named 'index' of row
    counts indexed by a DatetimeIndex. The year is discarded by the 'MM-DD-HH'
    round trip, so every timestamp in the result falls in year 1900.
    """
    # A fresh positional 'index' column gives groupby something to count.
    stamps = pd.DataFrame(data=df,columns=['Timestamp']).reset_index(drop=True).reset_index()
    stamps['Timestamp'] = stamps['Timestamp'].dt.strftime('%m-%d-%H')
    hourly_counts = stamps.groupby(['Timestamp'])['index'].count()
    hourly_counts.index = pd.to_datetime(hourly_counts.index,format='%m-%d-%H')
    return hourly_counts
    
def tweets_per_hour(sentiment):
    """Plot the hourly number of tweets with the given MLP sentiment for both candidates.

    'Positive' selects tweets labelled 1; any other value selects tweets labelled -1.
    Event markers use year 1900 because the hourly counts are indexed in year 1900.
    """
    sentiment_value = 1 if sentiment=='Positive' else -1
    plt.figure(figsize=(20,7))
    plt.xlabel("Date",fontsize = 15)
    plt.ylabel("Number of Tweets",fontsize = 15)
    plt.title('Number of Tweets per Hour - {}'.format(sentiment),fontsize = 20)
    date_format = mpl_dates.DateFormatter('%m-%d')
    # Trump in red first, then Biden in blue, so the legend labels line up.
    for candidate_tweets, line_colour in ((tweets_DT,'red'),(tweets_JB,'blue')):
        matching = candidate_tweets[candidate_tweets['predictions1_MLP']==sentiment_value]
        sns.lineplot(data=format_dataframes(matching),color=line_colour)
    plt.gca().xaxis.set_major_formatter(date_format)
    plt.gcf().autofmt_xdate()
    # Zero-width spans mark the debate (22 Oct) and election day (3 Nov).
    for event_day, marker_colour in ((datetime(1900,10,22),'green'),(datetime(1900,11,3),'brown')):
        position = date2num(event_day)
        plt.axvspan(position, position,color=marker_colour, alpha=0.9)
    plt.legend(['Trump','Biden','Presidential Debate','Election Day'])
    
# Draw the hourly-volume chart once per sentiment class.
tweets_per_hour('Positive')
tweets_per_hour('Negative')

In [None]:
# Negative to Positive Ratio
def format_dataframes(df,sentiment):
    """Count, per hour, the tweets whose MLP label matches `sentiment`.

    `sentiment` equal to 'positive' selects label 1; anything else selects -1.
    Returns a frame with a 'Timestamp' column (hour, in year 1900 because the year
    is dropped by the 'MM-DD-HH' round trip) and a 'positive_count' or
    'negative_count' column.
    """
    is_positive = sentiment=='positive'
    sentiment_value = 1 if is_positive else -1
    count_column = 'positive_count' if is_positive else 'negative_count'

    selected = df[df['predictions1_MLP']==sentiment_value]
    # A fresh positional 'index' column gives groupby something to count.
    stamps = pd.DataFrame(data=selected,columns=['Timestamp']).reset_index(drop=True).reset_index()
    stamps['Timestamp'] = stamps['Timestamp'].dt.strftime('%m-%d-%H')
    hourly_counts = stamps.groupby(['Timestamp'])['index'].count()
    hourly_counts.index = pd.to_datetime(hourly_counts.index,format='%m-%d-%H')
    return hourly_counts.to_frame().reset_index().rename(columns={'index':count_column})

def calculate_ratio(df):
    """Build the hourly negative-to-positive tweet ratio for one candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'Timestamp' (datetime) and 'predictions1_MLP' (1 / -1) columns.

    Returns
    -------
    pandas.DataFrame
        One row per hour that has at least one positive tweet, in the same order
        and with the same 0..n-1 index as the positive-count table, with columns
        'Timestamp', 'positive_count', 'negative_count' and 'neg_to_pos_ratio'.
    """
    df_positive = format_dataframes(df,'positive')
    df_negative = format_dataframes(df,'negative')
    # Join on the hour itself. Copying the negative column across by row position
    # pairs counts from different hours as soon as one hour is missing from
    # either table.
    df_final = df_positive.merge(df_negative,on='Timestamp',how='left')
    # An hour absent from the negative table had no negative tweets.
    df_final['negative_count'] = df_final['negative_count'].fillna(0)
    df_final['neg_to_pos_ratio'] = df_final['negative_count']/df_final['positive_count']
    return df_final

def plot_ratio_graph(df1,df2):
    """Plot the hourly negative-to-positive ratio of two candidates on one figure.

    `df1` is drawn in red and labelled 'Trump', `df2` in blue and labelled 'Biden'.
    Both need 'Timestamp' and 'neg_to_pos_ratio' columns.
    """
    plt.figure(figsize=(20,8))
    plt.xlabel("Date",fontsize = 15)
    plt.ylabel("Negative to Positive Ratio",fontsize = 15)
    # The title now describes the ratio being plotted; it previously repeated the
    # tweet-volume chart's title.
    plt.title('Negative to Positive Ratio per Hour',fontsize = 20)
    date_format = mpl_dates.DateFormatter('%m-%d')
    sns.lineplot(x='Timestamp',y='neg_to_pos_ratio',data=df1,color='red')
    sns.lineplot(x='Timestamp',y='neg_to_pos_ratio',data=df2,color='blue')
    plt.gca().xaxis.set_major_formatter(date_format)
    plt.gcf().autofmt_xdate()
    # Event markers use year 1900 to match dates built with the '%m-%d-%H' format.
    plt.axvspan(date2num(datetime(1900,10,22)), date2num(datetime(1900,10,22)),color='green', alpha=0.9)
    plt.axvspan(date2num(datetime(1900,11,3)), date2num(datetime(1900,11,3)),color='brown', alpha=0.9)
    plt.legend(['Trump','Biden','Presidential Debate','Election Day'])
    
def plot_mean_graph(trump_ratio,biden_ratio):
    """Plot a 10-row rolling mean of each candidate's negative-to-positive ratio.

    Both frames need 'Timestamp' and 'neg_to_pos_ratio' columns. A 'mean' column is
    added to each frame in place; it is empty until 10 rows are available.
    """
    trump_ratio['mean']=trump_ratio['neg_to_pos_ratio'].rolling(10,min_periods=10).mean()
    biden_ratio['mean']=biden_ratio['neg_to_pos_ratio'].rolling(10,min_periods=10).mean()

    plt.figure(figsize=(20,8))
    plt.xlabel("Date",fontsize = 15)
    plt.ylabel("Mean Negative to Positive Ratio",fontsize = 15)
    # The title now describes the rolling mean being plotted; it previously
    # repeated the tweet-volume chart's title.
    plt.title('Rolling Mean of Negative to Positive Ratio per Hour',fontsize = 20)
    date_format = mpl_dates.DateFormatter('%m-%d')
    sns.lineplot(x='Timestamp',y='mean',data=trump_ratio,color='red')
    sns.lineplot(x='Timestamp',y='mean',data=biden_ratio,color='blue')
    plt.gca().xaxis.set_major_formatter(date_format)
    plt.gcf().autofmt_xdate()
    # Event markers use year 1900 to match dates built with the '%m-%d-%H' format.
    plt.axvspan(date2num(datetime(1900,10,22)), date2num(datetime(1900,10,22)),color='green', alpha=0.9)
    plt.axvspan(date2num(datetime(1900,11,3)), date2num(datetime(1900,11,3)),color='brown', alpha=0.9)
    plt.legend(['Trump','Biden','Presidential Debate','Election Day'])

# Build the hourly negative-to-positive ratio table for each candidate.
trump_ratio = calculate_ratio(tweets_DT)
biden_ratio = calculate_ratio(tweets_JB)
# NOTE(review): the rows labelled 22 and 23 are removed from Biden's table only; the
# reason is not recorded here (possibly outlier hours) — confirm before reusing on new data.
biden_ratio = biden_ratio.drop(index=[22,23],axis=0)
# Raw ratio first, then its rolling mean (which also adds a 'mean' column to both tables).
plot_ratio_graph(trump_ratio,biden_ratio)
plot_mean_graph(trump_ratio,biden_ratio)

### Pos/neg proportion graph for each candidate

In [None]:
# MLP label counts for Trump tweets posted before 2020-11-03.
tweets_DT.loc[tweets_DT['Timestamp']<'2020-11-03', 'predictions1_MLP'].value_counts()

In [None]:
# MLP label counts for Biden tweets posted before 2020-11-03.
tweets_JB.loc[tweets_JB['Timestamp']<'2020-11-03', 'predictions1_MLP'].value_counts()

In [None]:
# Keep only tweets posted before 2020-11-03 for each candidate.
before_elections_Trump = tweets_DT[tweets_DT['Timestamp']<'2020-11-03']
before_elections_Biden = tweets_JB[tweets_JB['Timestamp']<'2020-11-03']

# Positive / negative MLP label counts per candidate. The mapping is named
# `sentiment_counts` rather than `dict` so the builtin `dict` type stays usable
# in the rest of the kernel session.
sentiment_counts = {
    'Candidate' : ['Trump','Biden'],
    'Positive' : [
        (before_elections_Trump['predictions1_MLP']==1).sum(),
        (before_elections_Biden['predictions1_MLP']==1).sum(),
    ],
    'Negative' : [
        (before_elections_Trump['predictions1_MLP']==-1).sum(),
        (before_elections_Biden['predictions1_MLP']==-1).sum(),
    ],
}
df_percentage = pd.DataFrame.from_dict(sentiment_counts)
# Convert the counts into proportions of each candidate's total.
total = df_percentage['Positive'] + df_percentage['Negative']
df_percentage['Positive'] = df_percentage['Positive'] / total
df_percentage['Negative'] = df_percentage['Negative'] / total

# `x` is the Axes of the stacked bar chart; it is annotated later in this cell.
x = df_percentage.plot(kind='bar',x='Candidate',stacked=True,figsize=(14, 7),color=['limegreen','crimson'])
plt.legend(ncol=2)
plt.xlabel("Candidate",fontsize=15)
plt.ylabel("Proportion",fontsize=15)
plt.title('Overall Sentiment Proportion',fontsize=20)
plt.gcf().autofmt_xdate()
def with_hue(plot, feature, Number_of_categories, hue_categories):
    """Write a percentage label at the top of every bar segment of `plot`.

    `plot.patches` is read in hue-major order: the segment for category `c` and
    hue `h` sits at position `h * Number_of_categories + c`. Each segment's height
    is divided by the c-th entry of `feature.value_counts()` and shown as a
    percentage with one decimal.
    """
    segments = list(plot.patches)
    for category in range(Number_of_categories):
        total = feature.value_counts().values[category]
        for hue in range(hue_categories):
            segment = segments[hue*Number_of_categories + category]
            percentage = '{:.1f}%'.format(100 * segment.get_height()/total)
            # Horizontally near the bar centre (shifted left by 0.15), vertically at the segment top.
            x = segment.get_x() + segment.get_width() / 2 - 0.15
            y = segment.get_y() + segment.get_height()
            plot.annotate(percentage, (x, y), size = 15)
# Annotate the stacked bars (2 candidates x 2 sentiment classes), then render the figure.
with_hue(x,df_percentage.Candidate,2,2)
plt.show()

In [None]:
# Show the per-candidate positive / negative proportions behind the bar chart.
df_percentage.head()

### Evaluation of Classifiers

In [None]:
# Collapse a TextBlob polarity score into a class label
def textblob_converter(sentiment):
    """Return 1 for a score above zero, -1 for a score below zero, otherwise 0."""
    if sentiment > 0:
        return 1
    if sentiment < 0:
        return -1
    return 0
# Evaluation tables; only the Trump one receives columns in this cell.
df_evaluation_JB = pd.DataFrame()
df_evaluation_DT = pd.DataFrame()
# TextBlob polarity reduced to the labels 1 / -1 / 0.
df_evaluation_DT['textblob'] = tweets_DT['textblob'].apply(textblob_converter)

In [None]:
# Add the numeric (+1 / -1) predictions of the four trained classifiers.
df_evaluation_DT = df_evaluation_DT.assign(
    LR=tweets_DT['predictions1_LR'],
    MNB=tweets_DT['predictions1_MNB'],
    SVM=tweets_DT['predictions1_SVM'],
    MLP=tweets_DT['predictions1_MLP'],
)

In [None]:
# instantiate model
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# collapse the five BERT output classes (argmax index 0..4) into -1 / 0 / 1
# as required for this comparative analysis
bert_predictions_converter = {
    0:-1,
    1:-1,
    2:0,
    3:1,
    4:1
}
# make predictions one by one (first 1000 tweets only) and pass through converter.
# torch.no_grad() skips autograd bookkeeping during inference, and truncation keeps
# an unusually long tweet from exceeding the model's 512-token limit and crashing.
with torch.no_grad():
    df_evaluation_DT['bert'] = tweets_DT['tweet'].head(1000).apply(
        lambda x: bert_predictions_converter[int(torch.argmax(
            model(tokenizer.encode(x, return_tensors='pt', truncation=True, max_length=512)).logits
        ))]
    )
# drop null values because bert didn't make all predictions
df_evaluation_DT.dropna(inplace=True)
# drop rows where bert predicted the neutral class (0)
df_evaluation_DT=df_evaluation_DT.loc[df_evaluation_DT['bert']!=0]

In [None]:
# Score each classifier against the column chosen as ground truth.
classifiers = ['LR','MNB','SVM','MLP']
ground_truth_classifier = 'textblob'
if ground_truth_classifier == 'textblob':
    # Neutral TextBlob rows cannot be matched by the binary classifiers, so drop them.
    df_evaluation_DT = df_evaluation_DT[df_evaluation_DT['textblob'] != 0]

# Same printing order as before: accuracy, precision, F-1, recall.
for metric_title, metric_fn, metric_kwargs in (
    ('Accuracy', accuracy_score, {}),
    ('Precision', precision_score, {'pos_label': 1}),
    ('F-1 Score', f1_score, {'pos_label': 1}),
    ('Recall', recall_score, {'pos_label': 1}),
):
    print(metric_title)
    for classifier in classifiers:
        metric_value = metric_fn(
            df_evaluation_DT[ground_truth_classifier].values,
            df_evaluation_DT[classifier].values,
            **metric_kwargs
        )
        print(classifier,': ',metric_value*100,'%')

# Confusion Matrix (rows: ground truth, columns: prediction), each column scaled to sum to 1
for classifier in classifiers:
    cm = confusion_matrix(
        df_evaluation_DT[ground_truth_classifier].values,
        df_evaluation_DT[classifier].values,
        labels = [1,-1]
    )
    df_cm = pd.DataFrame(cm,columns=[1,-1],index = [1,-1])
    for i in df_cm:
        column_total = df_cm[i].sum()
        df_cm[i] = df_cm[i]/column_total
    print(classifier,'Confusion Matrix:','\n',df_cm)