In [172]:
# Load dependencies
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import VarianceThreshold,SelectFromModel,RFECV
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.ensemble import VotingClassifier

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

import spacy
import en_core_web_sm
import sys
import unicodedata
import os

import warnings
# Silence warnings: the blanket filter first, then the two explicit categories,
# registered in the same order as before so the resulting filter list is unchanged.
for _category in (Warning, DeprecationWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_category)

# Seed NumPy's global random number generator so a full top-to-bottom run is repeatable.
np.random.seed(7)

In [173]:
# File paths
pred_data_file = os.path.join("..","Resources","outputData","tweetCleanData.csv")

In [174]:
# Load Twitter data file
df = pd.read_csv(pred_data_file)

In [175]:
# Display sample results
df.head()

Unnamed: 0,Date,TweetID,Tweet,Matched Keywords,User,Source,Followers,Friends,Favorite,OrgTweet,Sentiment
0,8/10/2020,1292795662485130000,even right certain kind liberal deeply wants g...,Trump,MenshevikM,Twitter Web App,5882.0,320.0,0.0,They're even right that there's a certain kind...,Positive
1,8/10/2020,1292795661809850000,press people encouraged voters vote trump like...,Trump,balling_it,Twitter Web App,33.0,156.0,0.0,"@jonathanchait Naw, that is the press, people ...",Positive
2,8/10/2020,1292795659704240000,trump signs executive order throw rotted scrap...,Trump,laurie71,Twitter for iPhone,85.0,141.0,0.0,@sarahcpr Trump signs an executive order to th...,Negative
3,8/10/2020,1292795658747940000,sorry want real team truthful team justice tea...,Biden,bluewave4peace,Twitter for iPhone,528.0,745.0,0.0,@glennkirschner2 Sorry I want to be on a real ...,Positive
4,8/10/2020,1292795658550810000,yeah sase cowers yelps befor jumping embarrass...,Trump,OGOPer,Twitter for iPhone,2839.0,2655.0,0.0,Yeah. Until Ben Sasse cowers and yelps befor j...,Negative


In [176]:
# Keep only required columns
df_pred = df[['Tweet','Sentiment']]

In [177]:
# Keep only meaningful tweets: those with at least 5 whitespace-separated words.
# Missing tweets are replaced by "" first, so they count as 0 words and are dropped too.
df_pred = df_pred[df_pred['Tweet'].fillna("").apply(lambda x: len(x.split())>=5)] 

In [178]:
df_pred.head()

Unnamed: 0,Tweet,Sentiment
0,even right certain kind liberal deeply wants g...,Positive
1,press people encouraged voters vote trump like...,Positive
2,trump signs executive order throw rotted scrap...,Negative
3,sorry want real team truthful team justice tea...,Positive
4,yeah sase cowers yelps befor jumping embarrass...,Negative


In [184]:
# Build term-frequency (TF) and TF-IDF document-term matrices
stemmer = SnowballStemmer("english")
tokenizer = RegexpTokenizer("[a-z']+")

        
def tokenText(text):
    """Split ``text`` into tokens and return the list of their stems.

    Tokenization uses the module-level ``tokenizer`` and stemming uses the
    module-level ``stemmer``. The tokenizer's pattern only matches lower-case
    letters and apostrophes, so callers are expected to pass lower-cased text.
    """
    return list(map(stemmer.stem, tokenizer.tokenize(text)))

def get_tf(data, use_idf, max_df=1.0, min_df=1, ngram_range=(1,1)):
    """Fit a text vectorizer on ``data`` and return it with the resulting matrix.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize.
    use_idf : bool
        When truthy a ``TfidfVectorizer`` is used, otherwise a ``CountVectorizer``.
    max_df, min_df, ngram_range
        Passed straight through to the vectorizer.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, document_term_matrix)``.
    """
    # Both vectorizers take the same keyword arguments, so choose the class
    # first and build it once.
    vectorizer_cls = TfidfVectorizer if use_idf else CountVectorizer
    vectorizer = vectorizer_cls(
        max_df=max_df,
        min_df=min_df,
        stop_words='english',
        ngram_range=ngram_range,
        tokenizer=tokenText,
    )
    matrix = vectorizer.fit_transform(data)
    return vectorizer, matrix

# Raw term counts over the filtered tweets. With max_df=0.90 terms found in more
# than 90% of the tweets are ignored; with min_df=10 terms found in fewer than
# 10 tweets are ignored.
tf_m, tf_d = get_tf(df_pred['Tweet'], use_idf=False, max_df=0.90, min_df=10)
# TF-IDF weighted features with the same thresholds; tfidf_d feeds the
# train/test split and tfidf_m is the vectorizer that gets pickled below.
tfidf_m, tfidf_d = get_tf(df_pred['Tweet'], use_idf=True, max_df=0.90, min_df=10)

In [185]:
# Split data: 70% train / 30% test.
# random_state is pinned (same value as the notebook-wide NumPy seed) so that
# re-running this cell on its own always produces the same split, instead of
# depending on how much of the global random state was consumed beforehand.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_d, df_pred['Sentiment'], test_size=0.3, random_state=7
)

In [186]:
# Build one logistic regression model per sentiment label (one-vs-rest) and fit
cat = ["Positive","Negative","Neutral"]
def get_lr(x, y):
    """Fit one binary logistic regression per label in the module-level ``cat``.

    For each label the target is the boolean array ``y == label``, so every
    model separates "this label" from "any other label".

    Parameters
    ----------
    x : feature matrix accepted by ``LogisticRegression.fit``.
    y : labels aligned with the rows of ``x``; compared element-wise to each label.

    Returns
    -------
    list
        Fitted models, in the same order as ``cat``.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be collected directly.
    return [LogisticRegression().fit(x, np.array(y == label)) for label in cat]

lr_m = get_lr(X_train, y_train)

In [187]:
# Function to test sentiment
def test_sentiment(text,model):
    test_str = [text]
    test_new = tfidf_m.transform(test_str)
    print(test_str[0])

    print('Tweet text: "{R}"\n'.format(R=test_str[0]))
    print('Model Predction')
    for m in range(0,3):
        print(model[m].predict_proba(test_new))
        print('Model ({M}): {P:.1%}'.format(M=cat[m], P=model[m].predict_proba(test_new)[0][1]))  

In [188]:
# Bad sentiment
test_sentiment('President Trump killed too many people because his COVID19 policies. He should have shut country in early stage.',lr_m)

President Trump killed too many people because his COVID19 policies. He should have shut country in early stage.
Tweet text: "President Trump killed too many people because his COVID19 policies. He should have shut country in early stage."

Model Predction
[[0.97295706 0.02704294]]
Model (Positive): 2.7%
[[0.09905755 0.90094245]]
Model (Negative): 90.1%
[[0.93536494 0.06463506]]
Model (Neutral): 6.5%


In [189]:
import pickle
model_file_name = os.path.join("..","Resources","model",'final_model.pickle')
tfidf_model_file_name = os.path.join("..","Resources","model",'tfidf_model.pickle')
tokenText_file_name = os.path.join("..","Resources","model",'tokenText.pickle')

In [190]:
# Save the list of fitted logistic regression models.
# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open(model_file_name, 'wb') as model_file:
    pickle.dump(lr_m, model_file)

In [191]:
# Save the fitted TF-IDF vectorizer.
# NOTE: the vectorizer was built with tokenizer=tokenText, and pickle stores
# functions by qualified name only. Whoever loads this file must have a
# compatible tokenText defined under the same module path.
with open(tfidf_model_file_name, "wb") as tfidf_file:
    pickle.dump(tfidf_m, tfidf_file, protocol=0)

In [192]:
# Save tokenText.
# NOTE: pickle serializes a function as a reference to its qualified name, not
# its code, so this file does not carry the implementation (or the stemmer and
# tokenizer it uses). The loading environment must define tokenText itself.
with open(tokenText_file_name, "wb") as token_file:
    pickle.dump(tokenText, token_file, protocol=0)

In [193]:
# Load saved model, TF-IDF vectorizer and tokenizer function.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook (or another trusted source).
with open(model_file_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(tfidf_model_file_name, "rb") as tfidf_file:
    tfidf_model = pickle.load(tfidf_file)
with open(tokenText_file_name, "rb") as token_file:
    tokenText_model = pickle.load(token_file)

In [194]:
# Test string
x="President Trump kept his word on trade policies. He is great for the businesses"

In [195]:
# Transform test string
test_new = tfidf_model.transform([x])

In [196]:
# loaded_model holds one classifier per label, in the same order as `cat`.
# Column 1 of predict_proba is the probability that the label applies.
# Looping keeps the three lines formatted identically (the "Neutral" line
# previously lacked its colon).
for label, clf in zip(cat, loaded_model):
    print(f"Model ({label}): {clf.predict_proba(test_new)[0][1]}")

Model (Positive): 0.9124953315480058
Model (Negative): 0.02694912859750792
Model (Neutral) 0.0806113969106177
