### Twitter Sentiment Prediction Modelling

In [None]:
#! pip install spacy==2.2

In [None]:
#pip list

In [1]:
# Load dependencies
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import VarianceThreshold,SelectFromModel,RFECV
from sklearn.decomposition import LatentDirichletAllocation,TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,ExtraTreesRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.ensemble import VotingClassifier

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from model import tokenize

import spacy
import en_core_web_sm
import sys
import unicodedata
import os

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore') # Hides warning
# NOTE(review): the blanket filter above already matches these two categories,
# so the next two calls look redundant.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)
# Seed NumPy's global random number generator.
np.random.seed(7) # seeding random number generator

In [2]:
# File paths
# Relative path (from the notebook's working directory) of the tweet CSV loaded below.
pred_data_file = os.path.join("..","Resources","outputData","tweetCleanData.csv")

In [3]:
# Load the tweet data file into a DataFrame
df = pd.read_csv(pred_data_file)

In [4]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,Date,TweetID,Tweet,Matched Keywords,User,Source,Followers,Friends,Favorite,OrgTweet,Sentiment
0,8/10/2020,1292795662485130000,even right certain kind liberal deeply wants g...,Trump,MenshevikM,Twitter Web App,5882.0,320.0,0.0,They're even right that there's a certain kind...,Positive
1,8/10/2020,1292795661809850000,press people encouraged voters vote trump like...,Trump,balling_it,Twitter Web App,33.0,156.0,0.0,"@jonathanchait Naw, that is the press, people ...",Positive
2,8/10/2020,1292795659704240000,trump signs executive order throw rotted scrap...,Trump,laurie71,Twitter for iPhone,85.0,141.0,0.0,@sarahcpr Trump signs an executive order to th...,Negative
3,8/10/2020,1292795658747940000,sorry want real team truthful team justice tea...,Biden,bluewave4peace,Twitter for iPhone,528.0,745.0,0.0,@glennkirschner2 Sorry I want to be on a real ...,Positive
4,8/10/2020,1292795658550810000,yeah sase cowers yelps befor jumping embarrass...,Trump,OGOPer,Twitter for iPhone,2839.0,2655.0,0.0,Yeah. Until Ben Sasse cowers and yelps befor j...,Negative


In [5]:
# Summary statistics of the loaded data set
df.describe()

Unnamed: 0,TweetID,Followers,Friends,Favorite
count,110146.0,110100.0,110100.0,110100.0
mean,1.29193e+18,27478.15,2887.499,6.911308
std,2.643205e+16,576621.7,11948.21,138.15446
min,0.0,0.0,0.0,0.0
25%,1.29131e+18,70.0,175.0,0.0
50%,1.292466e+18,388.0,611.0,0.0
75%,1.293694e+18,2021.0,2051.0,1.0
max,1.294221e+18,58403720.0,1166020.0,16532.0


In [6]:
# Keep only the columns used for modelling: original tweet, cleaned tweet and sentiment label
df_pred = df[['OrgTweet','Tweet','Sentiment']]

In [7]:
# Check df stats (count / unique / top / freq for the selected columns)
df_pred.describe()

Unnamed: 0,OrgTweet,Tweet,Sentiment
count,110100,110097,110146
unique,107431,96701,3
top,@varindersingh24 Trump,trump,Negative
freq,26,931,43988


In [8]:
# Check data dimensions (rows, columns) to make sure there are enough data points to train and test a model
df_pred.shape

(110146, 3)

In [9]:
# Keep only tweets whose cleaned text has at least 5 whitespace-separated words;
# missing tweets are treated as empty strings and therefore dropped.
# .copy() makes df_pred an independent frame, so the column assignments in later
# cells do not write into a slice of the earlier selection (SettingWithCopyWarning).
df_pred = df_pred[df_pred['Tweet'].fillna("").apply(lambda x: len(x.split())>=5)].copy()

In [10]:
# Check the data dimensions again after dropping short tweets
df_pred.shape

(94051, 3)

In [11]:
# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced again in the visible cells, and no
# lemmatization is performed here — confirm whether the load is still needed.
nlp = en_core_web_sm.load()

# Define default stopwords list (spaCy's built-in English stop-word set)
stoplist = spacy.lang.en.STOP_WORDS 

# Functions to remove encodes
def replace_ptbr_char_by_word(word):
    """Return ``word`` as ASCII-only text.

    The value is converted to ``str``, decomposed with Unicode NFKD so accented
    letters split into base letter + combining mark, and every character that
    cannot be encoded as ASCII is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', str(word))
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

def remove_pt_br_char_by_text(text):
    """Drop stop words from ``text`` and reduce the remaining words to ASCII.

    The text is converted to ``str`` and split on whitespace. Words found in the
    module-level ``stoplist`` are skipped (exact, case-sensitive match made
    before any ASCII folding); every other word is passed through
    ``replace_ptbr_char_by_word``. The result is re-joined with single spaces.
    """
    kept_words = []
    for word in str(text).split():
        if word in stoplist:
            continue
        kept_words.append(replace_ptbr_char_by_word(word))
    return " ".join(kept_words)

In [12]:
# Remove stop words and non-ASCII characters from each cleaned tweet.
# assign() builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df_pred = df_pred.assign(predTweet=df_pred['Tweet'].apply(remove_pt_br_char_by_text))

In [13]:
# Keep only the processed tweet text and its sentiment label
df_pred=df_pred[['predTweet','Sentiment']]

In [14]:
# Check column data types and non-null counts
df_pred.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94051 entries, 0 to 110145
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   predTweet  94051 non-null  object
 1   Sentiment  94051 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [15]:
# Make sure every predTweet value is a Python string
df_pred['predTweet'] = df_pred['predTweet'].astype(str)

In [16]:
# Display the first rows of the modelling frame
df_pred.head()

Unnamed: 0,predTweet,Sentiment
0,right certain kind liberal deeply wants group ...,Positive
1,press people encouraged voters vote trump like...,Positive
2,trump signs executive order throw rotted scrap...,Negative
3,sorry want real team truthful team justice tea...,Positive
4,yeah sase cowers yelps befor jumping embarrass...,Negative


### Calculate Term Frequencies
- Calculate both the actual term frequency and the TF-IDF weighted term frequency. Let's keep only words occurring in at most 90% of documents and in at least 10 documents. 
- The term-frequency matrix is just a word count; the IDF calculation down-weights "boring" or "irrelevant" words that occur in many tweets.

- Perform two tokenizing operations. First, tokenize only letters, ignoring special symbols & numbers. 
- Use the NLTK Snowball stemmer to try and get the root of a word as best as possible. Stop words are removed in the vectorization step.

In [37]:
# NOTE(review): placeholder function with an empty body; nothing else in the
# visible cells calls it — looks like a leftover scratch cell.
def funfunction():
    pass

# Bare reference (no call): the cell only displays the function object.
funfunction

<function __main__.funfunction()>

In [19]:
# Calculate TF and TF-IDF matrices (the tokenizer/stemmer commented out below is now imported from `model`)
# stemmer = SnowballStemmer("english")
# tokenizer = RegexpTokenizer("[a-z']+")

# def tokenize(text):
#     tokens = tokenizer.tokenize(text)
#     return [stemmer.stem(t) for t in tokens] 



def get_tf(data, use_idf, max_df=1.0, min_df=1, ngram_range=(1,1)):
    """Fit a term-frequency vectorizer on ``data`` and return it with the matrix.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize.
    use_idf : bool
        Truthy selects ``TfidfVectorizer``; falsy selects ``CountVectorizer``.
    max_df, min_df, ngram_range
        Forwarded unchanged to the chosen vectorizer.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, document_term_matrix)``.

    Both vectorizers use the built-in English stop-word list and the imported
    ``tokenize`` function.
    """
    vectorizer_cls = TfidfVectorizer if use_idf else CountVectorizer
    vectorizer = vectorizer_cls(
        max_df=max_df,
        min_df=min_df,
        stop_words='english',
        ngram_range=ngram_range,
        tokenizer=tokenize,
    )
    matrix = vectorizer.fit_transform(data)
    return vectorizer, matrix

# Build the raw-count (TF) and the TF-IDF matrices from the processed tweets.
# max_df=0.90 drops terms found in more than 90% of documents; min_df=10 drops
# terms found in fewer than 10 documents.
tf_m, tf_d = get_tf(df_pred['predTweet'], use_idf=False, max_df=0.90, min_df=10)
tfidf_m, tfidf_d = get_tf(df_pred['predTweet'], use_idf=True, max_df=0.90, min_df=10)

### Compute topics using Kmeans and LDA

- Kmeans approach: Using our TFIDF matrix, cluster documents into N clusters based on their TFIDF similarity. Within each cluster, count the top occurring terms.

- LDA approach: Using the TF matrix, attempt to extract N topics from the collection of documents.

##### Note: Kmeans forces each review to belong to only one cluster while LDA allows a review to have many topics associated with it. 

In [None]:
# Number of topics (LDA) and clusters (KMeans) to extract
n_topics = 10

# LDA Approach
def get_lda(data, topics):
    """Fit an online-learning LDA model with ``topics`` components on ``data``.

    Returns ``(fitted_model, transformed_data)``, where the second element is
    the output of the fitted model's ``transform`` on the same ``data``.
    """
    lda = LatentDirichletAllocation(n_components=topics, n_jobs=-1, learning_method='online')
    lda.fit(data)
    doc_topics = lda.transform(data)
    return lda, doc_topics

# Kmeans Approach
def get_kmeans(data, k, scale=True):
    """Cluster ``data`` into ``k`` KMeans clusters.

    When ``scale`` is truthy the data is first passed through
    ``MinMaxScaler().fit_transform``.
    Returns ``(fitted_model, labels)``, where ``labels`` is the model's
    prediction for the same data it was fitted on.

    NOTE(review): MinMaxScaler is expected to reject scipy sparse input, so
    sparse matrices probably need ``scale=False`` — confirm.
    """
    features = MinMaxScaler().fit_transform(data) if scale else data
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(features)
    labels = kmeans.predict(features)
    return kmeans, labels

# Fit LDA on the raw term counts and KMeans on the unscaled TF-IDF matrix,
# both with n_topics groups.
lda_m, lda_d = get_lda(tf_d, n_topics)
kmean_m, kmean_d = get_kmeans(tfidf_d, n_topics, scale=False)

### Show cluster top 10 words per topic
- First extract the top 10 stemmed words per topic in our LDA model. 
- Repeat the process for the kmeans clustered documents. 

##### Note: Here we just count the most frequent stemmed words per topic/cluster (the cells below request 10 for LDA and 20 for KMeans). Both show similar sets of results.

In [None]:
# Show topics
def show_topics(model, feature_names, n_words):
    """Print the ``n_words`` highest-weighted terms of every topic.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D weight arrays.
    feature_names : sequence
        Term for each column index of ``components_``.
    n_words : int
        Number of terms printed per topic, highest weight first.

    A single blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading n_words indices.
        top_indices = weights.argsort()[::-1][:n_words]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d:" % topic_idx)
        print(", ".join(top_terms))
    print()
    
# Show Clusters    
def show_cluster_topics(cluster_labels, tf_matrix, feature_names, n_words):
    """Print the ``n_words`` terms with the largest summed weight in every cluster.

    Parameters
    ----------
    cluster_labels : array-like
        Cluster label of each row of ``tf_matrix`` (positional alignment).
    tf_matrix : scipy sparse matrix
        Document-term matrix (rows = documents, columns = terms).
    feature_names : sequence
        Term for each column index of ``tf_matrix``.
    n_words : int
        Number of terms printed per cluster, largest total first.

    Column totals are computed cluster by cluster on the sparse matrix, so the
    full document-term matrix is never converted to a dense array.
    A single blank line is printed after the last cluster.
    """
    labels = np.asarray(cluster_labels)
    # CSR supports the boolean row selection used below.
    rows = tf_matrix.tocsr()

    # np.unique returns the labels sorted, so clusters print in ascending order.
    for label in np.unique(labels):
        totals = np.asarray(rows[labels == label].sum(axis=0)).ravel()
        # Series.nlargest keeps the first occurrence among tied totals.
        top_n = pd.Series(totals).nlargest(n_words).index.tolist()
        print("Cluster #%d:" % label)
        print(", ".join([feature_names[i] for i in top_n]))
    print()

In [None]:
# LDA topic
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(tf_m, "get_feature_names_out"):
    lda_feature_names = tf_m.get_feature_names_out()
else:
    lda_feature_names = tf_m.get_feature_names()

print("Top 10 stemmed words per topic in LDA model\n")
show_topics(lda_m, lda_feature_names, 10)

In [None]:
# Kmeans cluster of words
# The heading is built from the same count that is passed to
# show_cluster_topics, so the two cannot drift apart.
n_cluster_words = 20

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(tfidf_m, "get_feature_names_out"):
    kmeans_feature_names = tfidf_m.get_feature_names_out()
else:
    kmeans_feature_names = tfidf_m.get_feature_names()

print("Top {N} stemmed words per cluster in Kmeans model\n".format(N=n_cluster_words))
show_cluster_topics(kmean_d, tfidf_d, kmeans_feature_names, n_cluster_words)

## Prepare data for plotting
- TF/TFIDF matrices are hard to represent graphically: charts are limited to 3 dimensions, while each document has one dimension per term. 
- Hierarchical clustering would be an option, but the number of documents makes that approach very slow. 
- Instead, let's perform an SVD/LSA to reduce the dimensionality of the matrix to something more manageable (e.g. 30 dimensions), then project it to 2 dimensions with t-SNE for plotting.

In [None]:
def get_svd(data, components):
    """Fit a ``TruncatedSVD`` with ``components`` dimensions on ``data``.

    Returns ``(fitted_svd, frame)``, where ``frame`` is a DataFrame holding the
    transformed data with integer column labels ``0 .. components-1``.
    """
    reducer = TruncatedSVD(n_components=components)
    reducer.fit(data)
    projected = reducer.transform(data)
    return reducer, pd.DataFrame(projected, columns=range(components))

def get_tsne(data, components, perplexity):
    """Embed ``data`` with t-SNE (1000 iterations) into ``components`` dimensions.

    Returns ``(tsne_object, frame)``, where ``frame`` is a DataFrame holding the
    embedding with integer column labels ``0 .. components-1``.
    """
    embedder = TSNE(n_components=components, perplexity=perplexity, n_iter=1000)
    embedding = embedder.fit_transform(data)
    frame = pd.DataFrame(embedding, columns=range(components))
    return embedder, frame

# Reduce the TF-IDF matrix to 30 SVD components, then embed those in 2-D with
# t-SNE (perplexity 25).
svd_v, svd_m = get_svd(tfidf_d, 30)
# NOTE(review): "tnse_v" looks like a typo for "tsne_v"; it is not referenced
# again in the visible cells.
tnse_v, tsne_m = get_tsne(svd_m, 2, 25)

# Dominant LDA topic per document: column index of the largest value in each row of lda_d.
lda_c = lda_d.argmax(axis=1)

### Plot Data
- Use LDA and Kmeans labels with reduced dimensions to plot our documents. 
- Create a rainbow color scheme which allows for a variable number of topics/clusters. 
- The plot tends to overlap quite a bit.  

In [None]:
# Cluster topic plot
def plot_scatter_2d(x, y, c, sample_size, title):
    """Scatter-plot a random sample of 2-D points, one colour per topic/cluster.

    Parameters
    ----------
    x, y : array-like
        Coordinates of every document (e.g. the two t-SNE components).
    c : array-like
        Topic or cluster label of every document, aligned with ``x`` and ``y``.
    sample_size : int
        Number of points to draw; capped at the number of available points.
    title : str
        Title shown above the plot.
    """
    points = pd.DataFrame({'x': x, 'y': y, 'c': c})
    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    points = points.sample(min(sample_size, len(points)))

    # Iterate over the labels that actually occur rather than assuming they are
    # exactly 0..n-1, and give each one its own colour from the rainbow map.
    labels = np.unique(c)
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))

    ax = plt.subplot(111)
    for label, color in zip(labels, colors):
        group = points[points['c'] == label]
        ax.scatter(group['x'], group['y'], color=color, label=label)

    plt.legend(loc='upper left', numpoints=1, ncol=3, fontsize=8, bbox_to_anchor=(0, 0), title='Topic/Cluster')
    # t-SNE axes carry no interpretable units, so hide the tick labels.
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    ax.set_title(title)
    plt.show()

# Plot a 1000-point sample of the t-SNE embedding, coloured by KMeans cluster
plot_scatter_2d(tsne_m[0], tsne_m[1], kmean_d, 1000, 'KMeans Clustering of Tweeter Sentiment using TFIDF (t-SNE Plot)')

In [None]:
# LDA topics plot: same t-SNE embedding, coloured by each document's dominant LDA topic
plot_scatter_2d(tsne_m[0], tsne_m[1], lda_c, 1000, 'LDA Topics of Tweeter Sentiment using TF (t-SNE Plot)')

### Prepare data for prediction
- Scatter plots help to understand how the data is structured, but they don't tell us what drives positive or negative tweets.
- Let's use the words within tweets to build a predictive scoring model.
- When training models, split the data 70%/30% where 30% will be used for prediction to gauge final accuracy of the model.
- For each review, the model that scores the highest will tell us which kind of review it likely is.

In [20]:
# Split data: 70% train / 30% test.
# A fixed random_state makes the split reproducible regardless of how much of
# NumPy's global random stream earlier cells have already consumed.
X_train, X_test, y_train, y_test = train_test_split(tfidf_d, df_pred['Sentiment'], test_size=0.3, random_state=7)

### Calculate model accuracies
- Three different approaches to build the review predictions: Logistic Regression, Naive Bayes, and Support Vector Machines. 
- Final approach that does a combined "vote" of all three models. 
- Since we have limited data, lets use cross-validation to split the data 10 ways and measure accuracy in an unbiased way.

In [None]:
# Calculate cross validation score
# Sentiment labels; calculate_cv scores a separate binary (label vs. rest) problem for each.
cat = ["Positive","Negative","Neutral"]
def calculate_cv(X, y):
    """Cross-validate four classifiers on a one-vs-rest task per sentiment label.

    For every label in the module-level ``cat`` list, ``y`` is turned into a
    boolean target (``y == label``) and each estimator is scored with 10-fold
    cross-validated accuracy.

    Returns
    -------
    dict
        Keys ``'lr'``, ``'svm'``, ``'nb'`` and ``'combined'``; each value is a
        list of ``(mean_accuracy, label)`` tuples in ``cat`` order.
    """
    lm = LogisticRegression()
    svm = LinearSVC()
    nb = MultinomialNB()
    vc = VotingClassifier([('lm', lm), ('svm', svm), ('nb', nb)])

    # Evaluation order per label: lr, svm, nb, then the voting ensemble.
    estimators = [('lr', lm), ('svm', svm), ('nb', nb), ('combined', vc)]
    results = {'lr': [], 'svm': [], 'nb': [], 'combined': []}

    for c in cat:
        y_adj = np.array(y == c)
        for key, estimator in estimators:
            mean_accuracy = cross_val_score(estimator, X, y_adj, cv=10, scoring='accuracy').mean()
            results[key].append((mean_accuracy, c))
    return results

# Call function
# NOTE(review): cross-validation runs on the held-out split (X_test, y_test),
# not on the training split — confirm this is intended.
cv_scores = calculate_cv(X_test, y_test)

# print model accuracy predictions: one line per (model, label) pair, with a
# blank line after each model
print("Model accuracy predictions\n")
for model_key, scored_labels in cv_scores.items():
    for mean_accuracy, label in scored_labels:
        print("{M} model ({R} rating): {S:.1%}".format(M=model_key.upper(), R=label, S=mean_accuracy))
    print()

### Training the model of choice
- SVM has given the better performance. There is definitely room for improvement.  

- There are lots of ways to tweak the prior steps to get a better result.

    - Tweak any parameters in either the TF step or the modeling step
    - Neg/Pos keywords might vary by topic so we might do this for one cluster at a time
    - Nouns don't provide much insight and we are better off removing them
    - "good" and "not good" have opposite meanings so maybe we should have included 2-grams

In [22]:
# Sentiment labels. The order matters: get_lr builds one model per label in this
# order and test_sentiment pairs cat[i] with model[i].
cat = ["Positive","Negative","Neutral"]

In [23]:
# Build one logistic regression model per sentiment label and fit it
def get_lr(x, y):
    """Fit a binary ``LogisticRegression`` for every label in ``cat``.

    Each model is trained on the boolean target ``y == label``.
    Returns the fitted models as a list in ``cat`` order.
    """
    return [LogisticRegression().fit(x, np.array(y == c)) for c in cat]

# Fit the per-label logistic regression models on the training split
lr_m = get_lr(X_train, y_train)

### Plot the results
- Charts show the result of Logistic Regression model. 
- Top 10 words negatively associated (red) with that review model, and the top 10 words positively associated (green) with that review model. 
- The values indicate how much more likely or unlikely a tweet is to be positive, negative, or neutral given the number of times that word occurs in the tweet.

In [None]:
# Plot coefficient 

def plot_coef(title, model, feature_names, n_words):
    """Plot the strongest positive and negative coefficients as a horizontal bar chart.

    Parameters
    ----------
    title : str
        Chart title.
    model : object
        Fitted estimator exposing ``coef_`` as a 2-D array (rows x features).
    feature_names : sequence
        Term for each column index of ``coef_``.
    n_words : int
        Number of terms taken from each end (largest and smallest coefficients).

    Bars are green for coefficients above zero and red otherwise.
    """
    rows = []
    for coefficients in model.coef_:
        order = coefficients.argsort()
        largest = order[:-n_words - 1:-1]
        smallest = order[0:n_words]
        # Read values from the current row; flat-indexing the whole coef_ matrix
        # is only correct when it has a single row.
        rows.extend([feature_names[i], coefficients[i]] for i in largest)
        rows.extend([feature_names[i], coefficients[i]] for i in smallest)

    df = pd.DataFrame(rows, columns=['Term','Coefficient']).sort_values(by='Coefficient',ascending=False)

    df['c'] = df['Coefficient']>0

    ax = df.plot(x='Term', y='Coefficient', kind='barh', color=df['c'].map({True: 'g', False: 'r'}), grid=True, legend=False,
           title=title)

    ax.set_xlabel("Coefficient")

n_terms = 10

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
if hasattr(tfidf_m, "get_feature_names_out"):
    coef_feature_names = tfidf_m.get_feature_names_out()
else:
    coef_feature_names = tfidf_m.get_feature_names()

# One chart per sentiment label; lr_m holds the models in the same order as cat.
for label, label_model in zip(cat, lr_m):
    plot_coef('Top {N} words in ({R}) Tweet sentiment model\nGreen = Associated | Red = Not Associated'.format(N=n_terms*2, 
               R=label), label_model, coef_feature_names, n_terms)

### Test output
- Below is a test function which lets us supply a tweet and see how well the model predicts its sentiment. 
- For simplicity, the logistic regression models are used and only one tweet is scored at a time.
- The function uses the fitted TFIDF vectorizer to tokenize and transform the new tweet, which is then fed to all three logistic regression models. 
- Each model gives an independent assessment of how likely it is that the tweet belongs to its sentiment class.  

In [24]:
# Function to test sentiment
def test_sentiment(text,model):
    """Print every per-label model's probability for a single piece of text.

    Parameters
    ----------
    text : str
        Text to score.
    model : sequence
        Fitted classifiers exposing ``predict_proba``, ordered like the
        module-level ``cat`` list (as returned by ``get_lr``).

    The text is vectorized with the module-level fitted ``tfidf_m``. For each
    model the second ``predict_proba`` column is printed as a percentage.
    """
    test_str = [text]
    test_new = tfidf_m.transform(test_str)

    print('Tweet text: "{R}"\n'.format(R=test_str[0]))
    print('Model Predction')
    # Pair labels with models instead of hard-coding three classes, so the
    # function keeps working if the label list changes length.
    for label, classifier in zip(cat, model):
        print('Model ({M}): {P:.1%}'.format(M=label, P=classifier.predict_proba(test_new)[0][1]))

In [25]:
# Bad sentiment: example text expected to score highest on the Negative model
test_sentiment('President Trump killed too many people because his COVID19 policies. He should have shut country in early stage.',lr_m)

Tweet text: "President Trump killed too many people because his COVID19 policies. He should have shut country in early stage."

Model Predction
Model (Positive): 3.8%
Model (Negative): 87.0%
Model (Neutral): 6.7%


In [26]:
# Good sentiment: example text expected to score highest on the Positive model
test_sentiment('President Trump kept his word on trade policies. He is great for the businesses.',lr_m)

Tweet text: "President Trump kept his word on trade policies. He is great for the businesses."

Model Predction
Model (Positive): 90.2%
Model (Negative): 3.6%
Model (Neutral): 7.0%


### Save Model

In [27]:
import pickle
# Output paths for the pickled classifier list and the pickled fitted TF-IDF vectorizer
model_file_name = os.path.join("..","Resources","model",'final_model.pickle')
tfidf_model_file_name = os.path.join("..","Resources","model",'tfidf_model.pickle')

In [28]:
# Save the model (the list of per-label logistic regression classifiers).
# Create the target directory first so a fresh checkout does not fail, and use a
# context manager so the file is flushed and closed even if pickling raises.
os.makedirs(os.path.dirname(model_file_name), exist_ok=True)
with open(model_file_name, 'wb') as model_file:
    pickle.dump(lr_m, model_file)

In [29]:
# Save the fitted TF-IDF vectorizer (pickle protocol 0, as before).
# Create the target directory first so a fresh checkout does not fail, and use a
# context manager so the file is flushed and closed even if pickling raises.
os.makedirs(os.path.dirname(tfidf_model_file_name), exist_ok=True)
with open(tfidf_model_file_name, "wb") as tfidf_file:
    pickle.dump(tfidf_m, tfidf_file, protocol=0)

### Load Model

In [30]:
# Load saved model
# NOTE: pickle.load can execute arbitrary code — only load files that this
# notebook (or another trusted source) wrote.
# Context managers make sure both file handles are closed after reading.
with open(model_file_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open(tfidf_model_file_name, "rb") as tfidf_file:
    tfidf_model = pickle.load(tfidf_file)

In [31]:
# Test string used to check the reloaded models
x="President Trump kept his word on trade policies. He is great for the businesses"

In [32]:
# Transform the test string with the reloaded TF-IDF vectorizer
test_new = tfidf_model.transform([x])

In [33]:
# Print each reloaded model's probability for the test string.
# loaded_model holds the classifiers in the same order as cat, so pairing the two
# replaces three copy-pasted prints and keeps the output format consistent.
for label, classifier in zip(cat, loaded_model):
    print(f"Model ({label}): {classifier.predict_proba(test_new)[0][1]}")

Model (Positive): 0.9015265715910362
Model (Negative): 0.03611988543049171
Model (Neutral) 0.06964708736590186


### Check other Models for accuracy

In [None]:
# Spot Check Algorithms
# NOTE(review): X_train appears to be a slice of the sparse TF-IDF matrix;
# LinearDiscriminantAnalysis and GaussianNB are expected to need dense input —
# confirm these two can actually be scored here.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
    ('RF', RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=5)),
]

# evaluate each model in turn with 10-fold stratified cross-validation
results = []
names = []
# The splitter is seeded, so a single instance yields the same folds for every model.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))