In [1]:
# Author: Samriddha KC 
# Based on: https://www.kaggle.com/stoicstatic/twitter-sentiment-analysis-for-beginners

In [2]:
import os
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from wordcloud import WordCloud

In [3]:
# Print the version of the Python interpreter running this kernel.
from platform import python_version
print(python_version())

3.7.6


In [4]:
# Column labels applied to the CSV (passed as `names=` below) and its text encoding.
DATASET_COLUMNS=["sentiment","ids","date","flag","user","text"]
DATASET_ENCODING="ISO-8859-1"
# Location of the training CSV. Set the TWEETS_CSV_PATH environment variable to
# run the notebook on another machine; the default is the original local path.
DATASET_PATH=os.environ.get("TWEETS_CSV_PATH","/Users/samriddhakc/Desktop/training.1600000.processed.noemoticon.csv")


dataset=pd.read_csv(DATASET_PATH, encoding=DATASET_ENCODING,names=DATASET_COLUMNS, engine="python")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samriddhakc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Show the column labels of the loaded frame.
dataset.columns

In [None]:
# Drop unnecessary columns and relabel the positive class from 4 to 1 for clarity.
# The replacement is applied to the "sentiment" column only, so the tweet text
# is never scanned or touched by it.
dataset=dataset[["sentiment","text"]].assign(
    sentiment=lambda frame: frame["sentiment"].replace(4,1)
)

In [None]:
# Bar chart of the row count per sentiment label, to check how balanced the classes are.
ax=dataset.groupby("sentiment").count().plot.bar(title="Data distribution")
ax.set_xticklabels(['Negative',"Positive"],rotation=0)
# An even split between the two labels means less chance of a biased model.

In [None]:
# Pull the labels and the raw tweet text out of the frame as plain Python lists.
sentiments=list(dataset['sentiment'])
texts=list(dataset['text'])

In [None]:
# Text preprocessing plan.
# Lowercasing -> removes case differences without the text losing meaning.
# Replace links starting with http://, https:// or www. by the token URL.
# Replace emoticons with words so that they can be used for feature extraction.
# Replace @usernames with the token USER.
# Replace non-alphanumeric characters with spaces.
# Collapse letters repeated 3 or more times down to 2 to reduce redundancy.
# Remove short words -> words of length 1 are dropped because they are mostly irrelevant.
# Remove stop words -> they add little meaning to the sentence, so they can be ignored.
# Lemmatizing -> convert each word to its base form for a more concise bag of words.

# Emoticon -> word lookup. preprocess_sentiments substitutes "EMOJ" + word for
# every occurrence of a key in the tweet text.
emojis={':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

## List of English stopwords (apostrophes already stripped, e.g. "youre");
## preprocess_sentiments drops any token found in this list.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']



In [None]:
def preprocess_sentiments(tweets):
    """Normalise raw tweets into strings of space-separated, lemmatised tokens.

    Per tweet, in order: lowercase; replace links with `` URL``; replace every
    emoticon in the notebook-level ``emojis`` dict with ``EMOJ<word>``; replace
    ``@mentions`` with `` USER``; turn each non-alphanumeric character into a
    space; collapse a character repeated 3+ times down to 2; drop one-character
    tokens and tokens in the notebook-level ``stopwordlist``; lemmatise the rest.

    Args:
        tweets: iterable of str, one raw tweet per item.

    Returns:
        list of str, one entry per tweet. Every kept token is followed by a
        single space (so a non-empty result ends with a trailing space).
    """
    lemmatizer=WordNetLemmatizer()
    # Compiled once per call; raw strings avoid the invalid "\s" escape warning.
    url_pattern=re.compile(r"((https://)[^ ]*|(http://)[^ ]*|(www\.)[^ ]*)")
    user_pattern=re.compile(r"@[^\s]+")
    alpha_pattern=re.compile(r"[^a-zA-Z0-9]")
    sequence_pattern=re.compile(r"(.)\1\1+")
    sequence_replace_pattern=r"\1\1"
    # Set membership instead of scanning the stopword list for every token.
    stopwords=frozenset(stopwordlist)
    # The tweet is lowercased before lookup, so keys must be lowercased too or
    # emoticons such as ':P' and ':-D' can never match. Longest keys go first so
    # that e.g. 'o:-)' is not consumed by the shorter ':-)'.
    emoji_table=sorted(
        ((key.lower(),word) for key,word in emojis.items()),
        key=lambda pair: len(pair[0]),
        reverse=True,
    )

    processedData=[]
    for tweet in tweets:
        tweet=tweet.lower()
        tweet=url_pattern.sub(' URL',tweet)
        for emoji,word in emoji_table:
            tweet=tweet.replace(emoji,"EMOJ"+word)
        tweet=user_pattern.sub(' USER',tweet)
        tweet=alpha_pattern.sub(" ",tweet)
        tweet=sequence_pattern.sub(sequence_replace_pattern,tweet)

        kept=[
            lemmatizer.lemmatize(word)
            for word in tweet.split()
            if len(word)>1 and word not in stopwords
        ]
        # Keep the original output shape: each token followed by one space.
        processedData.append("".join(word+' ' for word in kept))
    return processedData

In [None]:
# Clean every raw tweet; the result stays index-aligned with `texts` and `sentiments`.
processed_tweets=preprocess_sentiments(texts)

In [None]:
# Preview a few cleaned tweets instead of echoing the entire list as cell output.
processed_tweets[:5]

In [None]:
#Word Cloud for Negative Tweets
# Pick tweets by their label (0 = negative) rather than by position, so the
# result does not depend on the row order or size of the CSV.
data_neg=[tweet for tweet,label in zip(processed_tweets,sentiments) if label==0]
plt.figure(figsize=(20,20))
wc=WordCloud(max_words=1000,width=1600,height=800,collocations=False).generate(" ".join(data_neg))
plt.imshow(wc)

In [None]:
#Word Cloud for Positive Tweets
# Pick tweets by their label (1 = positive after the 4 -> 1 relabel) rather than
# by position, so the result does not depend on the row order or size of the CSV.
data_pos=[tweet for tweet,label in zip(processed_tweets,sentiments) if label==1]
plt.figure(figsize=(20,20))
wc=WordCloud(max_words=1000,width=1600,height=800,collocations=False).generate(" ".join(data_pos))
plt.imshow(wc)

In [None]:
#Splitting Data into training and test set. 
# test_size=0.02 holds out 2% of the tweets for testing; random_state=0 makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(processed_tweets,sentiments,test_size=0.02,random_state=0)

In [None]:
# Preview a few training tweets instead of echoing the whole training set.
X_train[:5]

In [None]:
# Fit a TF-IDF vocabulary of unigrams and bigrams, capped at 500,000 features.
vectorizer=TfidfVectorizer(ngram_range=(1,2),max_features=500000)
vectorizer.fit(X_train)
# vocabulary_ maps each term to its column index; ordering by index gives the
# feature names without get_feature_names(), which newer scikit-learn removed.
feature_names=sorted(vectorizer.vocabulary_,key=vectorizer.vocabulary_.get)
# Only a sample is printed: the full list has up to 500,000 entries.
print("The first 20 feature names are",feature_names[:20])
print("The feature length is",len(feature_names))

In [None]:
# Replace the raw-text splits with their TF-IDF matrices.
# NOTE(review): X_train / X_test are overwritten with the transformed data, so
# re-running this cell (or the vectorizer.fit cell) afterwards presumably fails
# because the inputs are no longer text; re-run the train_test_split cell first.
X_train=vectorizer.transform(X_train)
X_test=vectorizer.transform(X_test)
X_train.shape

In [None]:
#Evaluate the model 
def model_eval(model):
    """Report how ``model`` performs on the train and test splits.

    Prints a classification report for each split and draws the test-set
    confusion matrix as an annotated heatmap. Reads the notebook-level
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        model: fitted estimator exposing ``predict``.
    """
    train_predictions = model.predict(X_train)
    print("For train data", classification_report(y_train, train_predictions))

    test_predictions = model.predict(X_test)
    print("For test data", classification_report(y_test, test_predictions))

    matrix = confusion_matrix(y_test, test_predictions)
    class_names = ['Negative', 'Positive']
    cell_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

    # Share of all test samples that falls in each confusion-matrix cell.
    shares = matrix.flatten() / np.sum(matrix)
    cell_text = []
    for cell_name, share in zip(cell_names, shares):
        cell_text.append(cell_name + '\n' + '{0:.2%}'.format(share))
    annotations = np.asarray(cell_text).reshape(2, 2)

    sns.heatmap(
        matrix,
        annot=annotations,
        cmap='Blues',
        fmt='',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
    plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
    plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)

In [None]:
# Logistic Regression Model 
LRmodel = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
LRmodel.fit(X_train, y_train)
model_eval(LRmodel)
# Keep the predictions and confusion matrix at notebook level. These lines
# previously used `model` and `y_pred`, which exist only inside model_eval and
# raised NameError here.
y_pred_train=LRmodel.predict(X_train)
y_pred=LRmodel.predict(X_test)
cf_matrix=confusion_matrix(y_test,y_pred)

In [None]:
# Try the fitted vectorizer on a few hand-written tweets (X is scored with LRmodel afterwards).
random_texts=["I hate you","I love you","Corona virus will kill us","I think trump loves corona virus","I wanna the better situation for my exam, Corona virus is killing people","Corona virus is killing people, please don't let it for the calamity!","Be positive,ignore negativity, this too shall pass!"]
random_refined=preprocess_sentiments(random_texts)
X=vectorizer.transform(random_refined)
print(X.shape)

In [None]:
# One hand-assigned label per entry of random_texts (0 = negative, 1 = positive).
# The previous 4-item label list did not match the 7 predictions and made
# classification_report raise ValueError.
# TODO confirm: the label for "I think trump loves corona virus" is subjective.
random_labels=[0,1,0,1,0,0,1]
assert len(random_labels)==X.shape[0], "need exactly one label per hand-written tweet"
y_pred_train=LRmodel.predict(X)
print("For test data",classification_report(random_labels,y_pred_train))

In [None]:
# Display the current contents of y_pred_train.
y_pred_train

In [None]:
# Run three hand-written tweets through the same steps: clean, vectorize, predict with LRmodel.
random_texts_1=["I hate you","I love you","Corona virus will kill us"]
random_refined_1=preprocess_sentiments(random_texts_1)
X1=vectorizer.transform(random_refined_1)
y_pred_train_2=LRmodel.predict(X1)

In [None]:
# Persist the fitted vectorizer and logistic-regression model to the working directory.
# `with` closes each file even if pickling raises.
with open('ngram.pickle','wb') as file:
    pickle.dump(vectorizer, file)

with open('LogisticTrainedLR.pickle','wb') as file:
    pickle.dump(LRmodel, file)

In [None]:
# Display the predictions LRmodel made for random_texts_1.
y_pred_train_2

In [None]:
# Random Forest Model (trees limited to depth 100)
# RandomForestClassifier is imported with the other imports at the top of the notebook.
clf=RandomForestClassifier(max_depth=100,random_state=0)
clf.fit(X_train,y_train)
model_eval(clf)

In [None]:
# Random Forest Model (trees limited to depth 50)
# RandomForestClassifier is imported with the other imports at the top of the notebook.
clf=RandomForestClassifier(max_depth=50,random_state=0)
clf.fit(X_train,y_train)
model_eval(clf)


In [None]:
# Random Forest Model (trees limited to depth 200)
# RandomForestClassifier is imported with the other imports at the top of the notebook.
clf=RandomForestClassifier(max_depth=200,random_state=0)
clf.fit(X_train,y_train)
model_eval(clf)


In [None]:
# NOTE(review): this rebinds the name LogisticRegression, which the imports at the
# top of the notebook bind to sklearn.linear_model.LogisticRegression. Any cell run
# after this one that refers to LogisticRegression gets the pyspark class instead.
from pyspark.ml.classification import LogisticRegression

In [None]:
#Naive Bayes Model 