In [1]:
import numpy as np
import pandas as pd
import time

from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,LancasterStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost

from sklearn.metrics import accuracy_score

In [2]:
df_tweets=pd.read_csv(r'C:\Users\Omkar\Desktop\NLP\NLP\NLP\twitter_training.csv')

In [3]:
df_tweets

Unnamed: 0,No,class,Target,Review
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [4]:
df1=df_tweets[0:40000]

In [5]:
df1

Unnamed: 0,No,class,Target,Review
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
39995,1265,Battlefield,Neutral,German onslaught opened
39996,1266,Battlefield,Negative,Unfair respawn
39997,1266,Battlefield,Negative,Unfair Resurrection
39998,1266,Battlefield,Negative,An Unfair Respawn


In [6]:
x=df1['Review']
y=df1['Target']

In [7]:
x

0        im getting on borderlands and i will murder yo...
1        I am coming to the borders and I will kill you...
2        im getting on borderlands and i will kill you ...
3        im coming on borderlands and i will murder you...
4        im getting on borderlands 2 and i will murder ...
                               ...                        
39995                              German onslaught opened
39996                                    Unfair respawn   
39997                                  Unfair Resurrection
39998                                    An Unfair Respawn
39999                                         Unfair terms
Name: Review, Length: 40000, dtype: object

In [8]:
x.isna().sum()

364

In [9]:
x.mode()

0                                                     
1    At the same time, despite the fact that there ...
2    It is not the first time that the EU Commissio...
dtype: object

In [10]:
x.fillna(x.mode()[0],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [11]:
x.isna().sum()

0

## 1) Tokenize

In [12]:
def tokenization(data):
    tokens=word_tokenize(data)
    return tokens

In [13]:
x_tokens=x.apply(tokenization)
x_tokens

0        [im, getting, on, borderlands, and, i, will, m...
1        [I, am, coming, to, the, borders, and, I, will...
2        [im, getting, on, borderlands, and, i, will, k...
3        [im, coming, on, borderlands, and, i, will, mu...
4        [im, getting, on, borderlands, 2, and, i, will...
                               ...                        
39995                          [German, onslaught, opened]
39996                                    [Unfair, respawn]
39997                               [Unfair, Resurrection]
39998                                [An, Unfair, Respawn]
39999                                      [Unfair, terms]
Name: Review, Length: 40000, dtype: object

# 2) Cleaning

In [14]:
def cleaning(data):
    clean_text=[i for i in data if i not in punctuation]
    return clean_text

In [15]:
x_clean=x_tokens.apply(cleaning)
x_clean

0        [im, getting, on, borderlands, and, i, will, m...
1        [I, am, coming, to, the, borders, and, I, will...
2        [im, getting, on, borderlands, and, i, will, k...
3        [im, coming, on, borderlands, and, i, will, mu...
4        [im, getting, on, borderlands, 2, and, i, will...
                               ...                        
39995                          [German, onslaught, opened]
39996                                    [Unfair, respawn]
39997                               [Unfair, Resurrection]
39998                                [An, Unfair, Respawn]
39999                                      [Unfair, terms]
Name: Review, Length: 40000, dtype: object

## 3) Normalization

In [16]:
def normalize(data):
    normal_text=[i.lower() for i in data]
    return normal_text

In [17]:
x_normal=x_clean.apply(normalize)
x_normal

0        [im, getting, on, borderlands, and, i, will, m...
1        [i, am, coming, to, the, borders, and, i, will...
2        [im, getting, on, borderlands, and, i, will, k...
3        [im, coming, on, borderlands, and, i, will, mu...
4        [im, getting, on, borderlands, 2, and, i, will...
                               ...                        
39995                          [german, onslaught, opened]
39996                                    [unfair, respawn]
39997                               [unfair, resurrection]
39998                                [an, unfair, respawn]
39999                                      [unfair, terms]
Name: Review, Length: 40000, dtype: object

# 4) Stop Words Removal

In [18]:
stop=stopwords.words('english')
def stop_removal(data):
    stop_text=[i for i in data if i not in stop]
    return stop_text

In [19]:
x_no_stops=x_normal.apply(stop_removal)
x_no_stops

0           [im, getting, borderlands, murder]
1                      [coming, borders, kill]
2             [im, getting, borderlands, kill]
3            [im, coming, borderlands, murder]
4        [im, getting, borderlands, 2, murder]
                         ...                  
39995              [german, onslaught, opened]
39996                        [unfair, respawn]
39997                   [unfair, resurrection]
39998                        [unfair, respawn]
39999                          [unfair, terms]
Name: Review, Length: 40000, dtype: object

# 5) Stemming and Lemmatization

In [20]:
lemma=WordNetLemmatizer()
def lemmatization(data):
    l1=[]
    for i in data:
        word=lemma.lemmatize(i)
        l1.append(word)
    return l1

In [21]:
x_lemma=x_no_stops.apply(lemmatization)
x_lemma

0           [im, getting, borderland, murder]
1                      [coming, border, kill]
2             [im, getting, borderland, kill]
3            [im, coming, borderland, murder]
4        [im, getting, borderland, 2, murder]
                         ...                 
39995             [german, onslaught, opened]
39996                       [unfair, respawn]
39997                  [unfair, resurrection]
39998                       [unfair, respawn]
39999                          [unfair, term]
Name: Review, Length: 40000, dtype: object

# Join list

In [22]:
def string(data):
    strings=' '.join(data)
    return strings

In [23]:
x_final=x_lemma.apply(string)
x_final

0          im getting borderland murder
1                    coming border kill
2            im getting borderland kill
3           im coming borderland murder
4        im getting borderland 2 murder
                      ...              
39995           german onslaught opened
39996                    unfair respawn
39997               unfair resurrection
39998                    unfair respawn
39999                       unfair term
Name: Review, Length: 40000, dtype: object

# CountVectorizer

In [24]:
cv=CountVectorizer(lowercase=True,stop_words='english',max_df=0.95,max_features=2000)
cv_train=cv.fit_transform(x_final)

In [25]:
cv_train

<40000x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 277310 stored elements in Compressed Sparse Row format>

In [26]:
cv_train.A

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
print(cv.get_feature_names())





In [28]:
len(cv.get_feature_names())

2000

In [29]:
print(cv.get_stop_words())

frozenset({'became', 'about', 'you', 're', 'himself', 'down', 'up', 'keep', 'before', 'yours', 'someone', 'yourselves', 'cant', 'nor', 'how', 'also', 'mine', 'some', 'in', 'thence', 'have', 'is', 'whereas', 'seemed', 'not', 'interest', 'out', 'its', 'whoever', 'ten', 'your', 'former', 'show', 'either', 'will', 'part', 'beforehand', 'once', 'seeming', 'un', 'something', 'sometimes', 'thru', 'it', 'toward', 'somehow', 'because', 'become', 'top', 'through', 'herein', 'myself', 'enough', 'beyond', 'hence', 'yourself', 'her', 'ltd', 'him', 'per', 'six', 'sixty', 'always', 'we', 'everywhere', 'with', 'them', 'a', 'everything', 'further', 'made', 'get', 'or', 'whenever', 'am', 'noone', 'rather', 'whence', 'first', 'front', 'side', 'throughout', 'whose', 'system', 'otherwise', 'being', 'back', 'well', 'their', 'eleven', 'empty', 'amount', 'towards', 'beside', 'bottom', 'thereafter', 'ourselves', 'inc', 'thick', 'on', 'fifteen', 'none', 'any', 'amongst', 'many', 'therein', 'both', 'at', 'become

In [30]:
df_train=pd.DataFrame(cv_train.A,columns=cv.get_feature_names())
df_train.head(10)

Unnamed: 0,00,000,01,02,03,05,06,07,08,09,...,youtube,youtubegaming,yu,yup,zero,zombie,zone,zonestream,zoom,яй
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df_train.shape

(40000, 2000)

In [32]:
encoder=LabelEncoder()
y=encoder.fit_transform(y)
y

array([3, 3, 3, ..., 1, 1, 1])

In [33]:
encoder.classes_

array(['Irrelevant', 'Negative', 'Neutral', 'Positive'], dtype=object)

In [34]:
x_train,x_test,y_train,y_test=train_test_split(df_train,y,stratify=y,random_state=13,test_size=0.25)

## Multinomial Naive Bayes

In [35]:
%%time
mnb_model=MultinomialNB()
mnb_model.fit(x_train,y_train)

Wall time: 3.23 s


In [36]:
%%time
y_pred_test=mnb_model.predict(x_test)
test_acc=accuracy_score(y_test,y_pred_test)
print('Test accuracy is ',test_acc)

Test accuracy is  0.619
Wall time: 168 ms


In [37]:
%%time
y_pred_train=mnb_model.predict(x_train)
train_acc=accuracy_score(y_train,y_pred_train)
print('Train accuracy is ',train_acc)

Train accuracy is  0.6491
Wall time: 677 ms


In [38]:
print('Variance is ',(train_acc-test_acc)*100)
print('Bias is ',(0.95-train_acc)*100)

Variance is  3.0100000000000016
Bias is  30.089999999999996


# SVM

In [39]:
%%time
svc_model=SVC()
svc_model.fit(x_train,y_train)

Wall time: 45min 11s


In [40]:
%%time
y_pred_test=svc_model.predict(x_test)
test_acc=accuracy_score(y_test,y_pred_test)
print('Test accuracy is ',test_acc)

Test accuracy is  0.8114
Wall time: 8min 37s


In [41]:
%%time
y_pred_train=svc_model.predict(x_train)
train_acc=accuracy_score(y_train,y_pred_train)
print('Train accuracy is ',train_acc)

Train accuracy is  0.8604
Wall time: 25min 3s


In [42]:
print('Variance is ',(train_acc-test_acc)*100)
print('Bias is ',(0.95-train_acc)*100)

Variance is  4.900000000000004
Bias is  8.95999999999999


# TfidfVectorizer

In [43]:
tf=TfidfVectorizer(lowercase=True,max_df=0.95,stop_words='english',max_features=1000)
tf_train=tf.fit_transform(x_final)

In [44]:
tf_train.A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
print(tf.get_feature_names())

['000', '10', '100', '11', '12', '13', '14', '15', '16', '17', '19', '20', '2018', '2019', '2020', '25', '28', '2k', '2ksupport', '30', '360', '365', '50', '500', '60', '70', 'able', 'absolute', 'absolutely', 'ac', 'access', 'account', 'achievement', 'action', 'actual', 'actually', 'ad', 'add', 'added', 'af', 'age', 'ago', 'ai', 'amazing', 'amazon', 'american', 'amzn', 'android', 'angry', 'announcement', 'annoying', 'answer', 'anti', 'anymore', 'apex', 'app', 'apparently', 'apple', 'appreciate', 'apps', 'art', 'article', 'artist', 'ask', 'assassin', 'assassinscreed', 'attack', 'audio', 'available', 'away', 'awesome', 'baby', 'bad', 'ball', 'ban', 'banned', 'based', 'basic', 'basically', 'battle', 'battlefield', 'bc', 'beat', 'beautiful', 'believe', 'beluba', 'best', 'beta', 'better', 'bf4db', 'big', 'biggest', 'birthday', 'bit', 'bitch', 'black', 'blizzard', 'blizzard_ent', 'blog', 'bloody', 'blue', 'book', 'border', 'borderland', 'boring', 'bot', 'bought', 'box', 'boy', 'brain', 'bran



In [46]:
print(tf.get_stop_words())

frozenset({'became', 'about', 'you', 're', 'himself', 'down', 'up', 'keep', 'before', 'yours', 'someone', 'yourselves', 'cant', 'nor', 'how', 'also', 'mine', 'some', 'in', 'thence', 'have', 'is', 'whereas', 'seemed', 'not', 'interest', 'out', 'its', 'whoever', 'ten', 'your', 'former', 'show', 'either', 'will', 'part', 'beforehand', 'once', 'seeming', 'un', 'something', 'sometimes', 'thru', 'it', 'toward', 'somehow', 'because', 'become', 'top', 'through', 'herein', 'myself', 'enough', 'beyond', 'hence', 'yourself', 'her', 'ltd', 'him', 'per', 'six', 'sixty', 'always', 'we', 'everywhere', 'with', 'them', 'a', 'everything', 'further', 'made', 'get', 'or', 'whenever', 'am', 'noone', 'rather', 'whence', 'first', 'front', 'side', 'throughout', 'whose', 'system', 'otherwise', 'being', 'back', 'well', 'their', 'eleven', 'empty', 'amount', 'towards', 'beside', 'bottom', 'thereafter', 'ourselves', 'inc', 'thick', 'on', 'fifteen', 'none', 'any', 'amongst', 'many', 'therein', 'both', 'at', 'become

In [47]:
df_tf_train=pd.DataFrame(tf_train.A,columns=tf.get_feature_names())
df_tf_train

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,yes,yesterday,yo,young,youtu,youtube,yu,zero,zombie,яй
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
x_train,x_test,y_train,y_test=train_test_split(df_tf_train,y,stratify=y,random_state=13,test_size=0.25)

## Multunomial Naive Bayes

In [49]:
%%time
mnb_model=MultinomialNB()
mnb_model.fit(x_train,y_train)

Wall time: 134 ms


In [50]:
%%time
y_pred_test=mnb_model.predict(x_test)
test_acc=accuracy_score(y_test,y_pred_test)
print('Test accuracy is ',test_acc)

Test accuracy is  0.5886
Wall time: 52.4 ms


In [51]:
%%time
y_pred_train=mnb_model.predict(x_train)
train_acc=accuracy_score(y_train,y_pred_train)
print('Train accuracy is ',train_acc)

Train accuracy is  0.6015333333333334
Wall time: 126 ms


In [52]:
print('Variance is ',(train_acc-test_acc)*100)
print('Bias is ',(0.95-train_acc)*100)

Variance is  1.2933333333333352
Bias is  34.84666666666666


## SVM

In [53]:
%%time
svc_model=SVC()
svc_model.fit(x_train,y_train)

Wall time: 26min 23s


In [54]:
%%time
y_pred_test=svc_model.predict(x_test)
test_acc=accuracy_score(y_test,y_pred_test)
print('Test accuracy is ',test_acc)

Test accuracy is  0.8467
Wall time: 3min 19s


In [55]:
%%time
y_pred_train=svc_model.predict(x_train)
train_acc=accuracy_score(y_train,y_pred_train)
print('Train accuracy is ',train_acc)

Train accuracy is  0.9058
Wall time: 9min 21s


In [56]:
print('Variance is ',abs(train_acc-test_acc)*100)
print('Bias is ',(0.95-train_acc)*100)

Variance is  5.910000000000004
Bias is  4.419999999999991


## XGBoost

In [57]:
%%time
xgb_model=xgboost.XGBClassifier()
xgb_model.fit(x_train,y_train)

Wall time: 1min 48s


In [58]:
%%time
y_pred_test=xgb_model.predict(x_test)
test_acc=accuracy_score(y_test,y_pred_test)
print('Test accuracy is ',test_acc)

Test accuracy is  0.672
Wall time: 185 ms


In [59]:
%%time
y_pred_train=xgb_model.predict(x_train)
train_acc=accuracy_score(y_train,y_pred_train)
print('Train accuracy is ',train_acc)

Train accuracy is  0.7476666666666667
Wall time: 454 ms


In [60]:
print('Variance is ',(train_acc-test_acc)*100)
print('Bias is ',(0.95-train_acc)*100)

Variance is  7.566666666666666
Bias is  20.233333333333327


## Random Forest Classifier

In [61]:
%%time
rfc_model=RandomForestClassifier()
rfc_model.fit(x_train,y_train)

Wall time: 59.3 s


In [62]:
%%time
y_pred_test=rfc_model.predict(x_test)
test_acc=accuracy_score(y_test,y_pred_test)
print('Test accuracy is ',test_acc)

Test accuracy is  0.8433
Wall time: 800 ms


In [63]:
%%time
y_pred_train=rfc_model.predict(x_train)
train_acc=accuracy_score(y_train,y_pred_train)
print('Train accuracy is ',train_acc)

Train accuracy is  0.9283333333333333
Wall time: 2.13 s


In [64]:
print('Variance is ',(train_acc-test_acc)*100)
print('Bias is ',(0.95-train_acc)*100)

Variance is  8.50333333333333
Bias is  2.166666666666661


#### Finding the labels for classes

In [66]:
encoder1=LabelEncoder()
label=df1['Target']
encoded_label=encoder.fit_transform(df1['Target'])
param=pd.DataFrame({'label':label,'encoded_label':encoded_label})
param.groupby('label').first().reset_index()

Unnamed: 0,label,encoded_label
0,Irrelevant,0
1,Negative,1
2,Neutral,2
3,Positive,3


## Storing the important models as pickle file

In [69]:
import pickle as pkl

with open(r'C:\Users\Omkar\Desktop\Practice\Datasets\Twitter Sentiment Analysis\Artifacts\svm_model.pkl','wb') as file:
    pkl.dump(svc_model,file)
    

In [70]:
with open(r'C:\Users\Omkar\Desktop\Practice\Datasets\Twitter Sentiment Analysis\Artifacts\xgb_model.pkl','wb') as file:
    pkl.dump(xgb_model,file)
    
with open(r'C:\Users\Omkar\Desktop\Practice\Datasets\Twitter Sentiment Analysis\Artifacts\rfc_model.pkl','wb') as file:
    pkl.dump(rfc_model,file)
    
with open(r'C:\Users\Omkar\Desktop\Practice\Datasets\Twitter Sentiment Analysis\Artifacts\mnb_model.pkl','wb') as file:
    pkl.dump(mnb_model,file)

In [71]:
with open(r'C:\Users\Omkar\Desktop\Practice\Datasets\Twitter Sentiment Analysis\Artifacts\tfidf.pkl','wb') as file:
    pkl.dump(tf,file)
    
with open(r'C:\Users\Omkar\Desktop\Practice\Datasets\Twitter Sentiment Analysis\Artifacts\cv.pkl','wb') as file:
    pkl.dump(cv,file)