In [24]:
import numpy as np
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from collections import Counter
import os
os.chdir("C:/Digital Nest/NLP")

In [25]:
MR = pd.read_csv("MovieReviews.tsv",sep='\t')
print(MR.shape)
print(MR.columns.values)
print(MR.head())

(156060, 4)
['PhraseId' 'SentenceId' 'Phrase' 'Sentiment']
   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  


In [26]:
Counter(MR.Sentiment)

Counter({1: 27273, 2: 79582, 3: 32927, 4: 9206, 0: 7072})

In [27]:
len(Counter(MR.SentenceId))

8529

In [28]:
len(Counter(MR.PhraseId))

156060

In [6]:
MR[MR.SentenceId==1]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [29]:
# remove punctuation marks
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
Phrase2 = list()
for phrase in MR.Phrase:
    phrasewords = tokenizer.tokenize(phrase)
    Phrase2.append(' '.join(phrasewords))
print(len(Phrase2))

156060


In [30]:
print(MR.Phrase[0])
print(Phrase2[0])

A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .
A series of escapades demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amounts to much of a story


In [31]:
# remove numbers
Phrase3 = list()
for string in Phrase2:
    translation = string.maketrans("","","0123456789")
    string = string.translate(translation)
    Phrase3.append(string)
print(len(Phrase3))

156060


In [32]:
# convert into lowercase
Phrase4 = list()
for string in Phrase3:
    string = string.lower()
    Phrase4.append(string)
print(len(Phrase4))

156060


In [34]:
X = pd.DataFrame({"phrase": Phrase4})
print(X.shape)
Y = pd.DataFrame(MR,columns=['Sentiment'])
print(Y.shape)

(156060, 1)
(156060, 1)


In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( X, Y,
                                                   test_size = 0.2,
                                                   random_state = 0)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(124848, 1)
(124848, 1)
(31212, 1)
(31212, 1)


In [51]:
# create TFID vector
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase = True, analyzer = 'word', stop_words = 'english',
                             max_df=100, min_df=1,max_features=500,
                             binary=True)
X_train2 = vectorizer.fit_transform(X_train.phrase)
X_test2 = vectorizer.transform(X_test.phrase)
print(X_train2.shape)
print(X_test2.shape)
print(vectorizer.get_feature_names())

(124848, 500)
(31212, 500)
['absorbing', 'add', 'admirable', 'adolescent', 'adult', 'affair', 'affection', 'affirming', 'aged', 'ages', 'alive', 'allen', 'allows', 'americans', 'angst', 'animal', 'anne', 'annoying', 'answers', 'anti', 'apart', 'apparent', 'appears', 'artificial', 'ask', 'aspects', 'atmosphere', 'attraction', 'avoid', 'balance', 'band', 'basic', 'bears', 'beat', 'begins', 'behavior', 'believable', 'belly', 'bits', 'blade', 'blair', 'blue', 'body', 'bond', 'bore', 'bright', 'brings', 'broken', 'brought', 'brown', 'bullock', 'bunch', 'camp', 'captures', 'casting', 'cause', 'challenging', 'chance', 'changing', 'channel', 'chase', 'check', 'chemistry', 'child', 'chilling', 'chinese', 'christmas', 'cinematography', 'clarity', 'clearly', 'cliched', 'comedic', 'commentary', 'commercial', 'common', 'community', 'complete', 'complicated', 'concerned', 'condition', 'conflict', 'connect', 'conscious', 'constructed', 'conventional', 'conviction', 'courage', 'course', 'creates', 'cr

In [42]:
from sklearn.linear_model import LogisticRegression
LGR = LogisticRegression(penalty='none',solver='lbfgs')
LGR.fit(X_train2,Y_train)
print(LGR.coef_)
print(LGR.intercept_)

  y = column_or_1d(y, warn=True)


[[-7.17168371e-01  6.61865728e-02  5.86176378e-01 -5.86335740e-01
   9.39340629e-01 -2.57174301e-01  2.68084936e-01  1.04195623e+00
   1.20278078e+00 -1.26042145e+00 -4.10755395e-01  8.24739752e-01
   4.01973558e-01  1.56598226e+00 -1.14392446e+00 -3.13619252e-01
  -1.82016945e-01 -7.86288162e-01  4.69633532e-01  1.47648420e+00
   5.27499161e-02  3.21532984e-01 -1.58315660e-01 -1.29582943e+00
  -3.76697291e-01  7.08273025e-01  3.41671403e-01  1.76085825e+00
   1.18858121e+00 -8.35690873e-01 -8.41384050e-02 -8.60860339e-02
   1.83585597e-01 -1.48403542e-01  1.73938193e+00  2.48293669e-02
   4.49290320e-01  3.12121839e-01 -1.26745120e-01  1.51125087e+00
  -8.04334206e-01 -1.15645100e+00 -8.56001950e-01 -5.07197032e-01
   1.25782832e-03  9.05025048e-02  3.65839248e-01  3.64213321e-01
   5.41132856e-01  3.44550404e-01  1.06321387e+00  1.98848626e+00
   6.87888433e-01 -3.45370516e-01  3.12637168e-01  5.73755174e-01
   8.98974651e-01  6.11954243e-01 -2.46442008e-01 -1.25001028e+00
   5.78194

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [43]:
LGR.score(X_train2,Y_train)

0.5132801486607715

In [45]:
predict = LGR.predict(X_test2)
Counter(predict)

Counter({2: 30365, 1: 296, 3: 475, 0: 53, 4: 23})

In [46]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(Y_test,predict))
print(classification_report(Y_test,predict))

[[   15    59  1410    17     3]
 [    6   115  5258    70     4]
 [   13    77 15764   123     5]
 [   10    37  6202   185     7]
 [    9     8  1731    80     4]]
              precision    recall  f1-score   support

           0       0.28      0.01      0.02      1504
           1       0.39      0.02      0.04      5453
           2       0.52      0.99      0.68     15982
           3       0.39      0.03      0.05      6441
           4       0.17      0.00      0.00      1832

    accuracy                           0.52     31212
   macro avg       0.35      0.21      0.16     31212
weighted avg       0.44      0.52      0.37     31212



In [52]:
from sklearn.naive_bayes import BernoulliNB
BNB = BernoulliNB()
BNB.fit(X_train2,Y_train)
print(BNB.score(X_train2,Y_train))

0.5340814430347303


  y = column_or_1d(y, warn=True)


In [54]:
predict2 = BNB.predict(X_test2)
Counter(predict2)

Counter({2: 28020, 3: 1756, 1: 1131, 4: 151, 0: 154})

In [55]:
print(confusion_matrix(Y_test,predict2))
print(classification_report(Y_test,predict2))

[[   59   221  1176    42     6]
 [   59   437  4757   189    11]
 [   26   325 15165   445    21]
 [    9   118  5508   755    51]
 [    1    30  1414   325    62]]
              precision    recall  f1-score   support

           0       0.38      0.04      0.07      1504
           1       0.39      0.08      0.13      5453
           2       0.54      0.95      0.69     15982
           3       0.43      0.12      0.18      6441
           4       0.41      0.03      0.06      1832

    accuracy                           0.53     31212
   macro avg       0.43      0.24      0.23     31212
weighted avg       0.48      0.53      0.42     31212

