In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
oData = pd.read_csv('/content/labeledTrainData.tsv',delimiter='\t',quoting=3)

In [11]:
oData.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [12]:
# checking the data type of data
oData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [13]:
# checking the null value is there in data or not ...
oData.isnull().sum()

Unnamed: 0,0
id,0
sentiment,0
review,0


We See there is no any null value

In [14]:
### check the len,shape and column wise len of the data set
print(len(oData))
print(oData.shape)
print(len(oData['review']))

25000
(25000, 3)
25000


In [15]:
#  taking the stop word from the nlp and downloding them
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Now We can see the stop word are download

In [22]:
print(type(stopwords.words('english')))
print(len(stopwords.words('english')))

<class 'list'>
198


In [24]:
for i in range(len(stopwords.words('english'))):
  print(i,stopwords.words('english')[i])

0 a
1 about
2 above
3 after
4 again
5 against
6 ain
7 all
8 am
9 an
10 and
11 any
12 are
13 aren
14 aren't
15 as
16 at
17 be
18 because
19 been
20 before
21 being
22 below
23 between
24 both
25 but
26 by
27 can
28 couldn
29 couldn't
30 d
31 did
32 didn
33 didn't
34 do
35 does
36 doesn
37 doesn't
38 doing
39 don
40 don't
41 down
42 during
43 each
44 few
45 for
46 from
47 further
48 had
49 hadn
50 hadn't
51 has
52 hasn
53 hasn't
54 have
55 haven
56 haven't
57 having
58 he
59 he'd
60 he'll
61 her
62 here
63 hers
64 herself
65 he's
66 him
67 himself
68 his
69 how
70 i
71 i'd
72 if
73 i'll
74 i'm
75 in
76 into
77 is
78 isn
79 isn't
80 it
81 it'd
82 it'll
83 it's
84 its
85 itself
86 i've
87 just
88 ll
89 m
90 ma
91 me
92 mightn
93 mightn't
94 more
95 most
96 mustn
97 mustn't
98 my
99 myself
100 needn
101 needn't
102 no
103 nor
104 not
105 now
106 o
107 of
108 off
109 on
110 once
111 only
112 or
113 other
114 our
115 ours
116 ourselves
117 out
118 over
119 own
120 re
121 s
122 same
123 shan
1

In [40]:
sSample = oData['review'][0]
sSample

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [46]:
#### creating the normal and plan text from the sSmaple
sSmaple = BeautifulSoup(sSample).get_text()
sSmaple

'With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay  br    br   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him  br    br   The actual feature film bit when it finally sta

In [47]:
# removing the extra text
sSample = re.sub("[^a-zA-Z]",' ',sSample)
sSample

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay  br    br   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him  br    br   The actual feature film bit when it finally st

In [52]:
# Converting to the lower case
sSmaple = sSmaple.lower()

In [53]:
# final string Data
sSample

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay  br    br   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him  br    br   The actual feature film bit when it finally st

In [54]:
# Spliting the text in list form
split_sSample = sSample.split()

In [56]:
len(split_sSample)

445

In [57]:
# sample without stop word
sStopWord = set(stopwords.words('english'))
sSample = [w for w in split_sSample if not w in sStopWord]

In [60]:
print("len of sSmaple after removing stop word :",len(sSample))

len of sSmaple after removing stop word : 239


In [62]:
def fDataClean(sText):
  # Without html
  sText = BeautifulSoup(sText).get_text()
  sText = re.sub("[^a-zA-Z]",' ',sText)
  sText = sText.lower()
  sText = sText.split()
  sStopWord = set(stopwords.words('english'))
  sText = [w for w in sText if not w in sStopWord]

  return (" ".join(sText))


In [67]:
# checking the process of text dara
train_x_tum = []
for i in range(len(oData['review'])):
  if (i+1)%1000 == 0:
    print("No Of Reviews processed = ",i+1)
  train_x_tum.append(fDataClean(oData['review'][i]))

No Of Reviews processed =  1000
No Of Reviews processed =  2000
No Of Reviews processed =  3000
No Of Reviews processed =  4000
No Of Reviews processed =  5000
No Of Reviews processed =  6000
No Of Reviews processed =  7000
No Of Reviews processed =  8000
No Of Reviews processed =  9000
No Of Reviews processed =  10000
No Of Reviews processed =  11000
No Of Reviews processed =  12000
No Of Reviews processed =  13000
No Of Reviews processed =  14000
No Of Reviews processed =  15000
No Of Reviews processed =  16000
No Of Reviews processed =  17000
No Of Reviews processed =  18000
No Of Reviews processed =  19000
No Of Reviews processed =  20000
No Of Reviews processed =  21000
No Of Reviews processed =  22000
No Of Reviews processed =  23000
No Of Reviews processed =  24000
No Of Reviews processed =  25000


In [64]:
oData['clean_review'] = oData['review'].apply(lambda x: fDataClean(x))

In [66]:
oData[['clean_review','review']]

Unnamed: 0,clean_review,review
0,stuff going moment mj started listening music ...,"""With all this stuff going down at the moment ..."
1,classic war worlds timothy hines entertaining ...,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,film starts manager nicholas bell giving welco...,"""The film starts with a manager (Nicholas Bell..."
3,must assumed praised film greatest filmed oper...,"""It must be assumed that those who praised thi..."
4,superbly trashy wondrously unpretentious explo...,"""Superbly trashy and wondrously unpretentious ..."
...,...,...
24995,seems like consideration gone imdb reviews fil...,"""It seems like more consideration has gone int..."
24996,believe made film completely unnecessary first...,"""I don't believe they made this film. Complete..."
24997,guy loser get girls needs build picked stronge...,"""Guy is a loser. Can't get girls, needs to bui..."
24998,minute documentary bu uel made early one spain...,"""This 30 minute documentary Buñuel made in the..."


In [73]:
print(type(train_x_tum))
print(len(train_x_tum))
print(train_x_tum[0])

<class 'list'>
25000
stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually d

In [74]:
#### train and test split of data
x = train_x_tum
y = np.array(oData['sentiment'])

In [80]:
# train and test split of data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [81]:
print("x_train = ",len(x_train))
print("x_test = ",len(x_test))
print("y_train =",len(y_train))
print("y_test =",len(y_test))

x_train =  17500
x_test =  7500
y_train = 17500
y_test = 7500


In [82]:
# creating a bag of word using vectorizer
vectorizer = CountVectorizer(max_features=5000)

x_train_bow = vectorizer.fit_transform(x_train)



In [85]:
x_train_bow = x_train_bow.toarray()

In [92]:
print(x_train_bow)
print(y_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[1 1 1 ... 1 1 0]


In [95]:
# checking the shape
print(x_train_bow.shape)
print(y_train.shape)

(17500, 5000)
(17500,)


In [97]:
randomModel = RandomForestClassifier()

In [98]:
# train the data with random forest classifier
randomModel.fit(x_train_bow,y_train)

In [101]:
x_test_bow = vectorizer.fit_transform(x_test)

In [102]:
x_test_bow = x_test_bow.toarray()

In [103]:
x_test_bow.shape

(7500, 5000)

In [104]:
# now time to fr test data
y_pred = randomModel.predict(x_test_bow)

In [111]:
y_pred

array([0, 0, 0, ..., 0, 1, 1])

In [112]:
y_test

array([0, 1, 0, ..., 0, 0, 0])

In [113]:
error = mean_squared_error(y_test,y_pred)

In [116]:
print("Mean squred Error :- ",error*100 ,"%")

Mean squred Error :-  41.86666666666667 %


In [117]:
acc = roc_auc_score(y_pred,y_test)

In [119]:
print("Accuracy : ",acc*100,"%")

Accuracy :  58.390715270156754 %


In [122]:
randomModel.score(x_test_bow,y_test)

0.5813333333333334