### Libraries Required

In [1]:
import numpy as np
import pandas as pd

In [6]:
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.shape

(50000, 2)

In [8]:
df=df.iloc[:10000]

In [9]:
df.shape

(10000, 2)

In [10]:
 df['review'][2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [11]:
df['sentiment'].value_counts()

positive    5028
negative    4972
Name: sentiment, dtype: int64

In [12]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [13]:
df.drop_duplicates(inplace=True)

In [14]:
df.shape

(9983, 2)

### Preprocessing

In [15]:
#remove html tags

import re

def remove_tag(raw_text):
    cleaned_text=re.sub(re.compile('<.*?>'),'',raw_text)
    return cleaned_text

In [16]:
df['review']=df['review'].apply(remove_tag)

In [17]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [18]:
#lower case

df['review']=df['review'].apply(lambda x:x.lower())

In [19]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [25]:
#remove stopwords

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prerana\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [26]:
from nltk.corpus import stopwords
sw_list=stopwords.words('english')

In [29]:
df['review']=df['review'].apply(lambda x:[i for i in x.split() if i not in sw_list]).apply(lambda x:" ".join(x))

In [30]:
x=df.iloc[:,0:1]

In [31]:
x

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
9995,"fun, entertaining movie wwii german spy (julie..."
9996,"give break. anyone say ""good hockey movie""? kn..."
9997,movie bad movie. watching endless series bad h...
9998,"movie probably made entertain middle school, e..."


In [32]:
y=df['sentiment']
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
encode=LabelEncoder()

y=encode.fit_transform(y)

In [35]:
y

array([1, 1, 1, ..., 0, 0, 1])

## Model Building

In [36]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=33,stratify=y,test_size=0.2)

In [37]:
#apply bow

from sklearn.feature_extraction.text import CountVectorizer

In [38]:
cv=CountVectorizer()

In [41]:
x_train_bow=cv.fit_transform(x_train['review']).toarray()
x_test_bow=cv.transform(x_test['review']).toarray()

In [42]:
x_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
from sklearn.naive_bayes import GaussianNB

gnb=GaussianNB()

gnb.fit(x_train_bow,y_train)

In [44]:
y_pred=gnb.predict(x_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix

accuracy_score(y_test,y_pred)

0.6219328993490235

In [45]:
confusion_matrix(y_test,y_pred)

array([[706, 286],
       [469, 536]], dtype=int64)

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf=RandomForestClassifier()

rf.fit(x_train_bow,y_train)

In [47]:
y_pred=rf.predict(x_test_bow)

accuracy_score(y_test,y_pred)

0.85678517776665