In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('IMDB_Dataset.csv')

In [3]:
df.shape

(50000, 2)

In [4]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

# Data Cleaning

In [5]:
#using 10000 sample data
# removing html tags
#removing special characters-Consistency
#making all the text to lower case
#removing stop words
#stemming-('playing','played','plays')---->('play')

In [6]:
df=df.sample(10000)

In [7]:
df.shape

(10000, 2)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 1466 to 18583
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
lc=LabelEncoder()

In [11]:
#df['sentiment']=lc.fit_transform(df['sentiment'])

In [12]:
df

Unnamed: 0,review,sentiment
1466,The is one of the worst spoofs I have ever see...,negative
40692,"Well now, this was certainly a surprise episod...",positive
36060,This is absolutely beyond question the worst m...,negative
12829,"Ah, how refreshing to see a vision of 18th cen...",positive
36395,As with a bunch of guys at school we must give...,positive
...,...,...
16484,Movie didn't have much plot and was uninterest...,negative
8663,This film is mediocre at best. Angie Harmon is...,negative
45041,You believe in God or you don't. You believe i...,negative
26835,"EXTREMITIES is the disturbing, yet riveting sc...",positive


In [13]:
df['sentiment'].replace({'positive':1,'negative':0},inplace=True)

In [14]:
df

Unnamed: 0,review,sentiment
1466,The is one of the worst spoofs I have ever see...,0
40692,"Well now, this was certainly a surprise episod...",1
36060,This is absolutely beyond question the worst m...,0
12829,"Ah, how refreshing to see a vision of 18th cen...",1
36395,As with a bunch of guys at school we must give...,1
...,...,...
16484,Movie didn't have much plot and was uninterest...,0
8663,This film is mediocre at best. Angie Harmon is...,0
45041,You believe in God or you don't. You believe i...,0
26835,"EXTREMITIES is the disturbing, yet riveting sc...",1


In [15]:
#BEfore removing html tags
df.iloc[2].review

"This is absolutely beyond question the worst movie I have ever seen. It is so bad in fact that I plan on renting it again as soon as I can find it. This movie makes 'Plan 9 From Outer Space' look like an Oscar contender. Just LOOKING at the actors makes me want to laugh out loud. I cannot say enough bad things about this movie. It's awfulness aproaches perfection.<br /><br />The plot is based on a terrorist attack with a nuclear weapon in San Francisco (I think). That's as far as I can go ... I am laughing too hard. I know it shouldn't be funny but ..... *LOLOLOLOLOLOLOLOL*<br /><br />MOVE OVER ED WOOD !!!<br /><br />Regard's *DATo*"

In [16]:
#using regex to remove html tags
import re
clean=re.compile('<.*?>')
re.sub(clean,'',df.iloc[2].review)

"This is absolutely beyond question the worst movie I have ever seen. It is so bad in fact that I plan on renting it again as soon as I can find it. This movie makes 'Plan 9 From Outer Space' look like an Oscar contender. Just LOOKING at the actors makes me want to laugh out loud. I cannot say enough bad things about this movie. It's awfulness aproaches perfection.The plot is based on a terrorist attack with a nuclear weapon in San Francisco (I think). That's as far as I can go ... I am laughing too hard. I know it shouldn't be funny but ..... *LOLOLOLOLOLOLOLOL*MOVE OVER ED WOOD !!!Regard's *DATo*"

In [17]:
def clean_html(text):
  clean=re.compile('<.*?>')
  return re.sub(clean,'',text)

In [18]:
df['review']=df['review'].apply(clean_html)

In [19]:
#converting everything to lowercase
def lower_text(text):
    return text.lower()

In [20]:
df['review']=df['review'].apply(lower_text)

In [21]:
df.iloc[2].review

"this is absolutely beyond question the worst movie i have ever seen. it is so bad in fact that i plan on renting it again as soon as i can find it. this movie makes 'plan 9 from outer space' look like an oscar contender. just looking at the actors makes me want to laugh out loud. i cannot say enough bad things about this movie. it's awfulness aproaches perfection.the plot is based on a terrorist attack with a nuclear weapon in san francisco (i think). that's as far as i can go ... i am laughing too hard. i know it shouldn't be funny but ..... *lolololololololol*move over ed wood !!!regard's *dato*"

In [22]:
#removing special characters
def remove_special(text):
    x=''
    for i in text:
        if i.isalnum():
            x=x+i
        else:
            x=x+' '
    return x

In [23]:
remove_special('th%e m@vie was ###3 bo****ring')

'th e m vie was    3 bo    ring'

In [24]:
df['review']=df['review'].apply(remove_special)

In [25]:
df

Unnamed: 0,review,sentiment
1466,the is one of the worst spoofs i have ever see...,0
40692,well now this was certainly a surprise episod...,1
36060,this is absolutely beyond question the worst m...,0
12829,ah how refreshing to see a vision of 18th cen...,1
36395,as with a bunch of guys at school we must give...,1
...,...,...
16484,movie didn t have much plot and was uninterest...,0
8663,this film is mediocre at best angie harmon is...,0
45041,you believe in god or you don t you believe i...,0
26835,extremities is the disturbing yet riveting sc...,1


In [26]:
#removing stop words using Natural language toolkit(NLP)
import nltk
from nltk.corpus import stopwords

In [27]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [28]:
def remove_stopwords(text):
    x=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y

In [None]:
df['review']=df['review'].apply(remove_stopwords)

In [None]:
df

In [None]:
#performing Stemming
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
y=[]
def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z
        

In [None]:
stem_words(['I','loved','loving','it'])

In [None]:
df['review']=df['review'].apply(stem_words)

In [None]:
df

In [None]:
def join_back(list_input):
    return " ".join(list_input)

In [None]:
df['review']=df['review'].apply(join_back)

In [None]:
df['review']

In [None]:
X=df.iloc[:,0:1].values

In [None]:
X.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=500)

In [None]:
X=cv.fit_transform(df['review']).toarray()

In [None]:
X

In [None]:
X.shape

In [None]:
y=df.iloc[:,-1].values

In [None]:
y.shape

# Training the Data

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [None]:
X_train.shape

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB,BernoulliNB

In [None]:
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()

In [None]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)

In [None]:
y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)

In [None]:
y_pred1.shape

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_test,y_pred1)

In [None]:
accuracy_score(y_test,y_pred2)

In [None]:
accuracy_score(y_test,y_pred3)