In [40]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from sklearn.metrics import accuracy_score
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('IMDB-Dataset.csv')
df.head

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [3]:
df.shape

(50000, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
df.sentiment.replace({'positive':1,'negative':0},inplace = True)

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# STEPS TO CLEAN THE REVIEWS :
Remove HTML tags<br>
Remove special characters<br>
Convert everything to lowercase<br>
Remove stopwords<br>
Stemming<br>

In [8]:
df.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## Remove HTML tags

In [9]:
def clean_html_tags(mytxt):
    pattern = re.compile(r'<.*?>')
    return re.sub(pattern,'',mytxt)

In [10]:
clean_html_tags(df.review[0])

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [11]:
df.review = df.review.apply(clean_html_tags)

In [12]:
def is_special(text):
    rem = ''
    for i in text:
        if i.isalnum():
            rem = rem + i
        else:
            rem = rem + ' '
    return rem

In [13]:
is_special(df.review[0])

'One of the other reviewers has mentioned that after watching just 1 Oz episode you ll be hooked  They are right  as this is exactly what happened with me The first thing that struck me about Oz was its brutality and unflinching scenes of violence  which set in right from the word GO  Trust me  this is not a show for the faint hearted or timid  This show pulls no punches with regards to drugs  sex or violence  Its is hardcore  in the classic use of the word It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary  It focuses mainly on Emerald City  an experimental section of the prison where all the cells have glass fronts and face inwards  so privacy is not high on the agenda  Em City is home to many  Aryans  Muslims  gangstas  Latinos  Christians  Italians  Irish and more    so scuffles  death stares  dodgy dealings and shady agreements are never far away I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [14]:
df.review = df.review.apply(is_special)

In [15]:
#stop words clean up 

def stop_words_clean_up(mytxt):
    stop_words = set(stopwords.words('english'))
    punct = set(string.punctuation)
    words = word_tokenize(mytxt.lower())
    return [word for word in words if word not in stop_words and word not in punct]     
    

In [16]:
df.review = df.review.apply(stop_words_clean_up)

In [61]:
def stem_convert(mytxt):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(txt) for txt in mytxt])

In [18]:
df.review = df.review.apply(stem_convert)

In [19]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [24]:
#model Creation
x = np.array(df.iloc[:,0].values)
y = np.array(df.sentiment.values)

In [26]:
cv = CountVectorizer(max_features= 1000)

In [27]:
X = cv.fit_transform(df.review).toarray()

In [28]:
print(X.shape)
print(y.shape)

(50000, 1000)
(50000,)


In [33]:
cv_df = pd.DataFrame(data=X,columns = cv.get_feature_names())
print(cv_df)

       10  20  30  80  abil  abl  absolut  accent  accept  achiev  ...  \
0       0   0   0   0     0    0        0       0       0       0  ...   
1       0   0   0   0     0    0        0       0       0       0  ...   
2       0   0   0   0     0    0        0       0       0       0  ...   
3       1   0   0   0     0    0        0       0       0       0  ...   
4       0   0   0   0     0    0        0       0       0       0  ...   
...    ..  ..  ..  ..   ...  ...      ...     ...     ...     ...  ...   
49995   1   1   0   0     0    0        0       0       0       0  ...   
49996   0   0   0   0     0    0        0       0       0       0  ...   
49997   0   0   0   0     0    0        0       0       0       0  ...   
49998   0   0   1   0     0    0        0       0       0       0  ...   
49999   0   0   0   0     0    0        0       0       0       0  ...   

       writer  written  wrong  wrote  year  yes  yet  york  young  zombi  
0           0        0      0      0



In [30]:
#train_test_split
trainx,testx,trainy,testy = train_test_split(X,y,test_size=0.2,random_state=9)
print("Train shapes : X = {}, y = {}".format(trainx.shape,trainy.shape))
print("Test shapes : X = {}, y = {}".format(testx.shape,testy.shape))

Train shapes : X = (40000, 1000), y = (40000,)
Test shapes : X = (10000, 1000), y = (10000,)


In [35]:
print(cv.vocabulary_)
#https://towardsdatascience.com/basics-of-countvectorizer-e26677900f9c#:~:text=The%20CountVectorizer%20will%20select%20the,common%20words%20in%20the%20data.&text=By%20setting%20'binary%20%3D%20True',frequency%20of%20the%20term%2Fword.

{'one': 612, 'review': 722, 'mention': 555, 'watch': 955, 'episod': 282, 'right': 727, 'exact': 296, 'happen': 403, 'first': 345, 'thing': 882, 'scene': 745, 'violenc': 945, 'set': 769, 'word': 982, 'go': 386, 'show': 779, 'heart': 411, 'pull': 682, 'drug': 256, 'sex': 771, 'classic': 151, 'use': 932, 'call': 116, 'given': 385, 'state': 827, 'focus': 352, 'main': 529, 'citi': 148, 'prison': 672, 'front': 366, 'face': 310, 'high': 415, 'home': 423, 'mani': 536, 'italian': 462, 'death': 214, 'deal': 213, 'never': 588, 'far': 322, 'away': 72, 'would': 988, 'say': 742, 'appeal': 52, 'due': 257, 'fact': 311, 'goe': 388, 'forget': 356, 'pretti': 670, 'pictur': 643, 'audienc': 67, 'charm': 138, 'romanc': 733, 'mess': 556, 'around': 56, 'ever': 291, 'saw': 741, 'develop': 230, 'tast': 869, 'got': 392, 'level': 504, 'kill': 477, 'order': 615, 'get': 380, 'well': 962, 'manner': 537, 'middl': 559, 'class': 150, 'turn': 916, 'lack': 485, 'street': 840, 'skill': 793, 'experi': 304, 'may': 546, 'bec

In [36]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [37]:
gnb, mnb, bnb = GaussianNB(),MultinomialNB(alpha=1.0,fit_prior=True),BernoulliNB(alpha=1.0,fit_prior=True)

In [38]:
gnb.fit(trainx,trainy)
mnb.fit(trainx,trainy)
bnb.fit(trainx,trainy)
ypg = gnb.predict(testx)
ypm = mnb.predict(testx)
ypb = bnb.predict(testx)

In [41]:
print("Gaussian = ",accuracy_score(testy,ypg))
print("Multinomial = ",accuracy_score(testy,ypm))
print("Bernoulli = ",accuracy_score(testy,ypb))

Gaussian =  0.7843
Multinomial =  0.831
Bernoulli =  0.8386


In [42]:
pickle.dump(bnb,open('MRmodel.pkl','wb'))

In [76]:
rev2 = '''One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare. Forget pretty pictures painted for mainstream audiences, forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence. Not just violence, but injustice (crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience) Watching Oz, you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side.'''

In [43]:
rev =  """Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy. Im actually pretty disgusted that this movie is making the money it is - what does it say about the people who brainlessly hand over the hard earned cash to be 'entertained' in this fashion and then come here to leave a positive 8.8 review?? Oh yes, they are morons. Its the only sensible conclusion to draw. How anyone can rate this movie amongst the pantheon of great titles is beyond me.

So trying to find something constructive to say about this title is hard...I enjoyed Iron Man? Tony Stark is an inspirational character in his own movies but here he is a pale shadow of that...About the only 'hook' this movie had into me was wondering when and if Iron Man would knock Captain America out...Oh how I wished he had :( What were these other characters anyways? Useless, bickering idiots who really couldn't organise happy times in a brewery. The film was a chaotic mish mash of action elements and failed 'set pieces'...

I found the villain to be quite amusing.

And now I give up. This movie is not robbing any more of my time but I felt I ought to contribute to restoring the obvious fake rating and reviews this movie has been getting on IMDb."""

In [77]:
stage1 = clean_html_tags(rev2)
stage2 = is_special(stage1)
stage3 = stop_words_clean_up(stage2)
stage4 = stem_convert(stage3)

In [52]:
word_vocabulary = cv.vocabulary_

In [47]:
pickle.dump(word_vocabulary,open('word_vocabulary.pkl','wb'))

In [63]:
stage4.count()

'terribl complet trash brainless tripe insult anyon 8 year old fan boy im actual pretti disgust movi make money say peopl brainless hand hard earn cash entertain fashion come leav posit 8 8 review oh yes moron sensibl conclus draw anyon rate movi amongst pantheon great titl beyond tri find someth construct say titl hard enjoy iron man toni stark inspir charact movi pale shadow hook movi wonder iron man would knock captain america oh wish charact anyway useless bicker idiot realli organis happi time breweri film chaotic mish mash action element fail set piec found villain quit amus give movi rob time felt ought contribut restor obvious fake rate review movi get imdb'

In [78]:
inp = []
for i in word_vocabulary:
    inp.append(stage4.count(i[0]))
y_pred = bnb.predict(np.array(inp).reshape(1,1000))

In [79]:
if y_pred:
    print("Positive command")
else:
    print("Neagtive command")

Positive command
