In [1]:
import pandas as pd
import numpy as np

In [2]:
reviews = pd.read_csv('IMDB_reviews.csv')

In [3]:
reviews.sample(10)

Unnamed: 0,review,sentiment
7292,"Fascinating movie, based on a true story, abou...",positive
29328,I just watched National Lampoon's Christmas Va...,negative
8756,I enjoyed the innocence of this film and how t...,positive
26355,"The sun was not shining, it was too wet to pla...",negative
40804,Ever notice how in his later movies Burt Reyno...,negative
9681,If you are 10 years old and never seen a movie...,negative
11605,Primal Species comes from B Movie legend Roger...,negative
3547,"I regard this loving, and sensitively written ...",positive
44323,The only thing that surprises me more than the...,negative
27908,When I was a kid I remembered this show but th...,positive


In [4]:
reviews.shape

(50000, 2)

In [5]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


### TEXT CLEANING

In [6]:
# Removing html tags from data

# test code
import re
text = "The cast played Shakespeare.<br /><br />Shakespeare lost.<br /><br />I appreciate that this is trying to bring Shakespeare to the masses, but why ruin something so good.<br /><br />Is it because 'The Scottish Play' is my favorite Shakespeare? I do not know. What I do know is that a certain Rev Bowdler (hence bowdlerization) tried to do something similar in the Victorian era.<br /><br />In other words, you cannot improve perfection.<br /><br />I have no more to write but as I have to write at least ten lines of text (and English composition was never my forte I will just have to keep going and say that this movie, as the saying goes, just does not cut it."
clean = re.compile('<.*?>')
re.sub(clean,'',text)

"The cast played Shakespeare.Shakespeare lost.I appreciate that this is trying to bring Shakespeare to the masses, but why ruin something so good.Is it because 'The Scottish Play' is my favorite Shakespeare? I do not know. What I do know is that a certain Rev Bowdler (hence bowdlerization) tried to do something similar in the Victorian era.In other words, you cannot improve perfection.I have no more to write but as I have to write at least ten lines of text (and English composition was never my forte I will just have to keep going and say that this movie, as the saying goes, just does not cut it."

In [7]:
def remove_tags(text):
    """Return `text` with every `<...>` span deleted.

    The non-greedy pattern matches from a '<' up to the nearest following
    '>'; each match is replaced by the empty string, so the text on either
    side of a tag is joined with no separator.
    """
    return re.sub('<.*?>', '', text)

reviews['review'] = reviews['review'].apply(remove_tags)

In [8]:
reviews['review'][1]

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [9]:
# converting to lower case
def text_lower(text):
    """Return a lower-cased copy of `text` (delegates to `text.lower()`)."""
    lowered = text.lower()
    return lowered

reviews['review'] = reviews['review'].apply(text_lower)

In [10]:
reviews['review'][1]

'a wonderful little production. the filming technique is very unassuming- very old-time-bbc fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. the actors are extremely well chosen- michael sheen not only "has got all the polari" but he has all the voices down pat too! you can truly see the seamless editing guided by the references to williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. a masterful production about one of the great master\'s of comedy and his life. the realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. it plays on our knowledge and our senses, particularly with the scenes concerning orton and halliwell and the sets (particularly of their flat with halliwell\'s murals decorating every surface) are terribly well done.'

In [11]:
# removing special chars
def remove_special_chars(text):
    """Replace every non-alphanumeric character in `text` with a space.

    Characters for which `str.isalnum()` is true are kept unchanged; every
    other character (punctuation, whitespace, symbols) becomes one space,
    so the result has the same length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Build the result in a single join instead of growing a string one
    # character at a time with repeated concatenation.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)



In [12]:
reviews['review'] = reviews['review'].apply(remove_special_chars)

In [13]:
reviews['review'][1]

'a wonderful little production  the filming technique is very unassuming  very old time bbc fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  the actors are extremely well chosen  michael sheen not only  has got all the polari  but he has all the voices down pat too  you can truly see the seamless editing guided by the references to williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  a masterful production about one of the great master s of comedy and his life  the realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  it plays on our knowledge and our senses  particularly with the scenes concerning orton and halliwell and the sets  particularly of their flat with halliwell s murals decorating every surface  are terribly well done '

In [14]:
# Stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [15]:
def stemming(text):
    """Stem each whitespace-separated token of `text` with the module-level
    stemmer `ps` and return the stems joined by single spaces."""
    return " ".join(ps.stem(token) for token in text.split())

In [16]:
reviews['review'] = reviews['review'].apply(stemming)

In [17]:
reviews['review'][1]

'a wonder littl product the film techniqu is veri unassum veri old time bbc fashion and give a comfort and sometim discomfort sens of realism to the entir piec the actor are extrem well chosen michael sheen not onli ha got all the polari but he ha all the voic down pat too you can truli see the seamless edit guid by the refer to william diari entri not onli is it well worth the watch but it is a terrificli written and perform piec a master product about one of the great master s of comedi and hi life the realism realli come home with the littl thing the fantasi of the guard which rather than use the tradit dream techniqu remain solid then disappear it play on our knowledg and our sens particularli with the scene concern orton and halliwel and the set particularli of their flat with halliwel s mural decor everi surfac are terribl well done'

In [18]:
# removing stopwords and vectorizing

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer. max_features=5000 keeps only the 5000 terms with
# the highest frequency across the corpus; stop_words='english' drops
# scikit-learn's built-in English stop-word list.
# NOTE(review): the reviews are stemmed before they reach this vectorizer, so
# stemmed forms of stop words (e.g. 'veri', 'onli', 'ha') probably no longer
# match that list and may be kept as features -- confirm by inspecting the
# fitted cv.vocabulary_.
cv = CountVectorizer(
    max_features=5000,
    stop_words='english',
)

In [20]:
# Learn the vocabulary from all reviews and build the document-term count
# matrix, then convert it to a dense array.
# NOTE(review): the vectorizer is fitted on every review before the
# train/test split performed later in the notebook, so the vocabulary also
# reflects the held-out reviews -- consider fitting on the training part only.
# NOTE(review): .toarray() materialises the whole matrix densely (the shape
# displayed below is 50000 x 5000); presumably kept because GaussianNB does
# not take sparse input -- confirm before removing it.
X = cv.fit_transform(reviews['review']).toarray()

In [21]:
X.shape

(50000, 5000)

In [22]:
X[0].max()

6

In [23]:
def replace(text):
    """Encode a sentiment label as an integer class.

    Args:
        text: A sentiment label, either the raw string or an already
            encoded integer.

    Returns:
        0 for the string 'negative' or an already-encoded 0; 1 for every
        other value.
    """
    # Accepting 0 keeps the mapping idempotent: applying it a second time to
    # a column that is already encoded must not turn every 0 into 1.
    if text == 'negative' or text == 0:
        return 0
    return 1

In [24]:
reviews['sentiment'] = reviews['sentiment'].apply(replace)

In [25]:
y = reviews['sentiment'].values

In [26]:
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [27]:
# train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [28]:
X_train.shape

(40000, 5000)

In [29]:
X_test.shape

(10000, 5000)

In [30]:
y_train.shape

(40000,)

In [31]:
y_test.shape

(10000,)

In [32]:
from sklearn.naive_bayes import BernoulliNB,GaussianNB,MultinomialNB
clf1 = BernoulliNB()
clf2 = GaussianNB()
clf3 = MultinomialNB()

In [33]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)


In [34]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)


In [35]:
from sklearn.metrics import accuracy_score


In [36]:
print("Bernoulli" , accuracy_score(y_test,y_pred1))
print("Gaussian" , accuracy_score(y_test,y_pred2))
print("Multi" , accuracy_score(y_test,y_pred3))


Bernoulli 0.8473
Gaussian 0.7218
Multi 0.8386


In [79]:
# We choose BernoulliNB: it has the highest test accuracy of the three models (0.8473).

In [38]:
import pickle

In [39]:
# Export the fitted CountVectorizer object. The `with` block closes the file
# as soon as the dump finishes instead of leaving the handle open.
with open('countvector.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [40]:
# Create the pickle file for the model (clf1, the BernoulliNB classifier).
# The `with` block closes the file as soon as the dump finishes instead of
# leaving the handle open.
with open('Model.pkl', 'wb') as model_file:
    pickle.dump(clf1, model_file)