## Load Dataset-

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("IMDB Dataset.csv")

In [4]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
X = data["review"]
Y = data["sentiment"]

## Split the data into train and test dataset

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.33)

In [8]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(33500,) (16500,) (33500,) (16500,)


In [9]:
print(X_train)

38984    Holy cow, what a piece of sh*t this movie is. ...
5977     This if the first movie I've given a 10 to in ...
9936     I thought it was one of the best sequels I hav...
23750    Where is it written that sequels must suck? Sc...
9021     This couldn't have been better. The strong res...
                               ...                        
35128    This movie seems as if someone had a cute idea...
16669    Based on a True Story . . .<br /><br />The pre...
41014    I had been looking forward to How to Lose Frie...
36153    There are movies that are leaders, and movies ...
13885    First,I'll give my rating for the series overa...
Name: review, Length: 33500, dtype: object


In [10]:
print(y_train)

38984    negative
5977     positive
9936     positive
23750    negative
9021     positive
           ...   
35128    negative
16669    positive
41014    positive
36153    negative
13885    positive
Name: sentiment, Length: 33500, dtype: object


## Create a NLP Pipeline to clean reviews data
### - Tokenize
### - Remove Stopwords
### - Perform Stemming

In [11]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [12]:
#init objects
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [13]:
def getStemmedReview(review):
    review = review.lower()
    review = review.replace('<br /><br />',' ')
    
    #Tokenize
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review
    

In [14]:
sample_text = """ One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO."""

In [15]:
getStemmedReview(sample_text)

'one review mention watch 1 oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go'

In [16]:
x_train = [getStemmedReview(review) for review in X_train]
x_test = [getStemmedReview(review) for review in X_test]

In [17]:
x_train

['holi cow piec sh movi filmmak could take 250 word book turn movi guess know either rememb fart belch book took time children classic ad fart belch sexual inuindo prostitut kaka joke give good idea hollywood produc think like say visual interest brilliant visual stori ruin toilet humor even think kind thing funni want kid know think take kid see rent dvd hope ghost doctor suess ghost come haunt peopl made movi',
 'first movi given 10 year ever movi need word mouth promot 4 mil box disgrac peopl know appreci blue good use excel music alon reason go see mani peopl knew jackson could sing damn fine hear book movi taunt salvat see never abl forgiv trivial use word ye gritti sexi home truth bizarr face real best reason see movi get mean stay away see anoth week',
 'thought one best sequel seen sometim felt though would want someon die stanley kill annoy charact brilliant well done movi happi die problem scene look like someon home camera film weird judd nelson cute least opinion excel role

In [18]:
print(len(x_train))
print(len(x_test))

33500
16500


## Vectorization

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
cv = CountVectorizer(ngram_range = (1,2))
x_vec = cv.fit_transform(x_train)

In [21]:
print(x_vec)

  (0, 835931)	1
  (0, 396464)	1
  (0, 1313288)	1
  (0, 1567280)	1
  (0, 1159022)	3
  (0, 659372)	1
  (0, 387883)	1
  (0, 1732020)	2
  (0, 13421)	1
  (0, 1958310)	1
  (0, 213476)	2
  (0, 1828823)	1
  (0, 781961)	1
  (0, 964493)	2
  (0, 539059)	1
  (0, 1444280)	1
  (0, 631062)	2
  (0, 177020)	2
  (0, 1797366)	1
  (0, 1783415)	1
  (0, 308056)	1
  (0, 323596)	1
  (0, 40896)	1
  (0, 1566528)	1
  (0, 912028)	1
  :	:
  (33499, 1291936)	1
  (33499, 1326590)	1
  (33499, 1802403)	1
  (33499, 648280)	1
  (33499, 957907)	1
  (33499, 648312)	1
  (33499, 1153976)	1
  (33499, 9031)	1
  (33499, 1009746)	1
  (33499, 931080)	1
  (33499, 118934)	1
  (33499, 213755)	1
  (33499, 1030344)	1
  (33499, 260783)	1
  (33499, 340283)	1
  (33499, 1942951)	1
  (33499, 1292017)	1
  (33499, 1761147)	1
  (33499, 1327288)	1
  (33499, 862735)	1
  (33499, 948928)	1
  (33499, 679004)	1
  (33499, 243753)	1
  (33499, 1863652)	1
  (33499, 305883)	1


In [22]:
x_vec.shape

(33500, 1993213)

In [23]:
print(len(cv.get_feature_names()))

1993213


In [24]:
x_test_vec = cv.transform(x_test)

In [25]:
print(x_test_vec.shape)

(16500, 1993213)


## multinomial Naive Bayes

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
mnb = MultinomialNB()

In [28]:
print(mnb)

MultinomialNB()


In [29]:
mnb.fit(x_vec,y_train)

MultinomialNB()

In [30]:
pred = mnb.predict(x_test_vec)

In [31]:
pred

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [32]:
mnb.score(x_vec,y_train)

0.9954925373134328

In [33]:
mnb.predict_proba(x_test_vec)

array([[1.25086854e-21, 1.00000000e+00],
       [6.36019308e-06, 9.99993640e-01],
       [1.93833929e-08, 9.99999981e-01],
       ...,
       [4.85635440e-24, 1.00000000e+00],
       [6.93567376e-03, 9.93064326e-01],
       [1.49494850e-42, 1.00000000e+00]])

In [34]:
mnb.score(x_test_vec,y_test)

0.8812121212121212

## Multivariant Bernoulli Event Model Naive Bayes

In [35]:
from sklearn.naive_bayes import BernoulliNB

In [36]:
bnb = BernoulliNB()

In [37]:
print(bnb)

BernoulliNB()


In [38]:
bnb.fit(x_vec,y_train)

BernoulliNB()

In [39]:
pred2 = bnb.predict(x_test_vec)

In [40]:
pred2

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'positive'], dtype='<U8')

In [41]:
bnb.score(x_vec,y_train)

0.9941791044776119

In [42]:
bnb.score(x_test_vec,y_test)

0.8698181818181818