## Load Dataset-

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("IMDB Dataset.csv")

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
X = data["review"]
Y = data["sentiment"]

## Split the data into train and test dataset

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.33)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(33500,) (16500,) (33500,) (16500,)


In [10]:
print(X_train)

32566    Well, this might be one of the funniest movies...
37480    it's embarrassing I had like 3 minutes on my w...
9610     I used to write comments at IMDb, but I don't ...
18324    This was a excellent movie. I deal with a chil...
33105    I know if I was a low budget film maker I woul...
                               ...                        
29636    ...had I watched it in my teenage years. This ...
11878    Boy oh boy oh golly gee,<br /><br />The most i...
33755    This film is replete with sentimentality, unpr...
39168    One of the more obscure of Anthony Mann's West...
36051    A SPECIAL DAY (Ettore Scola - Italy/Canada 197...
Name: review, Length: 33500, dtype: object


In [11]:
print(y_train)

32566    positive
37480    negative
9610     positive
18324    positive
33105    negative
           ...   
29636    negative
11878    negative
33755    negative
39168    positive
36051    positive
Name: sentiment, Length: 33500, dtype: object


## Create a NLP Pipeline to clean reviews data
### - Tokenize
### - Remove Stopwords
### - Perform Stemming

In [12]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [13]:
#init objects
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [14]:
def getStemmedReview(review):
    review = review.lower()
    review = review.replace('<br /><br />',' ')
    
    #Tokenize
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review
    

In [15]:
sample_text = """ One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO."""

In [16]:
getStemmedReview(sample_text)

'one review mention watch 1 oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go'

In [17]:
x_train = [getStemmedReview(review) for review in X_train]
x_test = [getStemmedReview(review) for review in X_test]

In [18]:
#x_train

In [19]:
print(len(x_train))
print(len(x_test))

33500
16500


## Vectorization

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
cv = CountVectorizer(ngram_range = (1,2))
x_vec = cv.fit_transform(x_train)

In [22]:
#print(x_vec)

In [23]:
x_vec.shape

(33500, 1986399)

In [24]:
print(len(cv.get_feature_names()))

1986399


In [25]:
x_test_vec = cv.transform(x_test)

In [26]:
print(x_test_vec.shape)

(16500, 1986399)


## multinomial Naive Bayes

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [28]:
mnb = MultinomialNB()

In [29]:
print(mnb)

MultinomialNB()


In [30]:
mnb.fit(x_vec,y_train)

MultinomialNB()

In [31]:
pred = mnb.predict(x_test_vec)

In [32]:
pred

array(['negative', 'negative', 'positive', ..., 'negative', 'negative',
       'positive'], dtype='<U8')

In [33]:
mnb.score(x_vec,y_train)

0.9953731343283582

In [34]:
mnb.predict_proba(x_test_vec)

array([[1.00000000e+00, 3.94594220e-17],
       [1.00000000e+00, 4.02967995e-25],
       [7.12029857e-09, 9.99999993e-01],
       ...,
       [1.00000000e+00, 4.05825130e-62],
       [1.00000000e+00, 5.79395905e-20],
       [2.91666203e-21, 1.00000000e+00]])

In [35]:
mnb.score(x_test_vec,y_test)

0.8787272727272727

## Multivariant Bernoulli Event Model Naive Bayes

In [36]:
from sklearn.naive_bayes import BernoulliNB

In [37]:
bnb = BernoulliNB()

In [38]:
print(bnb)

BernoulliNB()


In [39]:
bnb.fit(x_vec,y_train)

BernoulliNB()

In [40]:
pred2 = bnb.predict(x_test_vec)

In [41]:
pred2

array(['negative', 'negative', 'positive', ..., 'negative', 'negative',
       'positive'], dtype='<U8')

In [42]:
bnb.score(x_vec,y_train)

0.9936417910447761

In [43]:
bnb.score(x_test_vec,y_test)

0.866909090909091