# IMDB Movie sentiment analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
df=pd.read_csv('IMDB Dataset.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.shape

(50000, 2)

In [6]:
# create a column where 1 means positive and 0 means negative
df['category']=df['sentiment'].apply(lambda x:1 if x=='positive' else 0)

In [7]:
df.head()

Unnamed: 0,review,sentiment,category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [8]:
# check if target labels are balanced
df['category'].value_counts()

1    25000
0    25000
Name: category, dtype: int64

In [9]:
# split the data
X=df['review']
y=df['category']

In [10]:
X,y

(0        One of the other reviewers has mentioned that ...
 1        A wonderful little production. <br /><br />The...
 2        I thought this was a wonderful way to spend ti...
 3        Basically there's a family where a little boy ...
 4        Petter Mattei's "Love in the Time of Money" is...
                                ...                        
 49995    I thought this movie did a down right good job...
 49996    Bad plot, bad dialogue, bad acting, idiotic di...
 49997    I am a Catholic taught in parochial elementary...
 49998    I'm going to have to disagree with the previou...
 49999    No one expects the Star Trek movies to be high...
 Name: review, Length: 50000, dtype: object,
 0        1
 1        1
 2        1
 3        0
 4        1
         ..
 49995    1
 49996    0
 49997    0
 49998    0
 49999    0
 Name: category, Length: 50000, dtype: int64)

In [11]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [12]:
x_train.shape, y_train.shape,x_test.shape

((40000,), (40000,), (10000,))

In [13]:
# from text to numeric value, use countvectorizer (Bag of Words)
v=CountVectorizer()

In [14]:
x_train_new=v.fit_transform(x_train.values)

In [46]:
type(x_train.values) # type of x_train

numpy.ndarray

In [15]:
x_train_new.shape

(40000, 92833)

In [16]:
# train naive bayes model
from sklearn.naive_bayes import MultinomialNB

In [17]:
model=MultinomialNB()
model.fit(x_train_new,y_train)

In [18]:
x_test_new = v.transform(x_test)

In [19]:
x_test_new

<10000x92833 sparse matrix of type '<class 'numpy.int64'>'
	with 1359256 stored elements in Compressed Sparse Row format>

In [20]:
# Performance Evauation

In [21]:
from sklearn.metrics import classification_report

In [22]:
y_pred = model.predict(x_test_new)

In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4965
           1       0.87      0.81      0.84      5035

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.84     10000
weighted avg       0.85      0.85      0.84     10000



In [27]:
from sklearn.pipeline import Pipeline

In [28]:
Model = Pipeline([
    ('v', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [29]:
Model.fit(x_train, y_train)

In [30]:
y_pred=Model.predict(x_test)

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4965
           1       0.87      0.81      0.84      5035

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.84     10000
weighted avg       0.85      0.85      0.84     10000

