# Movie Review Sentiment Analysis

## Importing the libraries

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [9]:
dataset = pd.read_csv('IMDB Dataset.csv')
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Cleaning the texts

In [10]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, len(dataset)):
  review = re.sub('[^a-zA-Z]', ' ', dataset['review'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sounak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
print(corpus[0])

one review mention watch oz episod hook right exactli happen br br first thing struck oz brutal unflinch scene violenc set right word go trust not show faint heart timid show pull punch regard drug sex violenc hardcor classic use word br br call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci not high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc not violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that g

## Creating the Bag of Words model

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[1 1 1 ... 0 0 0]


## Splitting the dataset into the Training set and Test set

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Gaussian Naive Bayes model on the Training set

In [14]:
from sklearn.naive_bayes import GaussianNB
classifierGaussian = GaussianNB()
classifierGaussian.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [15]:
y_pred = classifierGaussian.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [1 0]
 ...
 [1 1]
 [0 1]
 [0 0]]


## Making the Confusion Matrix, Classification Report & Accuracy Score

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred,target_names=['negative','positive']))
print(f'\nAccuracy score is : {accuracy_score(y_test, y_pred)*100}%')

[[4302  733]
 [1646 3319]]
              precision    recall  f1-score   support

    negative       0.72      0.85      0.78      5035
    positive       0.82      0.67      0.74      4965

    accuracy                           0.76     10000
   macro avg       0.77      0.76      0.76     10000
weighted avg       0.77      0.76      0.76     10000


Accuracy score is : 76.21%


## Training the Multinomial Naive Bayes model on the Training set

In [22]:
from sklearn.naive_bayes import MultinomialNB
classifierMultinomial = MultinomialNB()
classifierMultinomial.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Predicting the Test set results

In [23]:
y_pred = classifierMultinomial.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [1 0]
 ...
 [1 1]
 [0 1]
 [0 0]]


## Making the Confusion Matrix, Classification Report & Accuracy Score

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred,target_names=['negative','positive']))
print(f'\nAccuracy score is : {accuracy_score(y_test, y_pred)*100}%')

[[4196  839]
 [ 824 4141]]
              precision    recall  f1-score   support

    negative       0.84      0.83      0.83      5035
    positive       0.83      0.83      0.83      4965

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000


Accuracy score is : 83.37%


## Training the Support Linear Vector model on the Training set

In [27]:
from sklearn.svm import LinearSVC
classifierSVC = LinearSVC()
classifierSVC.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

## Predicting the Test set results

In [30]:
y_pred = classifierSVC.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [1 0]
 ...
 [1 1]
 [0 1]
 [0 0]]


## Making the Confusion Matrix, Classification Report & Accuracy Score

In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred,target_names=['negative','positive']))
print(f'\nAccuracy score is : {accuracy_score(y_test, y_pred)*100}%')

[[4337  698]
 [ 583 4382]]
              precision    recall  f1-score   support

    negative       0.88      0.86      0.87      5035
    positive       0.86      0.88      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000


Accuracy score is : 87.19%


Thus, here we observe that model performance is in order of LinearSVC > MultinomialNB > GaussianNB