## Movie Review Classification with Naive Bayes Classifier in Python

### author: Yi Rong 
### date: 07/14/21

In [1]:
import pandas as pd
import re
import pdb
import json
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.decomposition import PCA
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import time

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

#### Problem 1 – Movie Classification

a) Data Input

In [253]:
# movie reviews keyed by review number (1-based)
dict_review = {
    1: "\"innovative action movie sequences\" - I don't know, it all seem like crap from a Power Rangers episode.",
    2: "And what is wrong with focusing on the messages of a film?",
    3: "I hated the movie and I really wanted to like it.",
    4: "Amazing actually.",
    5: "Honestly, I still found it to be the best of the current wave of live action Disney movie remakes.",
    6: "I think amazing action movie.",
    7: "I really liked it.",
}

b) Data Preparation

In [254]:
# clean text data
porter = PorterStemmer()

# stopwords.words() called without a language returns the stopwords of every
# language shipped in the NLTK corpus. Build that collection once, as a set,
# instead of re-reading it for every single token.
_ALL_STOPWORDS = frozenset(stopwords.words())


def get_clean_text(text):
    """Tokenize, normalize and stem one review.

    Steps: collapse newlines to spaces, tokenize with NLTK's word_tokenize,
    lowercase, drop stopwords / ignored words / tokens that are not purely
    alphabetic, then apply the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (may be empty).
    """
    # remove extra whitespace
    striptext = text.replace('\n\n', ' ').replace('\n', ' ')

    # split text into words and convert them to lowercase
    L_lowercase_words = [word.lower() for word in word_tokenize(striptext)]

    # exclude all the stopwords and non-alphabetic words
    L_ignore = []  # extra words to drop, none by default
    L_drop_stop_nonalpha = [word for word in L_lowercase_words
                            if word not in _ALL_STOPWORDS
                            and word not in L_ignore
                            and word.isalpha()]

    # stemming
    return [porter.stem(word) for word in L_drop_stop_nonalpha]

In [255]:
# run the cleaning pipeline over every review, in dictionary order
L_clean_review = [get_clean_text(review) for review in dict_review.values()]

In [256]:
# rebuild one space-separated string per review from its cleaned tokens
str1 = " "
L_comp_review = [str1.join(tokens) for tokens in L_clean_review]

In [257]:
# Please find cleaned text below:
L_comp_review

['innov action movi sequenc know seem like crap power ranger episod',
 'wrong focus messag film',
 'hate movi realli want like',
 'amaz actual',
 'honestli still found best current wave live action disney movi remak',
 'think amaz action movi',
 'realli like']

c) Data Conversion

In [258]:
# convert words to vectors using Bag of Words (per-review word counts)
vectorizer2 = CountVectorizer(analyzer='word',
                              min_df=1,  # keep words that occur in at least one review
                              ngram_range=(1, 1))  # unigrams only
X2 = vectorizer2.fit_transform(L_comp_review)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer2, "get_feature_names_out"):
    feature_names = list(vectorizer2.get_feature_names_out())
else:
    feature_names = vectorizer2.get_feature_names()

# one row per review, one column per vocabulary word
df_bow2 = pd.DataFrame(X2.toarray(), columns=feature_names)
df_bow2.index = list(dict_review.keys())
df_bow2.index.name = 'Review Number'

In [259]:
# The resulting Bag of Words matrix is shown below:
df_bow2

Unnamed: 0_level_0,action,actual,amaz,best,crap,current,disney,episod,film,focus,...,ranger,realli,remak,seem,sequenc,still,think,want,wave,wrong
Review Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,1,0,0,1,0,0,...,1,0,0,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,1,0,1,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
6,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


d) Train Naive Bayes Classifier

In [260]:
# assign labels for each review, 0 for negative and 1 for positive
# work on a copy so that adding the label column does not also modify df_bow2
df_nbc = df_bow2.copy()

# Reviews 1, 3, 4 and 5 are easy to label. For review 2, I checked the original review:
# it expresses admiration for the movie's messages, so it is labeled positive.
# Reviews 6 and 7 are the test reviews and are left unlabeled (None).
df_nbc["_class"] = [0, 1, 0, 1, 1, None, None]

In [261]:
# likelihood of each vocabulary word given a positive review
is_positive = df_nbc["_class"] == 1
# transpose so that words are rows, then drop the trailing "_class" row
df_nbc_pos = df_nbc.loc[is_positive].T.iloc[:-1]

# add-one (Laplace) smoothing:
# (count of word in positive reviews + 1) / (all word occurrences in positive reviews + vocabulary size)
pos_word_counts = df_nbc_pos.sum(axis=1)
pos_total_words = df_nbc_pos.sum().sum()
pos_vocab_size = df_nbc_pos.shape[0]
df_nbc_pos["prob_pos"] = (pos_word_counts + 1) / (pos_total_words + pos_vocab_size)

In [248]:
df_nbc_pos

Review Number,2,4,5,prob_pos
action,0.0,0.0,1.0,0.042553
actual,0.0,1.0,0.0,0.042553
amaz,0.0,1.0,0.0,0.042553
best,0.0,0.0,1.0,0.042553
crap,0.0,0.0,0.0,0.021277
current,0.0,0.0,1.0,0.042553
disney,0.0,0.0,1.0,0.042553
episod,0.0,0.0,0.0,0.021277
film,1.0,0.0,0.0,0.042553
focus,1.0,0.0,0.0,0.042553


In [262]:
# likelihood of each vocabulary word given a negative review
is_negative = df_nbc["_class"] == 0
# transpose so that words are rows, then drop the trailing "_class" row
df_nbc_neg = df_nbc.loc[is_negative].T.iloc[:-1]

# add-one (Laplace) smoothing:
# (count of word in negative reviews + 1) / (all word occurrences in negative reviews + vocabulary size)
neg_word_counts = df_nbc_neg.sum(axis=1)
neg_total_words = df_nbc_neg.sum().sum()
neg_vocab_size = df_nbc_neg.shape[0]
df_nbc_neg["prob_neg"] = (neg_word_counts + 1) / (neg_total_words + neg_vocab_size)

In [273]:
df_nbc_neg

Review Number,1,3,prob_neg
action,1.0,0.0,0.043478
actual,0.0,0.0,0.021739
amaz,0.0,0.0,0.021739
best,0.0,0.0,0.021739
crap,1.0,0.0,0.043478
current,0.0,0.0,0.021739
disney,0.0,0.0,0.021739
episod,1.0,0.0,0.043478
film,0.0,0.0,0.021739
focus,0.0,0.0,0.021739


e) Prediction

In [263]:
# class priors: share of labeled reviews that are positive / negative
# (unlabeled reviews match neither comparison, so they are not counted)
n_labeled_pos = sum(df_nbc["_class"] == 1)
n_labeled_neg = sum(df_nbc["_class"] == 0)
n_labeled = n_labeled_pos + n_labeled_neg
prob_test_pos = n_labeled_pos / n_labeled
prob_test_neg = n_labeled_neg / n_labeled

In [272]:
prob_test_pos, prob_test_neg

(0.6, 0.4)

In [264]:
# predict test 1 / review 6
# word counts of review 6 over the vocabulary only ("_class" is a label column, not a word)
review_counts = df_nbc.drop(columns="_class").loc[6]
review_counts = review_counts[review_counts > 0]

# multinomial Naive Bayes score: P(class) * product over words of P(word | class) ** count(word)
prob_pos = (df_nbc_pos.loc[review_counts.index, "prob_pos"] ** review_counts).product() * prob_test_pos
prob_neg = (df_nbc_neg.loc[review_counts.index, "prob_neg"] ** review_counts).product() * prob_test_neg

# ties go to the positive class
if prob_pos >= prob_neg:
    print("prediction: class 1, positive")
else:
    print("prediction: class 0, negative")

prediction: class 1, positive


In [267]:
df_nbc_pos["prob_pos"][df_nbc.T[6] != 0]

action    0.042553
amaz      0.042553
movi      0.042553
think     0.021277
Name: prob_pos, dtype: float64

In [268]:
df_nbc_neg["prob_neg"][df_nbc.T[6] != 0]

action    0.043478
amaz      0.021739
movi      0.065217
think     0.021739
Name: prob_neg, dtype: float64

In [265]:
# predict test 2 / review 7
# word counts of review 7 over the vocabulary only ("_class" is a label column, not a word)
review_counts = df_nbc.drop(columns="_class").loc[7]
review_counts = review_counts[review_counts > 0]

# multinomial Naive Bayes score: P(class) * product over words of P(word | class) ** count(word)
prob_pos = (df_nbc_pos.loc[review_counts.index, "prob_pos"] ** review_counts).product() * prob_test_pos
prob_neg = (df_nbc_neg.loc[review_counts.index, "prob_neg"] ** review_counts).product() * prob_test_neg

# ties go to the positive class
if prob_pos >= prob_neg:
    print("prediction: class 1, positive")
else:
    print("prediction: class 0, negative")

prediction: class 0, negative


In [269]:
df_nbc_pos["prob_pos"][df_nbc.T[7] != 0]

like      0.021277
realli    0.021277
Name: prob_pos, dtype: float64

In [271]:
df_nbc_neg["prob_neg"][df_nbc.T[7] != 0]

like      0.065217
realli    0.043478
Name: prob_neg, dtype: float64

Comment: Based on the trained Naive Bayes classifier, test 1 (review 6) is predicted to be a positive review (class 1), while test 2 (review 7) is predicted to be a negative review (class 0). Although test 2 reads like a positive review, its words, "really" and "like", appear only in the negative training reviews. This problem could be mitigated by training on more labeled reviews, which reduces the chance of such coincidences.