# Movie Review Sentiment Analysis (NLP)

Import necessary libraries

In [46]:
# data analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

# nlp packages
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords

# sklearn for modeling
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import label_binarize

# sklearn for evaluation
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, roc_curve, auc
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score


In [3]:
df = pd.read_csv("./data/IMDB-Dataset.csv")

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.sentiment.value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

The dataset is balanced (25,000 reviews per class), so we do not need oversampling or undersampling techniques.

In [6]:
df.review.isnull().sum()

0

In [7]:
len(df)

50000

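The `review` column has no missing values, and the sentiment counts above (25,000 + 25,000) add up to all 50,000 rows, so no sentiment labels are missing either.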

### Binarizing sentiment column to 0 and 1 for binary output.

In [8]:
# Encode the sentiment labels as integers: negative -> 0, positive -> 1.
emotion_dict = {'negative': 0, 'positive': 1}

# Guard so the cell is safe to re-run: pushing already-encoded 0/1 values
# through the mapping a second time would turn every label into NaN.
if not df['sentiment'].isin(list(emotion_dict.values())).all():
    df['sentiment'] = df['sentiment'].map(emotion_dict)

In [9]:
df['sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

### Exploratory Data Analysis

In [11]:
# Keep only the rows labelled positive (sentiment == 1) for the EDA below.
Review_Positives = df.loc[df['sentiment'].eq(1)]
Review_Positives

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
...,...,...
49983,"I loved it, having been a fan of the original ...",1
49985,Imaginary Heroes is clearly the best film of t...,1
49989,I got this one a few weeks ago and love it! It...,1
49992,John Garfield plays a Marine who is blinded by...,1


1. Lower-case the review text

In [14]:
# Preview only: `.str.lower()` returns a new Series that is displayed here but
# not assigned, so `Review_Positives` keeps its original casing after this cell.
Review_Positives['review'].str.lower()

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
4        petter mattei's "love in the time of money" is...
5        probably my all-time favorite movie, a story o...
                               ...                        
49983    i loved it, having been a fan of the original ...
49985    imaginary heroes is clearly the best film of t...
49989    i got this one a few weeks ago and love it! it...
49992    john garfield plays a marine who is blinded by...
49995    i thought this movie did a down right good job...
Name: review, Length: 25000, dtype: object

2. Tokenize the reviews

In [23]:
# Lower-case while building the corpus: `.str.lower()` returns a new Series, so
# the casing has to be applied here to reach the tokenizer. Without it "The" and
# "the" are counted as different tokens, and capitalised stopwords slip past the
# lower-case stopword list used later.
corpus_review_pos = Review_Positives['review'].str.lower().to_list()

In [39]:
def tokenize_reviews(corpus):
    """Tokenize every review and return all tokens as one flat list.

    Parameters
    ----------
    corpus : iterable of str
        The review texts; each one is passed to NLTK's ``word_tokenize``.

    Returns
    -------
    list of str
        The tokens of all reviews concatenated in corpus order (review
        boundaries are not preserved).
    """
    return [token for text in corpus for token in word_tokenize(text)]

In [40]:
tokens_reviews_pos = tokenize_reviews(corpus_review_pos)

In [44]:
freq = FreqDist(tokens_reviews_pos)
freq.most_common(30)

[('the', 339235),
 (',', 282003),
 ('.', 230821),
 ('and', 175937),
 ('a', 163442),
 ('of', 151671),
 ('to', 130801),
 ('is', 114193),
 ('in', 98409),
 ('/', 98198),
 ('>', 98073),
 ('<', 97994),
 ('br', 97954),
 ('it', 93697),
 ('i', 80970),
 ('that', 69345),
 ('this', 69238),
 ("'s", 63203),
 ('as', 50999),
 ('with', 45613),
 ('was', 44849),
 ('for', 44014),
 ('film', 40765),
 ('but', 40534),
 (')', 36776),
 ('movie', 36706),
 ('(', 36182),
 ('his', 33615),
 ('you', 33279),
 ('on', 33149)]

Stopwords, punctuation marks, and fragments of the HTML `<br />` tag are the most frequent tokens.

### Remove stopwords and punctuation

In [47]:
# Base stop list: NLTK's English stopwords followed by every character in
# `string.punctuation`, each as its own single-character entry.
stop_lst = [*stopwords.words('english'), *string.punctuation]
# write a function to remove stopwords and punctuations

def remove_stopwords(tokens, stop_list=None):
    """Filter stopwords, punctuation and links out of a token list.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    stop_list : iterable of str, optional
        Tokens to discard (exact, case-sensitive match). When omitted, the
        notebook-level ``stop_lst`` is looked up at call time, so later
        additions to that list are always picked up.

    Returns
    -------
    list of str
        The surviving tokens, in their original order, with non-ASCII
        characters stripped. Tokens that start with ``'http'`` and tokens
        that are empty after the ASCII stripping are dropped.
    """
    if stop_list is None:
        stop_list = stop_lst

    # Set membership is O(1); scanning a list for every one of millions of
    # tokens is needlessly slow.
    stop_set = set(stop_list)

    tokens_stopped = []
    for word in tokens:
        # Skip stopwords/punctuation and external links.
        if word in stop_set or word.startswith('http'):
            continue

        # Encode/decode round trip drops symbols that are not plain ASCII.
        cleaned = word.encode('ascii', 'ignore').decode()

        # A token made only of non-ASCII characters collapses to '', which
        # would otherwise pollute the frequency counts.
        if cleaned:
            tokens_stopped.append(cleaned)

    return tokens_stopped

In [51]:
# Corpus-specific additions: HTML line-break residue, tokenizer artefacts, and
# words that describe the domain itself rather than the sentiment.
stop_lst.extend([
    'br', "'s", 'film', 'films', 'movies', 'movie',
    "''", '``', "n't", "...",
])

In [52]:
tokens_reviews_pos_nostopwords = remove_stopwords(tokens_reviews_pos, stop_lst)

In [53]:
freq = FreqDist(tokens_reviews_pos_nostopwords)
freq.most_common(30)

[('one', 26537),
 ('like', 17232),
 ('good', 14492),
 ('great', 12811),
 ('story', 12531),
 ('see', 11978),
 ('time', 11867),
 ('would', 11251),
 ('well', 11231),
 ('also', 10727),
 ('really', 10713),
 ('even', 9575),
 ('much', 9067),
 ('first', 8979),
 ('people', 8609),
 ('love', 8455),
 ('best', 8351),
 ('get', 8206),
 ('way', 7619),
 ('many', 7597),
 ('life', 7526),
 ('could', 7176),
 ('think', 7167),
 ('made', 7042),
 ('characters', 6963),
 ('two', 6942),
 ('character', 6755),
 ('seen', 6640),
 ('show', 6575),
 ('watch', 6554)]