# Sentiment analysis for Movie IMDB dataset

In [163]:
import re
import nltk
import string
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Read dataset

In [162]:
data = pd.read_csv("IMDB_Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [160]:
data['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [161]:
data['sentiment'][1]

'positive'

## Preprocessing of text

In [23]:
def pretty_print(index):
    print(labels[index] + "\t:\t" + reviews[index][:100] + "...") 

In [27]:
print("labels \t : \t reviews\n")
pretty_print(200)
pretty_print(2000)
pretty_print(654)
pretty_print(897)
pretty_print(1)
pretty_print(666)

labels 	 : 	 reviews

negative	:	Interesting and short television movie describes some of the machinations surrounding Jay Leno's rep...
negative	:	Stranded in Space (1972) MST3K version - a very not good TV movie pilot, for a never to be made seri...
negative	:	Most Lorne Michaels films seem to fail because they're essentially just extended versions of skits t...
positive	:	I managed to see this at the New York International Film Festival in November 2005 with my boyfriend...
positive	:	A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-B...
positive	:	This was a fine example of how an interesting film can be made without using big stars and big effec...


In [29]:
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [30]:
for i in range(len(reviews)):
    if labels[i]=="positive":
        for word in reviews[i].split(" "):
            positive_counts[word] += 1
            total_counts[word] += 1
            
    else:
        for word in reviews[i].split(" "):
            negative_counts[word] += 1
            total_counts[word] += 1
        

In [31]:
# check positive number counts
# observation: many stopwords we dont need for sentiment analysis, normalization necessary
positive_counts.most_common()

[('the', 291924),
 ('and', 165931),
 ('a', 155773),
 ('of', 149175),
 ('to', 128343),
 ('is', 108164),
 ('in', 90424),
 ('that', 62445),
 ('I', 62219),
 ('it', 53368),
 ('this', 51611),
 ('/><br', 48976),
 ('as', 46573),
 ('with', 43289),
 ('was', 42154),
 ('for', 41075),
 ('The', 32894),
 ('but', 32333),
 ('his', 31695),
 ('on', 30604),
 ('film', 28855),
 ('are', 28365),
 ('movie', 26367),
 ('you', 24927),
 ('not', 24815),
 ('have', 23990),
 ('be', 23149),
 ('he', 22817),
 ('by', 22706),
 ('an', 21458),
 ('one', 21304),
 ('at', 20407),
 ('who', 20052),
 ('from', 19604),
 ('all', 17725),
 ('has', 17679),
 ('her', 16754),
 ('like', 15665),
 ('about', 15416),
 ('very', 15174),
 ('they', 15126),
 ('so', 14551),
 ('or', 13997),
 ('more', 13349),
 ('out', 13326),
 ('some', 12906),
 ('just', 12876),
 ('This', 12405),
 ('their', 11803),
 ('when', 11757),
 ('what', 11690),
 ('It', 11326),
 ('good', 11318),
 ('which', 10839),
 ('see', 10693),
 ('my', 10500),
 ('can', 10493),
 ('great', 10331),


## Transform text to Numbers

In [32]:
# create vocabulary
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)

439838


In [33]:
# create an empty vector first as a best practice then add inputs
layer_0 = np.zeros((1,vocab_size))
layer_0

array([[0., 0., 0., ..., 0., 0., 0.]])

In [34]:
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i
word2index

{'Hannah': 0,
 'Luck,': 1,
 'Mercifully': 2,
 'Fumiya': 3,
 'kaante': 4,
 'Empire!': 5,
 '(WWII).': 6,
 'Gretta': 7,
 'Bryan,': 8,
 '"elegant",': 9,
 'noir.<br': 10,
 'Brotheresque': 11,
 'VERSIONS)': 12,
 'blindfolded?"': 13,
 'daunting.': 14,
 '"Gremlins,"': 15,
 'Campy.': 16,
 '(Airplane!),': 17,
 'slash-and-burn': 18,
 '$4.00.': 19,
 "Roberts'": 20,
 'shrank': 21,
 'though....probably': 22,
 "'romantic": 23,
 'half-good': 24,
 'Freezes.': 25,
 'tough-cookie': 26,
 'Kar-Wai;': 27,
 'combat",': 28,
 "superb--he's": 29,
 "'virtues'": 30,
 'employees.<br': 31,
 "'wasteland'": 32,
 'Audiard)': 33,
 'Lester(Bernard': 34,
 'scientist,has': 35,
 'BATMAN;': 36,
 'xenophobia': 37,
 'mexico': 38,
 'while...Without': 39,
 'underfunded': 40,
 'night,prior': 41,
 'recruitment,': 42,
 'Matinees': 43,
 'Winkleman?': 44,
 'Senator.': 45,
 'tinglingly': 46,
 'presidency.<br': 47,
 'exception': 48,
 'setting:': 49,
 'miss?"': 50,
 '"Love")': 51,
 'this,': 52,
 'kneaded': 53,
 'breasts?': 54,
 'it...n

In [164]:
# remove urls
def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"", text)

# remove html
def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

# remove punctuations
def remove_punct(text):
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

data['review'] = data['review'].map(lambda x: remove_url(x))
data['review'] = data['review'].map(lambda x: remove_html(x))
data['review'] = data['review'].map(lambda x: remove_punct(x))


In [165]:
stop = set(stopwords.words("english"))
tokenizer = ToktokTokenizer()

# remove stopwords
def remove_stop(text):
    tokens = tokenizer.tokenize(text)
    text = [token for token in tokens if token.lower() not in stop]
    return " ".join(text)
data['review'] = data['review'].map(remove_stop)
data['review']

0        One reviewers mentioned watching 1 Oz episode ...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        Basically theres family little boy Jake thinks...
4        Petter Matteis Love Time Money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    Catholic taught parochial elementary schools n...
49998    Im going disagree previous comment side Maltin...
49999    one expects Star Trek movies high art fans exp...
Name: review, Length: 50000, dtype: object

In [166]:
# split data into train and test
train_x = data.review[:40000]
test_x = data.review[40000:]

## Convert text to Count vectorizer and TF-IDF features

In [167]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,2))

# transformed train and test reviews into bag of words features
cv_train_reviews = cv.fit_transform(train_x)
cv_test_reviews = cv.transform(test_x)

In [168]:
print("BOW shape of train:", cv_train_reviews.shape)
print("BOW shape of test:", cv_test_reviews.shape)

BOW shape of train: (40000, 2411588)
BOW shape of test: (10000, 2411588)


In [169]:
# tfidf vectorizer
tfidf = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,2))

# transformed train and test reviews into tfidf features
tfidf_train = tfidf.fit_transform(train_x)
tfidf_test = tfidf.transform(test_x)


In [170]:
print("TFidf shape of train:", tfidf_train.shape)
print("TFidf shape of test:", tfidf_test.shape)

TFidf shape of train: (40000, 2411588)
TFidf shape of test: (10000, 2411588)


In [171]:
# Convert categorical labels "positive" and "negative" into "1" and "0"
lb = LabelBinarizer()
sentiment_labels = lb.fit_transform(data['sentiment'])
sentiment_labels.shape

(50000, 1)

In [173]:
# split train and test labels
train_labels = sentiment_labels[:40000]
test_labels = sentiment_labels[40000:]
train_labels

array([[1],
       [1],
       [1],
       ...,
       [1],
       [0],
       [0]])

## Logistic Regression for bag of words

In [174]:
# training the logistic regression model
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

# Count vectorizer model with bag of words
lr_bow_model = lr.fit(cv_train_reviews,train_labels.ravel())
print(lr_bow_model)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [175]:
##Predicting the model for bow features
lr_bow_predict = lr.predict(cv_test_reviews)
print(lr_bow_predict)

[0 0 0 ... 0 0 0]


In [176]:
# accuracy score for bow model
score_bow_model = accuracy_score(test_labels, lr_bow_predict)
score_bow_model

0.6775

In [177]:
# classification report
lr_bow_report = classification_report(test_labels,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

              precision    recall  f1-score   support

    Positive       0.63      0.83      0.72      4993
    Negative       0.76      0.52      0.62      5007

    accuracy                           0.68     10000
   macro avg       0.70      0.68      0.67     10000
weighted avg       0.70      0.68      0.67     10000



## Logistic Regression for TF-IDF features

In [178]:
# fir train data to regressor
lr_tfidf_model = lr.fit(tfidf_train,train_labels.ravel())
print(lr_tfidf_model)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [179]:
# Predicting the model for tfidf features
lr_tfidf_predict = lr.predict(tfidf_test)
print(lr_tfidf_predict)

[0 0 0 ... 0 0 0]


In [180]:
# accuracy score for tfidf features
score_tfidf_model = accuracy_score(test_labels, lr_tfidf_predict)
score_tfidf_model

0.7072

In [181]:
# classification report
lr_tfidf_report = classification_report(test_labels,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.70      0.72      0.71      4993
    Negative       0.71      0.69      0.70      5007

    accuracy                           0.71     10000
   macro avg       0.71      0.71      0.71     10000
weighted avg       0.71      0.71      0.71     10000

