In [8]:
!ls -lha

total 17M
drwxr-xr-x 1 root root 4.0K Feb 13 06:57 .
drwxr-xr-x 1 root root 4.0K Feb 13 06:52 ..
drwxr-xr-x 4 root root 4.0K Feb  9 14:20 .config
drwx------ 5 root root 4.0K Feb 13 06:55 drive
-rw-r--r-- 1 root root  16M Feb 13 07:01 IMDB-Dataset.csv
drwxr-xr-x 1 root root 4.0K Feb  9 14:20 sample_data


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
# Import the necessary libraries

import numpy as np
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [92]:
# read the csv file (i.e. the dataset)

data = pd.read_csv('/content/drive/MyDrive/Sentiment Analysis on Movie Reviews with Binary Classification/IMDB-Dataset.csv')

In [93]:
print("Shape:", data.shape)

Shape: (50000, 2)


In [94]:
# Show the first 10 rows of dataset

data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [95]:
# Show the brief information of the dataset

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [96]:
# Show the statistical summary of the data

data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [97]:
# Count the number of positive comments & negative comments respectively

data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [98]:
# Replace the positive comments with the label "1"
# Replace the negative comments with the label "0"

# Assign the encoded labels back to the DataFrame instead of calling
# ``replace(..., inplace=True)`` on the ``data.sentiment`` accessor: that is a
# chained in-place update, which pandas deprecates and which no longer
# modifies ``data`` once copy-on-write is enabled.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run; any
# unexpected label maps to NaN and makes ``astype(int)`` fail loudly.
data['sentiment'] = (
    data['sentiment']
    .map({'positive': 1, 'negative': 0, 1: 1, 0: 0})
    .astype(int)
)

In [99]:
# Show the first 10 rows of data after processing

data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


In [100]:
# Show the 101st review (index 100)

data.review[100]

"This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story."

**Now we are going to do some preprocessing on the dataset**

1. Remove HTML tags
2. Remove special characters and symbols
3. Convert all characters to lowercase  
4. Remove stopwords
5. Stemming/Lemmatization (Choose 1 out of 2)

*(Note: Stemming is usually preferred because it is faster than lemmatization, but lemmatization processes words more accurately, so we let the user choose which one to apply.)*

***By default, I use Stemming since it is faster in processing than lemmatization***

In [101]:
# 1. Remove HTML tags

def clean(text):
    """Return ``text`` with every ``<...>`` span (e.g. ``<br />``) removed."""
    # Non-greedy match, so each tag is stripped on its own rather than
    # everything between the first '<' and the last '>'.
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

data.review = data.review.apply(clean)
data.review[100]

"This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story."

In [102]:
# 2. Remove special characters and symbols

def is_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept unchanged;
    everything else (punctuation, symbols, whitespace) becomes a single space,
    so the returned string has the same length as the input.

    Parameters
    ----------
    text : str
        The raw string to sanitise.

    Returns
    -------
    str
        ``text`` with special characters blanked out.
    """
    # Build the result with one join instead of repeated string concatenation.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

data.review = data.review.apply(is_special)
data.review[100]

'This short film that inspired the soon to be full length feature   Spatula Madness   is a hilarious piece that contends against similar cartoons yielding multiple writers  The short film stars Edward the Spatula who after being fired from his job  joins in the fight against the evil spoons  This premise allows for some funny content near the beginning  but is barely present for the remainder of the feature  This film s 15 minute running time is absorbed by some odd ball comedy and a small musical number  Unfortunately not much else lies below it  The plot that is set up doesn t really have time to show  But it s surely follows it plot better than many high budget Hollywood films  This film is worth watching at least a few times  Take it for what it is  and don t expect a deep story '

In [103]:
# 3. Convert all characters to lowercase

def to_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

data.review = data.review.apply(to_lower)
data.review[100]

'this short film that inspired the soon to be full length feature   spatula madness   is a hilarious piece that contends against similar cartoons yielding multiple writers  the short film stars edward the spatula who after being fired from his job  joins in the fight against the evil spoons  this premise allows for some funny content near the beginning  but is barely present for the remainder of the feature  this film s 15 minute running time is absorbed by some odd ball comedy and a small musical number  unfortunately not much else lies below it  the plot that is set up doesn t really have time to show  but it s surely follows it plot better than many high budget hollywood films  this film is worth watching at least a few times  take it for what it is  and don t expect a deep story '

In [104]:
# 4. Remove stopwords

def rem_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    Parameters
    ----------
    text : str
        The string to tokenize (the notebook passes lower-cased text, and the
        membership test below is case-sensitive).

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not in NLTK's
        English stop-word list, in their original order.
    """
    # Load the stop-word list only once and cache it on the function object;
    # this function is applied to every review, so rebuilding the set from the
    # corpus on each call is repeated, loop-invariant work.
    stop_words = getattr(rem_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        rem_stopwords._stop_words = stop_words
    return [w for w in word_tokenize(text) if w not in stop_words]

data.review = data.review.apply(rem_stopwords)
data.review[100]

['short',
 'film',
 'inspired',
 'soon',
 'full',
 'length',
 'feature',
 'spatula',
 'madness',
 'hilarious',
 'piece',
 'contends',
 'similar',
 'cartoons',
 'yielding',
 'multiple',
 'writers',
 'short',
 'film',
 'stars',
 'edward',
 'spatula',
 'fired',
 'job',
 'joins',
 'fight',
 'evil',
 'spoons',
 'premise',
 'allows',
 'funny',
 'content',
 'near',
 'beginning',
 'barely',
 'present',
 'remainder',
 'feature',
 'film',
 '15',
 'minute',
 'running',
 'time',
 'absorbed',
 'odd',
 'ball',
 'comedy',
 'small',
 'musical',
 'number',
 'unfortunately',
 'much',
 'else',
 'lies',
 'plot',
 'set',
 'really',
 'time',
 'show',
 'surely',
 'follows',
 'plot',
 'better',
 'many',
 'high',
 'budget',
 'hollywood',
 'films',
 'film',
 'worth',
 'watching',
 'least',
 'times',
 'take',
 'expect',
 'deep',
 'story']

In [106]:
# 5(a). Stemming

# Let's use stemming as default

def stem_txt(text):
    """Stem each token in ``text`` and return them joined by single spaces.

    ``text`` is an iterable of tokens (the notebook passes the list returned
    by ``rem_stopwords``); the result is one space-separated string.
    """
    stemmer = SnowballStemmer('english')
    stemmed_tokens = map(stemmer.stem, text)
    return " ".join(stemmed_tokens)

data.review = data.review.apply(stem_txt)
data.review[100]

'short film inspir soon full length featur spatula mad hilari piec contend similar cartoon yield multipl writer short film star edward spatula fire job join fight evil spoon premis allow funni content near begin bare present remaind featur film 15 minut run time absorb odd ball comedi small music number unfortun much els lie plot set realli time show sure follow plot better mani high budget hollywood film film worth watch least time take expect deep stori'

In [107]:
data.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1
5,probabl time favorit movi stori selfless sacri...,1
6,sure would like see resurrect date seahunt ser...,1
7,show amaz fresh innov idea 70 first air first ...,0
8,encourag posit comment film look forward watch...,0
9,like origin gut wrench laughter like movi youn...,1


In [108]:
# 5(b). Lemmatization

def lemma_txt(text):
    """Lemmatize each token in ``text`` and return them joined by single spaces.

    ``text`` is an iterable of tokens; the result is one space-separated
    string, mirroring ``stem_txt``.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

Now, we build the model

1. Creating bag of words (BOW)
2. Train test split
3. Defining the models and Training them
4. Prediction and accuracy metrics to choose best model

In [112]:
# 1. Create bag of words

# Target vector: the encoded sentiment column.
y = np.array(data.sentiment.values)

# Limit the vocabulary to 1000 features and turn every preprocessed review
# into a row of token counts. ``X`` is built directly from the vectorizer, so
# no separate copy of the raw review column is needed first.
cv = CountVectorizer(max_features = 1000)
X = cv.fit_transform(data.review).toarray()

print("X.shape = ", X.shape)
print("y.shape = ", y.shape)

X.shape =  (50000, 1000)
y.shape =  (50000,)


In [113]:
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [114]:
print(y)

[1 1 1 ... 0 0 0]


In [116]:
# 2. Train test split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
trainx, testx, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

# Report the shape of each partition.
for template, features, labels in (
    ("Train shapes : X = {}, y = {}", trainx, trainy),
    ("Test shapes : X = {}, y = {}", testx, testy),
):
    print(template.format(features.shape, labels.shape))

Train shapes : X = (40000, 1000), y = (40000,)
Test shapes : X = (10000, 1000), y = (10000,)


In [117]:
# 3. Defining the models and Training them

# One estimator per Naive Bayes variant being compared.
gnb = GaussianNB()
mnb = MultinomialNB(alpha=1.0, fit_prior=True)
bnb = BernoulliNB(alpha=1.0, fit_prior=True)

# Fit all three on the same training split.
gnb.fit(trainx, trainy)
mnb.fit(trainx, trainy)
bnb.fit(trainx, trainy)

In [118]:
# 4. Prediction and accuracy metrics to choose best model

# Predictions of each fitted model on the held-out test split.
ypg = gnb.predict(testx)
ypm = mnb.predict(testx)
ypb = bnb.predict(testx)

# Print the test accuracy of every model for comparison.
for label, predicted in (
    ("Gaussian = ", ypg),
    ("Multinomial = ", ypm),
    ("Bernoulli = ", ypb),
):
    print(label, accuracy_score(testy, predicted))

Gaussian =  0.7843
Multinomial =  0.831
Bernoulli =  0.8386


In [119]:
# Persist the trained Bernoulli Naive Bayes model. The ``with`` block
# guarantees the file is flushed and closed even if pickling fails.
with open('model1.pkl', 'wb') as model_file:
    pickle.dump(bnb, model_file)

In [125]:
# Interactive demo: read a review, preprocess it like the training data,
# vectorize it with the fitted CountVectorizer and classify it with ``bnb``.

# Persist the fitted vocabulary (token -> feature column) once, rather than
# rewriting the file through an unclosed handle on every loop iteration.
word_dict = cv.vocabulary_
with open('bow.pkl', 'wb') as vocab_file:
    pickle.dump(word_dict, vocab_file)

choices = 0

while True:
    input_review = input("Please enter your review (enter e if you want to exit): ")

    if input_review == 'e':
        print("Program terminates")
        break

    try:
        choices = int(input("Enter 0 if you want to do stemming, enter 1 if you want to do lemmatization: "))
    except ValueError:
        # Non-numeric answer: fall back to the notebook's default (stemming)
        # instead of crashing the loop.
        print("Invalid choice, stemming will be used")
        choices = 0

    # Same preprocessing chain that was applied to the training reviews.
    f1 = clean(input_review)
    f2 = is_special(f1)
    f3 = to_lower(f2)
    f4 = rem_stopwords(f3)

    if choices == 0:
        f5 = stem_txt(f4)
    else:
        # NOTE: the training reviews were stemmed, so lemmatized tokens may
        # match fewer vocabulary entries than stemmed ones.
        f5 = lemma_txt(f4)

    # Let the fitted vectorizer build the feature row. This guarantees that
    # whole tokens are counted, that each count lands in the column the model
    # was trained on, and that the row width equals the vocabulary size.
    inp = cv.transform([f5]).toarray()
    y_pred = bnb.predict(inp)

    # ``predict`` returns an array with one label for the single input row.
    if y_pred[0] == 0:
        print("This is a negative review")
    else:
        print("This is a positive review")

Please enter your review (enter e if you want to exit): Terrible. Complete trash. Brainless tripe. Insulting to anyone who isn't an 8 year old fan boy. Im actually pretty disgusted that this movie is making the money it is - what does it say about the people who brainlessly hand over the hard earned cash to be 'entertained' in this fashion and then come here to leave a positive 8.8 review?? Oh yes, they are morons. Its the only sensible conclusion to draw. How anyone can rate this movie amongst the pantheon of great titles is beyond me.  So trying to find something constructive to say about this title is hard...I enjoyed Iron Man? Tony Stark is an inspirational character in his own movies but here he is a pale shadow of that...About the only 'hook' this movie had into me was wondering when and if Iron Man would knock Captain America out...Oh how I wished he had :( What were these other characters anyways? Useless, bickering idiots who really couldn't organise happy times in a brewery. 