In [1]:
import nltk

In [2]:
import pandas as pd

In [3]:
#Reading the dataset to memory
data_frame=pd.read_csv("Dataset/IMDB Dataset.csv")

In [4]:
#Columns in my Dataframe
print(list(data_frame))

['review', 'sentiment']


In [5]:
data_frame

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
#Making all review to lowercase
data_frame['review'] = data_frame['review'].str.lower()

In [7]:
data_frame

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
49995,i thought this movie did a down right good job...,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,i am a catholic taught in parochial elementary...,negative
49998,i'm going to have to disagree with the previou...,negative


In [8]:
#Function to split string to tokens
def identify_tokens(row):
    tokens = nltk.word_tokenize(row)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

In [9]:
#Tokenization of DataFrame
data_frame['words'] = data_frame["review"].apply(identify_tokens)

In [10]:
data_frame

Unnamed: 0,review,sentiment,words
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production. <br /><br />the...,positive,"[a, wonderful, little, production, br, br, the..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there's a family where a little boy ...,negative,"[basically, there, a, family, where, a, little..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone..."
...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,..."
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el..."
49998,i'm going to have to disagree with the previou...,negative,"[i, going, to, have, to, disagree, with, the, ..."


In [11]:
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

In [12]:
#Funtion for stemming the list of words
def stem_list(row):
    stemmed_list = [stemming.stem(word) for word in row]
    return (stemmed_list)

In [13]:
#Stemming of the dataframe
data_frame['stemmed_words'] = data_frame["words"].apply(stem_list)

In [14]:
data_frame

Unnamed: 0,review,sentiment,words,stemmed_words
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, of, the, other, review, ha, mention, tha..."
1,a wonderful little production. <br /><br />the...,positive,"[a, wonderful, little, production, br, br, the...","[a, wonder, littl, product, br, br, the, film,..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[i, thought, thi, wa, a, wonder, way, to, spen..."
3,basically there's a family where a little boy ...,negative,"[basically, there, a, family, where, a, little...","[basic, there, a, famili, where, a, littl, boy..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone...","[petter, mattei, love, in, the, time, of, mone..."
...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,...","[i, thought, thi, movi, did, a, down, right, g..."
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el...","[i, am, a, cathol, taught, in, parochi, elemen..."
49998,i'm going to have to disagree with the previou...,negative,"[i, going, to, have, to, disagree, with, the, ...","[i, go, to, have, to, disagre, with, the, prev..."


In [15]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

In [16]:
#Function to remove stop words from the list of words
def remove_stops(row):
    meaningful_words = [w for w in row if not w in stops]
    return (meaningful_words)

In [17]:
#Removing stopwords from the Dataframe
data_frame['refined_review'] = data_frame["stemmed_words"].apply(remove_stops)

In [18]:
data_frame

Unnamed: 0,review,sentiment,words,stemmed_words,refined_review
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, of, the, other, review, ha, mention, tha...","[one, review, ha, mention, watch, oz, episod, ..."
1,a wonderful little production. <br /><br />the...,positive,"[a, wonderful, little, production, br, br, the...","[a, wonder, littl, product, br, br, the, film,...","[wonder, littl, product, br, br, film, techniq..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[i, thought, thi, wa, a, wonder, way, to, spen...","[thought, thi, wa, wonder, way, spend, time, h..."
3,basically there's a family where a little boy ...,negative,"[basically, there, a, family, where, a, little...","[basic, there, a, famili, where, a, littl, boy...","[basic, famili, littl, boy, jake, think, zombi..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone...","[petter, mattei, love, in, the, time, of, mone...","[petter, mattei, love, time, money, visual, st..."
...,...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,...","[i, thought, thi, movi, did, a, down, right, g...","[thought, thi, movi, right, good, job, wa, cre..."
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir...","[bad, plot, bad, dialogu, bad, act, idiot, dir..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el...","[i, am, a, cathol, taught, in, parochi, elemen...","[cathol, taught, parochi, elementari, school, ..."
49998,i'm going to have to disagree with the previou...,negative,"[i, going, to, have, to, disagree, with, the, ...","[i, go, to, have, to, disagre, with, the, prev...","[go, disagre, previou, comment, side, maltin, ..."


In [19]:
#Function to join the processed words
def rejoin_words(row):
    joined_words = ( " ".join(row))
    return joined_words

In [20]:
#Joining the processed words in the data_frame
data_frame['processed'] = data_frame["refined_review"].apply(rejoin_words)

In [21]:
data_frame

Unnamed: 0,review,sentiment,words,stemmed_words,refined_review,processed
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, of, the, other, review, ha, mention, tha...","[one, review, ha, mention, watch, oz, episod, ...",one review ha mention watch oz episod hook rig...
1,a wonderful little production. <br /><br />the...,positive,"[a, wonderful, little, production, br, br, the...","[a, wonder, littl, product, br, br, the, film,...","[wonder, littl, product, br, br, film, techniq...",wonder littl product br br film techniqu veri ...
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[i, thought, thi, wa, a, wonder, way, to, spen...","[thought, thi, wa, wonder, way, spend, time, h...",thought thi wa wonder way spend time hot summe...
3,basically there's a family where a little boy ...,negative,"[basically, there, a, family, where, a, little...","[basic, there, a, famili, where, a, littl, boy...","[basic, famili, littl, boy, jake, think, zombi...",basic famili littl boy jake think zombi hi clo...
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, love, in, the, time, of, mone...","[petter, mattei, love, in, the, time, of, mone...","[petter, mattei, love, time, money, visual, st...",petter mattei love time money visual stun film...
...,...,...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,...","[i, thought, thi, movi, did, a, down, right, g...","[thought, thi, movi, right, good, job, wa, cre...",thought thi movi right good job wa creativ ori...
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogu, bad, act, idiot, dir...","[bad, plot, bad, dialogu, bad, act, idiot, dir...",bad plot bad dialogu bad act idiot direct anno...
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el...","[i, am, a, cathol, taught, in, parochi, elemen...","[cathol, taught, parochi, elementari, school, ...",cathol taught parochi elementari school nun ta...
49998,i'm going to have to disagree with the previou...,negative,"[i, going, to, have, to, disagree, with, the, ...","[i, go, to, have, to, disagre, with, the, prev...","[go, disagre, previou, comment, side, maltin, ...",go disagre previou comment side maltin thi one...


In [22]:
#x contains review and y contain sentiment
x=data_frame.iloc[:,5].values
y=data_frame.iloc[:,1].values

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
#Splitting into training and testing data
x_train,y_train,x_test,y_test=train_test_split(x,y,test_size=0.33)

In [25]:
y_train.shape

(16500,)

In [26]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
model=Pipeline([
    ('tfdif',TfidfVectorizer()),
    ('classifier',MultinomialNB())
])

In [28]:
#Training the model
model.fit(x_train,x_test)

Pipeline(memory=None,
         steps=[('tfdif',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

# Validation of the model with new reviews

In [29]:
model.predict(["This movie is just crap. Even though the directors claim to be part of that oi-culture, it's still a very, very bad directorial debut. The topic itself is very interesting and I accept the bad acting ..."])

array(['negative'], dtype='<U8')

In [30]:
model.predict(["this movie is quite bad, aggressive, not played well, not directed well, seems low budget, low quality,emotionaly weak and disconnected."])

array(['negative'], dtype='<U8')

In [31]:
model.predict(["a perfect film to watch during the holiday season as the winter/Xmas atmosphere that Burton creates for Gotham City is way cool."])

array(['positive'], dtype='<U8')

In [32]:
model.predict(["I like Noel Coward, the wit. I like Noel Coward, the play write. I like Noel Coward, the composer and singer, but I loathe Noel Coward the actor."])

array(['positive'], dtype='<U8')

In [33]:
#Predicting the output
y_pred=model.predict(y_train)

86.12121212121212

In [34]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [35]:
accuracy_score(y_test,y_pred)*100

86.12121212121212

In [36]:
confusion_matrix(y_test,y_pred)

array([[7218, 1032],
       [1258, 6992]], dtype=int64)

In [37]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

    negative       0.85      0.87      0.86      8250
    positive       0.87      0.85      0.86      8250

    accuracy                           0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500

