## Importing the libaries and packages

In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

## Reading the data and analyzing it

In [2]:
df=pd.read_csv('imdb_labelled.txt',sep='\t')

In [3]:
df.head()

Unnamed: 0,Sentence,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
print(f'Input data has {len(df)} rows, {len(df.columns)} columns')

Input data has 748 rows, 2 columns


In [5]:
df['sentiment'].value_counts()

1    386
0    362
Name: sentiment, dtype: int64

In [6]:
print(f"Numbers of missing lebel={df['sentiment'].isnull().sum()}")
print(f"Numbers of missing sentence={df['Sentence'].isnull().sum()}")

Numbers of missing lebel=0
Numbers of missing sentence=0


## Converting all the sentences to lowercase

In [7]:
df['Sentence'] = df['Sentence'].str.lower()
df.head()

Unnamed: 0,Sentence,sentiment
0,"a very, very, very slow-moving, aimless movie ...",0
1,not sure who was more lost - the flat characte...,0
2,attempting artiness with black & white and cle...,0
3,very little music or anything to speak of.,0
4,the best scene in the movie was when gerardo i...,1


## Removing punctuations from the sentences

In [8]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [10]:
df['Sentence'] = df['Sentence'].apply(remove_punctuations)
df.head()

Unnamed: 0,Sentence,sentiment
0,a very very very slowmoving aimless movie abou...,0
1,not sure who was more lost the flat character...,0
2,attempting artiness with black white and clev...,0
3,very little music or anything to speak of,0
4,the best scene in the movie was when gerardo i...,1


## Removing stop words

In [11]:
stop = stopwords.words('english')
print(stop)
len(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

179

In [12]:
df['Sentence'] = df['Sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df.head()

Unnamed: 0,Sentence,sentiment
0,slowmoving aimless movie distressed drifting y...,0
1,sure lost flat characters audience nearly half...,0
2,attempting artiness black white clever camera ...,0
3,little music anything speak,0
4,best scene movie gerardo trying find song keep...,1


## Tokenization (not reqd.)

In [13]:
#not required as vectorization take care of it
import re
def tokenize(txt):
    tokens=re.split('\W+',txt)
    return tokens

df['tokenized']=df['Sentence'].apply(tokenize)
df.head()

Unnamed: 0,Sentence,sentiment,tokenized
0,slowmoving aimless movie distressed drifting y...,0,"[slowmoving, aimless, movie, distressed, drift..."
1,sure lost flat characters audience nearly half...,0,"[sure, lost, flat, characters, audience, nearl..."
2,attempting artiness black white clever camera ...,0,"[attempting, artiness, black, white, clever, c..."
3,little music anything speak,0,"[little, music, anything, speak]"
4,best scene movie gerardo trying find song keep...,1,"[best, scene, movie, gerardo, trying, find, so..."


## Lemmatization

In [14]:
wn=nltk.WordNetLemmatizer()

In [15]:
print(wn.lemmatize('geese'))

goose


In [16]:
def lem(txt):
    words = txt.split() 
    words = [wn.lemmatize(i) for i in words]
    res = ' '.join(words)
    return res

In [17]:
df['lemmatized']=df['Sentence'].apply(lem)
df.head()

Unnamed: 0,Sentence,sentiment,tokenized,lemmatized
0,slowmoving aimless movie distressed drifting y...,0,"[slowmoving, aimless, movie, distressed, drift...",slowmoving aimless movie distressed drifting y...
1,sure lost flat characters audience nearly half...,0,"[sure, lost, flat, characters, audience, nearl...",sure lost flat character audience nearly half ...
2,attempting artiness black white clever camera ...,0,"[attempting, artiness, black, white, clever, c...",attempting artiness black white clever camera ...
3,little music anything speak,0,"[little, music, anything, speak]",little music anything speak
4,best scene movie gerardo trying find song keep...,1,"[best, scene, movie, gerardo, trying, find, so...",best scene movie gerardo trying find song keep...


## Vectorization of data (TF-IDF)

In [18]:

v = TfidfVectorizer()
x = v.fit_transform(df['lemmatized']).toarray()

In [19]:
df1 = pd.DataFrame(x, columns=v.get_feature_names())
df1.head(10)


Unnamed: 0,010,10,1010,110,12,15,18th,1928,1947,1948,...,younger,youre,youthful,youtube,youve,yun,zillion,zombie,zombiestudents,zombiez
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
print(x)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [21]:
y=df['sentiment'].values

In [22]:
print(y)

[0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1
 1 1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1
 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 0 0 1 

## Splitting the data

In [23]:

X_train, Xtest, Y_train, Y_test = train_test_split(x,y,test_size=0.20,random_state=0)

In [24]:
X_train.shape


(598, 2846)

In [25]:
Xtest.shape

(150, 2846)

In [26]:
Y_train.shape

(598,)

In [27]:
Y_test.shape

(150,)

## Training and testing the model (classifier used - Naive Bayes)

In [28]:

nb=MultinomialNB()

In [29]:
nb.fit(X_train,Y_train)

MultinomialNB()

In [30]:
nb.score(Xtest,Y_test) #accuracy 76%

0.7666666666666667

## Confusion Matrix

In [31]:
Y_pred=nb.predict(Xtest)

In [32]:

confusion_m=confusion_matrix(Y_test,Y_pred)

In [33]:
print(confusion_m)

[[58 26]
 [ 9 57]]


In [34]:
# Correctly predicted = 58+26 = 115 (out of 150 test samples)