# Import Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle
import string
string.punctuation
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import Dataset

In [2]:
train_df = pd.read_csv('./movie-review-dataset/train.csv')
valid_df = pd.read_csv('./movie-review-dataset/valid.csv')
test_df = pd.read_csv('./movie-review-dataset/test.csv')

In [3]:
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

(40000, 2)
(5000, 2)
(5000, 2)


In [4]:
print(train_df.isnull().sum())
print(valid_df.isnull().sum())
print(test_df.isnull().sum())

text     0
label    0
dtype: int64
text     0
label    0
dtype: int64
text     0
label    0
dtype: int64


# Analysis

In [5]:
print(train_df.label.value_counts())
print(valid_df.label.value_counts())

0    20019
1    19981
Name: label, dtype: int64
1    2514
0    2486
Name: label, dtype: int64


In [6]:
X_train_val=pd.concat([train_df,valid_df])

# Preprocessing

In [7]:
from textblob import Word

In [8]:
def text_cleaning(df):
    df['text'] = df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    df['text'] = df['text'].str.replace('[^\w\s]','')  #punctuations
    df['text'] = df['text'].str.replace('\d','') #numbers
    df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords.words('english')))
    sil = pd.Series(' '.join(df['text']).split()).value_counts()[-1000:] #rare characters deleting
    df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))
    df['text'] = df['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()])) 
    return df

In [9]:
train_df=text_cleaning(train_df)
train_df.head()

  df['text'] = df['text'].str.replace('[^\w\s]','')  #punctuations
  df['text'] = df['text'].str.replace('\d','') #numbers


In [None]:
valid_df=text_cleaning(valid_df)
print(valid_df.head())
test_df=text_cleaning(test_df)
print(test_df.head())

  df['text'] = df['text'].str.replace('[^\w\s]','')  #punctuations
  df['text'] = df['text'].str.replace('\d','') #numbers


                                                text  label
0  year since sharon stone awarded viewer legcros...      0
1  someone needed make car payment truly awful ma...      0
2  guideline state comment must contain minimum f...      0
3  movie muddled mishmash clichés recent cinema p...      0
4  stan laurel became smaller half alltime greate...      0


  df['text'] = df['text'].str.replace('[^\w\s]','')  #punctuations
  df['text'] = df['text'].str.replace('\d','') #numbers


                                                text  label
0  always wrote series complete stinkfest jim bel...      0
1  st watched dirsteve purcell typical mary kate ...      0
2  movie poorly written directed fell asleep minu...      0
3  interesting thing miryang secret sunshine acto...      1
4  first read berlin meer didnt expect much thoug...      0


In [None]:
train_x=train_df['text']
train_y=train_df['label']

valid_x=train_df['text']
valid_y=train_df['label']

In [None]:
count_vect=CountVectorizer()
count_vect.fit(train_x)

In [None]:
x_train_vect = tfidf_vect.transform(train_x)
x_valid_vect = tfidf_vect.transform(valid_x)
x_test_vect  = tfidf_vect.transform(test_df["text"])

In [61]:
x_train_vect

<40000x144329 sparse matrix of type '<class 'numpy.int64'>'
	with 3902679 stored elements in Compressed Sparse Row format>

# MultinomialNB

In [95]:
mnb = MultinomialNB()
mnb.fit(x_train_vect, train_y)

MultinomialNB()

In [96]:
y_pred_MNB = mnb.predict(x_test_vect)

In [97]:
Acc_MNB = mnb.score(x_train_vect, train_y)
acc_MNB = mnb.score(x_test_vect, test_y)
print ('Train Accuracy : {:.2f}%'.format(Acc_MNB*100))
print ('Test Accuracy : {:.2f}%'.format(acc_MNB*100))

Train Accuracy : 91.35%
Test Accuracy : 86.74%


In [98]:
print(classification_report(test_y,y_pred_MNB))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2495
           1       0.88      0.85      0.87      2505

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



# Linear SVC

In [99]:
Lsvc=LinearSVC()
Lsvc.fit(x_train_vect, train_y)

LinearSVC()

In [100]:
Acc_Lsvc = Lsvc.score(x_train_vect, train_y)
acc_Lsvc = Lsvc.score(x_test_vect, test_y)
print ('Train Accuracy : {:.2f}%'.format(Acc_Lsvc*100))
print ('Test Accuracy : {:.2f}%'.format(acc_Lsvc*100))

Train Accuracy : 98.95%
Test Accuracy : 89.82%


In [102]:
y_pred_Lsvc = Lsvc.predict(x_test_vect)
print(classification_report(test_y, y_pred_Lsvc))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2495
           1       0.89      0.91      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



# Neural Network

In [29]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [30]:
model = Sequential() 
#layers
model.add(Dense(50,input_dim=x_train_count.shape[1], kernel_initializer="uniform", activation="relu")) 
#model.add(Dense(6, kernel_initializer="uniform", activation="relu")) 
model.add(Dense(1, kernel_initializer="uniform", activation="sigmoid")) 
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
model.summary()

In [32]:
history = model.fit(x_train_count, train_y.values.reshape(-1,1), 
validation_data=(x_valid_count,valid_y), nb_epoch=2, batch_size=128)

(40000, 144329)

In [None]:
loss, acc_NN = model.evaluate(x_test_vect, test_df["label"], verbose=0)
print('Test Accuracy: %f' % (acc_NN*100))

# Final Report

In [103]:
output = pd.DataFrame({"Model":['MultinomialNB','Linear SVC','NN'],
                      "Accuracy":[acc_MNB, acc_Lsvc,acc_NN]})
output

Unnamed: 0,Model,Accuracy
0,MultinomialNB,0.8674
1,Linear SVC,0.8982


In [82]:
output = pd.DataFrame({"Model":['MultinomialNB','Linear SVC'],
                      "Accuracy":[acc_MNB, acc_Lsvc]})
output

Unnamed: 0,Model,Accuracy
0,MultinomialNB,0.8628
1,Linear SVC,0.875


# Saving Model

In [73]:
pickle.dump(data_vectorizer, open('transform.pkl', 'wb'))

In [85]:
pickle.dump(mnb, open('mnb.pkl', 'wb'))

In [84]:
pickle.dump(Lsvc, open('lsvc.pkl', 'wb'))

# Testing

In [86]:
mov_list=np.array(["very bad bas dbad "])

In [87]:
vect=pickle.load(open('transform.pkl','rb'))

In [88]:
arr=vect.transform(mov_list)
arr

<1x144329 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [89]:
clf=pickle.load(open('mnb.pkl', 'rb'))

In [90]:
clf.predict(arr)

array([0], dtype=int64)

In [91]:
lsvc=pickle.load(open('LSVC.pkl','rb'))

In [92]:
lsvc.predict(arr)

array([0], dtype=int64)