## Load Libraries

In [24]:
import string
from warnings import filterwarnings

import matplotlib.pyplot as plt
import nltk
#nltk.download('stopwords')
import numpy as np
import pandas as pd
import textblob
from keras import layers, models, optimizers
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.preprocessing import text, sequence
from nltk.corpus import stopwords
from sklearn import decomposition, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from textblob import TextBlob

filterwarnings('ignore')
# Fix NumPy's global RNG so NumPy-driven randomness is repeatable across runs.
# NOTE(review): this seeds NumPy only; the Keras backend presumably keeps its own RNG,
# so the neural-network results may still vary between runs — confirm.
seed = 7 
np.random.seed(seed)

<a id = "2"></a><br>
## Load Train, Validation and Test Datasets

In [None]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv",
        "/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv",
        "/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv",
    )
)

In [2]:
# Preview the first rows to confirm the schema: a free-text `text` column and a 0/1 `label`.
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
# Check row count, dtypes and missing values before cleaning.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [4]:
# Class balance of the training labels.
train.label.value_counts()

0    20019
1    19981
Name: label, dtype: int64

In [5]:
# Same class-balance check via groupby: counts non-null `text` rows per label.
train.groupby("label").count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,20019
1,19981


## Data Cleaning

In [6]:
def transformations(dataframe, n_rare=1000):
    """Clean the ``text`` column of a review DataFrame for bag-of-words models.

    Steps, applied in order:

    1. lower-case the text;
    2. strip punctuation (anything that is neither a word character nor whitespace);
    3. strip digits;
    4. drop NLTK English stop words;
    5. drop the ``n_rare`` least frequent words *of this frame*;
    6. lemmatize every remaining word with TextBlob.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a string column named ``text``. The column is overwritten
        in place, so calling this twice on the same frame cleans already
        cleaned text (and removes a further ``n_rare`` words).
    n_rare : int, default 1000
        How many of the least frequent words to remove. ``0`` (or a negative
        value) disables rare-word removal.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with ``text`` replaced by the cleaned,
        single-space-joined tokens.
    """
    docs = dataframe['text'].str.lower()

    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would silently leave punctuation and digits untouched.
    docs = docs.str.replace(r'[^\w\s]', '', regex=True)
    docs = docs.str.replace(r'\d', '', regex=True)

    # A set gives O(1) membership tests instead of scanning the list per token.
    stop_words = set(stopwords.words('english'))
    tokens = docs.apply(lambda doc: [word for word in doc.split() if word not in stop_words])

    # Word frequencies over the whole frame; value_counts sorts descending,
    # so the tail of the index holds the rarest words.
    counts = pd.Series([word for doc in tokens for word in doc]).value_counts()
    # Guard n_rare <= 0: index[-0:] would select *every* word, not none.
    rare_words = set(counts.index[-n_rare:]) if n_rare > 0 else set()

    # Lemmatization needs the NLTK 'wordnet' corpus (nltk.download('wordnet')).
    dataframe['text'] = tokens.apply(
        lambda doc: " ".join(
            textblob.Word(word).lemmatize() for word in doc if word not in rare_words
        )
    )
    return dataframe

In [7]:
# Clean the training text. NOTE: transformations() overwrites the frame's `text` column,
# so re-running this cell re-cleans already-cleaned text and drops another batch of rare words.
train = transformations(train)
train.head()

Unnamed: 0,text,label
0,grew b watching loving thunderbird mate school...,0
1,put movie dvd player sat coke chip expectation...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movie bore...,0
4,im die hard dad army fan nothing ever change g...,1


In [8]:
# Clean the validation text. The rare-word list is computed from the frame passed in,
# i.e. from the validation split itself, not from the training split.
valid = transformations(valid)
valid.head()

Unnamed: 0,text,label
0,year since sharon stone awarded viewer legcros...,0
1,someone needed make car payment truly awful ma...,0
2,guideline state comment must contain minimum f...,0
3,movie muddled mishmash clichés recent cinema p...,0
4,stan laurel became smaller half alltime greate...,0


In [9]:
# Clean the test text. The rare-word list is computed from the frame passed in,
# i.e. from the test split itself, not from the training split.
test = transformations(test)
test.head()

Unnamed: 0,text,label
0,always wrote series complete stinkfest jim bel...,0
1,st watched dirsteve purcell typical mary kate ...,0
2,movie poorly written directed fell asleep minu...,0
3,interesting thing miryang secret sunshine acto...,1
4,first read berlin meer didnt expect much thoug...,0


In [10]:
# Separate the cleaned review text (features) from the 0/1 labels (targets).
train_x, train_y = train['text'], train["label"]
valid_x, valid_y = valid["text"], valid["label"]

In [72]:
# Learn the TF-IDF vocabulary and IDF weights from the training text only,
# so the validation and test splits do not influence the feature space.
vectorizer=TfidfVectorizer()
vectorizer.fit(train_x)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [73]:
# Project every split onto the vocabulary fitted on the training text.
# NOTE: despite the `_count` suffix these hold TF-IDF weights (the vectorizer is a TfidfVectorizer).
x_train_count = vectorizer.transform(train_x)
x_valid_count = vectorizer.transform(valid_x)
x_test_count  = vectorizer.transform(test["text"])

## Multinomial NB

In [74]:
# Baseline: multinomial Naive Bayes with default settings, fitted on the training features.
mnb = MultinomialNB()
mnb.fit(x_train_count, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [75]:
# Predicted labels for the test split, kept for the classification report.
y_pred_MNB = mnb.predict(x_test_count)

In [76]:
# Naming convention: capitalised `Acc_*` is accuracy on the training data,
# lower-case `acc_*` is accuracy on the test split (the one used in the final report).
Acc_MNB = mnb.score(x_train_count, train_y)
acc_MNB = mnb.score(x_test_count, test['label'])
print ('Train Accuracy : {:.2f}%'.format(Acc_MNB*100))
print ('Test Accuracy : {:.2f}%'.format(acc_MNB*100))

Train Accuracy : 91.35%
Test Accuracy : 86.74%


In [77]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test split.
print(classification_report(test['label'],y_pred_MNB))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2495
           1       0.88      0.85      0.87      2505

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



## Linear SVC

In [78]:
# Linear support-vector classifier with default settings, fitted on the same training features.
Lsvc=LinearSVC()
Lsvc.fit(x_train_count, train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [79]:
# Capitalised `Acc_Lsvc` is training accuracy; lower-case `acc_Lsvc` is test accuracy
# (the value that goes into the final report).
Acc_Lsvc = Lsvc.score(x_train_count, train_y)
acc_Lsvc = Lsvc.score(x_test_count, test['label'])
print ('Train Accuracy : {:.2f}%'.format(Acc_Lsvc*100))
print ('Test Accuracy : {:.2f}%'.format(acc_Lsvc*100))

Train Accuracy : 98.96%
Test Accuracy : 89.78%


In [80]:
# Per-class precision / recall / F1 of the LinearSVC predictions on the test split.
y_pred_Lsvc = Lsvc.predict(x_test_count)
print(classification_report(test['label'], y_pred_Lsvc))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2495
           1       0.89      0.91      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



## Neural Network Model

In [102]:
# Small feed-forward binary classifier over the vectorised text:
# one 50-unit hidden layer with dropout, a 6-unit hidden layer, and a sigmoid output.
model = Sequential()
model.add(Dense(50, input_dim=x_train_count.shape[1], kernel_initializer="uniform", activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(6, kernel_initializer="uniform", activation="relu"))
model.add(Dense(1, kernel_initializer="uniform", activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model. `epochs` replaces the Keras 1 keyword `nb_epoch`, which current
# Keras releases reject. Labels are reshaped to (n, 1) to match the single
# sigmoid output, for both the training and the validation targets.
history = model.fit(
    x_train_count,
    train_y.values.reshape(-1, 1),
    validation_data=(x_valid_count, valid_y.values.reshape(-1, 1)),
    epochs=2,
    batch_size=128,
)

Train on 40000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


In [103]:
# evaluate() returns the compiled loss followed by the compiled metrics (here: accuracy).
# acc_NN stays a fraction; it is scaled to a percentage only for printing.
loss, acc_NN = model.evaluate(x_test_count, test["label"], verbose=0)
print('Test Accuracy: %f' % (acc_NN*100))

Test Accuracy: 90.039998


## Final Report

In [None]:
# Test-split accuracy (as a fraction) of each fitted model, one row per model.
output = pd.DataFrame(
    [
        ('MultinomialNB', acc_MNB),
        ('Linear SVC', acc_Lsvc),
        ('NN', acc_NN),
    ],
    columns=["Model", "Accuracy"],
)
output

## Making Prediction

In [46]:
# Two hand-written reviews (one positive, one negative) wrapped in Series so the
# vectorizer receives an iterable of documents.
comment_1 = pd.Series("this film is very nice and good i like it")
comment_2 = pd.Series("no not good look at that shit very bad")

In [47]:
# Vectorise the raw comments with the fitted vectorizer. They are NOT passed through
# transformations(), so they skip the stop-word / lemmatization cleaning applied to the training text.
# NOTE: the names are rebound from Series to vectorised matrices, so this cell must be
# preceded by the cell that defines the raw comments each time it is run.
comment_1  = vectorizer.transform(comment_1)
comment_2 = vectorizer.transform(comment_2)

In [50]:
# `predict_classes` has been removed from Keras models. For a single sigmoid
# output it was equivalent to thresholding the predicted probability at 0.5.
(model.predict(comment_2) > 0.5).astype("int32")

array([[0]], dtype=int32)

## Saving model

In [52]:
import pickle

In [54]:
# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed, which a bare open(...) passed to pickle.dump does not.
with open('transform.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [55]:
# Persist the fitted Multinomial Naive Bayes model (not the neural network).
# The context manager guarantees the file is flushed and closed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)