In [74]:
import pandas as pd
import numpy as np
import csv
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [75]:
# Load the IMDB reviews CSV into a DataFrame; the path is relative to the
# kernel's current working directory.
data = pd.read_csv("IMDB Dataset.csv")

In [76]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [77]:
# Count missing values per column (the last expression is displayed by the notebook).
data.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [78]:
data.shape

(50000, 2)

In [79]:
type(data)

In [80]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [81]:
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [82]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
# The result is assigned back (instead of `inplace=True`) and cast explicitly,
# because relying on `replace` to silently downcast the object column to an
# integer dtype is deprecated in recent pandas releases. The cast also fails
# loudly if an unexpected label is present. Re-running this cell is a no-op.
sentiment_codes = {"positive": 1, "negative": 0}
data["sentiment"] = data["sentiment"].replace(sentiment_codes).astype("int64")

In [83]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [84]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
49999,No one expects the Star Trek movies to be high...,0


In [85]:
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


In [86]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [87]:
train_data.shape

(40000, 2)

In [88]:
test_data.shape

(10000, 2)

In [89]:
# Build the word index from the training reviews only, so the test split does
# not influence the vocabulary. `num_words=5000` caps the vocabulary used when
# texts are later converted to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

In [90]:
def _encode_reviews(reviews):
    """Turn raw review texts into integer sequences padded/truncated to length 200."""
    token_ids = tokenizer.texts_to_sequences(reviews)
    return pad_sequences(token_ids, maxlen=200)


X_train = _encode_reviews(train_data["review"])
X_test = _encode_reviews(test_data["review"])

In [91]:
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [92]:
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]], dtype=int32)

In [93]:
# Integer sentiment labels taken from the same splits as X_train / X_test.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [94]:
Y_train

Unnamed: 0,sentiment
39087,0
30893,0
45278,1
16398,0
13653,0
...,...
11284,1
44732,1
38158,0
860,1


In [95]:
# Binary sentiment classifier: word embedding -> LSTM -> single sigmoid unit.
# `input_length` is no longer passed to Embedding: the argument is deprecated
# in Keras 3 (it only triggers a warning) and the sequence length is taken
# from the input data when the model is first called.
model = Sequential(
    [
        # input_dim matches the vocabulary cap given to Tokenizer(num_words=5000).
        Embedding(input_dim=5000, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        # One sigmoid output: values above 0.5 are later read as "positive".
        Dense(1, activation="sigmoid"),
    ]
)

# **Model Summary**

In [97]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is
# reported alongside the loss during training and evaluation.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [98]:
# Train for 5 epochs, holding out 20% of the training rows for validation.
# The remaining 32,000 rows in batches of 64 give the 500 steps per epoch
# shown in the training log.
history = model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 652ms/step - accuracy: 0.7277 - loss: 0.5165 - val_accuracy: 0.8512 - val_loss: 0.3465
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 673ms/step - accuracy: 0.8567 - loss: 0.3438 - val_accuracy: 0.8569 - val_loss: 0.3409
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 655ms/step - accuracy: 0.8786 - loss: 0.3020 - val_accuracy: 0.8565 - val_loss: 0.3878
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 650ms/step - accuracy: 0.8967 - loss: 0.2597 - val_accuracy: 0.8724 - val_loss: 0.3100
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 653ms/step - accuracy: 0.9100 - loss: 0.2323 - val_accuracy: 0.8655 - val_loss: 0.3321


In [99]:
# Persist the trained model to disk; the ".h5" suffix selects the HDF5 format.
# NOTE(review): recent Keras releases describe HDF5 as a legacy format and
# recommend the ".keras" extension — confirm what downstream loaders expect
# before renaming the file.
model.save("model.h5")



In [115]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh "Run All" exposes every dependency early.
import joblib
# Save the fitted tokenizer so inference can reuse the exact training
# vocabulary. joblib persistence is pickle-based, so only load this file
# from a trusted source.
joblib.dump(tokenizer,"tokenizer.pkl")

['tokenizer.pkl']

In [110]:
# Score the held-out test set without a progress bar; the result unpacks as
# the loss followed by the accuracy metric configured in compile().
loss, accuracy = model.evaluate(
    X_test,
    Y_test,
    verbose=0,
)

In [113]:
# ".5" with no type code means 5 significant digits, not 5 decimal places.
print(f"Test Loss: {loss:.5}")

Test Loss: 0.31967


In [114]:
# ".5" means 5 significant digits; trailing zeros are dropped, so fewer
# digits may be printed (e.g. 0.8698).
print(f"Test Accuracy: {accuracy:.5}")

Test Accuracy: 0.8698


# Building a Predictive System

In [136]:
def predictive_system(review, threshold=0.5):
    """Classify a single movie review as "positive" or "negative".

    Args:
        review: Raw review text as a single string.
        threshold: Probability strictly above which the review is labelled
            "positive". Defaults to 0.5, the previously hard-coded cut-off.

    Returns:
        A ``(sentiment, prediction)`` tuple: the label string and the raw
        ``model.predict`` output for the one-review batch (the probability
        is at ``prediction[0][0]``).

    Raises:
        TypeError: If ``review`` is not a string. A list passed here would
            otherwise be wrapped in another list and tokenized without
            error, yielding a meaningless prediction.
    """
    if not isinstance(review, str):
        raise TypeError(
            f"review must be a str, got {type(review).__name__}"
        )

    # Wrap in a list: the tokenizer works on a batch of texts.
    sequences = tokenizer.texts_to_sequences([review])
    # Pad/truncate to the same length (200) used for the training sequences.
    padded_sequences = pad_sequences(sequences, maxlen=200)
    prediction = model.predict(padded_sequences)
    sentiment = "positive" if prediction[0][0] > threshold else "negative"
    return sentiment, prediction

In [139]:
predictive_system("A masterpiece of cinema, I was completely captivated.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


('positive', array([[0.94071436]], dtype=float32))

In [140]:
predictive_system("This movie was absolutely terrible, a complete waste of time.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step


('negative', array([[0.00422534]], dtype=float32))

In [141]:
predictive_system("boring and too slow from 2nd half")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step


('negative', array([[0.00733973]], dtype=float32))

In [142]:
predictive_system("haha i enjoyed a lot from starting to ending and in some scenes the movie is slow but it was good")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step


('positive', array([[0.52467835]], dtype=float32))

In [143]:
predictive_system("2 stars for it")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step


('negative', array([[0.34066084]], dtype=float32))

In [144]:
predictive_system("3")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step


('negative', array([[0.04221069]], dtype=float32))