## Import Needed Packages

In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

## Load Dataset

In [2]:
# Colab-only step: mount Google Drive so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Location of the IMDB reviews CSV on the mounted Drive.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/IMDB Dataset.csv"
# Load the whole file into a DataFrame.
data = pd.read_csv(dataset_path)

In [6]:
# Preview the first rows to confirm the `review` and `sentiment` columns loaded.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# (rows, columns) of the loaded frame.
data.shape

(50000, 2)

In [8]:
# Preview the last rows as well, to check the file was read through to the end.
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [9]:
# Class-balance check: number of reviews per sentiment label.
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


## Label Encoding

### Map Sentiment Strings to Integers

In [10]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
# An explicit mapping plus an explicit int64 cast is used instead of
# DataFrame.replace(..., inplace=True), so the resulting dtype does not depend
# on pandas implicitly downcasting an object column.
# The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run on an
# already-encoded column. Any other value maps to NaN, and the int64 cast then
# raises instead of silently passing bad labels to the model.
sentiment_to_label = {"positive": 1, "negative": 0, 1: 1, 0: 0}
data["sentiment"] = data["sentiment"].map(sentiment_to_label).astype("int64")

In [11]:
# Confirm the sentiment column now holds 1/0 instead of strings.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Data Preprocessing

In [12]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Hold out 20% of the rows as a test set; random_state is fixed so the same
# split is produced on every run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [14]:
# Size of the training split.
train_data.shape

(40000, 2)

In [15]:
# Size of the held-out test split.
test_data.shape

(10000, 2)

In [16]:
# Build the word index from the training reviews only, so nothing from the
# held-out test reviews influences the vocabulary.
# num_words=5000 matches the Embedding layer's input_dim=5000 used by the model.
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data['review'])

In [17]:
# Turn each split's reviews into word-id sequences and force every sequence to
# exactly 100 ids. Both splits go through the same fitted tokenizer and the
# same padding settings; the training split is processed first.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split['review']), maxlen=100)
    for split in (train_data, test_data)
)

In [18]:
# Inspect the padded training matrix; short reviews show up with leading zeros.
X_train

array([[   7,    1,  269, ...,  205,  351, 3856],
       [ 154,   84,   22, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [ 294,    8,   11, ...,   70,   73, 2062]], dtype=int32)

In [19]:
# Inspect the padded test matrix.
X_test

array([[ 210,   25, 1176, ...,  995,  719,  155],
       [  23,    2, 1926, ...,  380,    7,    7],
       [ 148,    6,    1, ...,   50, 1088,   96],
       ...,
       [2171, 3773,    8, ...,  125,  200, 3241],
       [  30,    1, 2069, ..., 1066,    1, 2305],
       [1101,   46,    6, ...,    1,  332,   27]], dtype=int32)

In [20]:
# Targets for each split: the integer-encoded sentiment column, kept in the
# same row order as the corresponding feature matrix.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [21]:
# Inspect the training labels (index values are the original row ids from `data`).
Y_train

Unnamed: 0,sentiment
39087,0
30893,0
45278,1
16398,0
13653,0
...,...
11284,1
44732,1
38158,0
860,1


In [22]:
# Sentiment classifier: embedding -> single LSTM -> sigmoid probability.
#
# The Embedding layer's `input_length=200` argument has been removed. It
# contradicted the data (every sequence is padded/truncated to 100 ids) and the
# argument is deprecated in recent Keras releases. The model is instead built
# explicitly with the real sequence length so that `model.summary()` can report
# concrete output shapes and parameter counts.
model = Sequential(
    [
        # input_dim=5000 covers the ids produced by Tokenizer(num_words=5000).
        Embedding(input_dim=5000, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        # One sigmoid unit: output is a single score in [0, 1] per review.
        Dense(1, activation="sigmoid"),
    ]
)
# (batch, sequence_length): batch size left open, 100 ids per padded review.
model.build(input_shape=(None, 100))

In [23]:
# Print the layer-by-layer overview of the network.
model.summary()

In [24]:
# Binary-classification setup: binary cross-entropy loss pairs with the single
# sigmoid output unit; accuracy is tracked as the reported metric.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [25]:
# Train for 5 epochs in batches of 64; validation_split=0.2 sets aside part of
# the training data for per-epoch validation (this is separate from the test set).
# NOTE(review): in the recorded run val_loss is lowest at epoch 3 and rises
# afterwards, which looks like the start of overfitting — consider fewer epochs
# or early stopping.
model.fit(X_train, Y_train, batch_size=64, epochs=5, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 209ms/step - accuracy: 0.7255 - loss: 0.5228 - val_accuracy: 0.8357 - val_loss: 0.3723
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 207ms/step - accuracy: 0.8643 - loss: 0.3257 - val_accuracy: 0.8506 - val_loss: 0.3467
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 208ms/step - accuracy: 0.8894 - loss: 0.2708 - val_accuracy: 0.8534 - val_loss: 0.3335
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 213ms/step - accuracy: 0.9053 - loss: 0.2393 - val_accuracy: 0.8535 - val_loss: 0.3492
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 208ms/step - accuracy: 0.9178 - loss: 0.2072 - val_accuracy: 0.8440 - val_loss: 0.3799


<keras.src.callbacks.history.History at 0x7b780e7d71a0>

In [26]:
# Score the trained model on the held-out test split; the result unpacks into
# the loss followed by the accuracy metric configured in compile().
loss, accuracy = model.evaluate(X_test, Y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 61ms/step - accuracy: 0.8467 - loss: 0.3718


In [27]:
# Test-set loss.
print(loss)


0.366588294506073


In [28]:
# Test-set accuracy.
print(accuracy)

0.8485999703407288


## Building a Predictive System

In [30]:
def predictive_system(review, maxlen=100):
  """Classify a single raw review string as "positive" or "negative".

  The review is converted to word ids with the module-level `tokenizer`,
  padded/truncated with `pad_sequences`, and scored by the module-level `model`.

  Args:
    review: Raw review text.
    maxlen: Sequence length to pad/truncate to. Defaults to 100, the same
      length used to build X_train / X_test, so inference inputs are shaped
      like the data the model was trained on. (The value was previously
      hard-coded to 200, which did not match training.)

  Returns:
    "positive" if the predicted score is greater than 0.5, otherwise
    "negative".
  """
  # texts_to_sequences works on a collection of texts, hence the one-item list.
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=maxlen)
  # One row per input text; read the single score from the only row.
  prediction = model.predict(padded_sequence)
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [31]:
# Smoke test: an obviously positive review.
predictive_system("this movie was amazing")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step   


'positive'

In [32]:
# Smoke test: an obviously negative review.
predictive_system("this movie was horrible")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452ms/step


'negative'

In [33]:
# Smoke test: a longer positive review.
# NOTE(review): "trilling" is presumably a typo for "thrilling"; left unchanged
# here because editing the input text could change the prediction.
predictive_system("this movie was trilling with stunning visual")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step


'positive'

In [35]:
# Smoke test: a very short, capitalised review.
predictive_system("A visual masterpiece")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step


'positive'

In [36]:
# Persist the trained model to 'model.h5' in the current working directory.
# NOTE(review): recent Keras versions recommend the native `.keras` format over
# HDF5 — confirm what downstream loaders expect before renaming the file.
model.save('model.h5')



In [37]:
# Persist the fitted tokenizer next to the model, presumably so inference code
# can reproduce the same word-to-id mapping that was used for training.
# NOTE(review): joblib files are pickle-based; only load this file from a trusted source.
import joblib
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']