In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv("IMDB Dataset.csv")

In [3]:
data.shape, data["sentiment"].value_counts()

((50000, 2),
 sentiment
 positive    25000
 negative    25000
 Name: count, dtype: int64)

In [4]:
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [5]:
data.head(7)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1


In [6]:
train_data, test_data = train_test_split(data, test_size=0.3, random_state=64)

In [7]:
print(train_data.shape)
print(test_data.shape)

(35000, 2)
(15000, 2)


In [8]:
tokenizer = Tokenizer(num_words=5000)

In [9]:
tokenizer.fit_on_texts(train_data["review"])

In [10]:
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [11]:
print(X_train)

[[   0    0    0 ... 1182   29    1]
 [   0    0    0 ...  129  229 4100]
 [   0    0    0 ...   17    7    7]
 ...
 [   0    0    0 ...  176    2  164]
 [ 296   20  238 ... 1511    7    7]
 [   0    0    0 ...  163   11   53]]


In [12]:
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

In [13]:
print(Y_train)

25848    1
24888    0
10380    1
11359    0
48792    0
        ..
49206    0
36006    0
22647    0
21478    1
39364    0
Name: sentiment, Length: 35000, dtype: int64


In [14]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))



In [15]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [16]:
model.summary()

In [17]:
model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 98ms/step - accuracy: 0.7097 - loss: 0.5399 - val_accuracy: 0.8336 - val_loss: 0.3805
Epoch 2/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 96ms/step - accuracy: 0.8504 - loss: 0.3572 - val_accuracy: 0.8534 - val_loss: 0.3484
Epoch 3/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 97ms/step - accuracy: 0.8683 - loss: 0.3214 - val_accuracy: 0.8254 - val_loss: 0.3934
Epoch 4/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 98ms/step - accuracy: 0.8864 - loss: 0.2832 - val_accuracy: 0.8643 - val_loss: 0.3225
Epoch 5/5
[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 98ms/step - accuracy: 0.8956 - loss: 0.2609 - val_accuracy: 0.8797 - val_loss: 0.3095


<keras.src.callbacks.history.History at 0x25c6149b4d0>

In [18]:
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.8660 - loss: 0.3322
Test Loss: 0.3317755162715912
Test Accuracy: 0.8675333261489868


In [19]:
def predict_sentiment(review):
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  prediction = model.predict(padded_sequence)
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment, prediction[0][0]

In [20]:
# example
new_review = "This movie was fantastic. I loved it."
sentiment, percentage = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}. I am {(percentage)*100:.2f}% confident of the result!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
The sentiment of the review is: positive. I am 94.57% confident of the result!
