In [41]:
#So first of all we are going to import all the libraries
#numpy,dataset from keras, padding, sequential model, Lstm,dense
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [42]:
#Load DAtasets

vocab_size = 30000 #top 10k words only to reduce complexity
maxlen = 200 #max length of each review
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

#Now we will pad sequences so that all the reviews have the same reviews

X_train = pad_sequences(X_train, maxlen = maxlen )
x_test = pad_sequences(X_test, maxlen = maxlen )

# Let's Build our Lstm model

In [43]:
model = Sequential ([
    #word embedding
    Embedding(input_dim= vocab_size, output_dim =128, input_length = maxlen),
    #LSTM layer
    LSTM(128, return_sequences = False ),
    Dropout(0.5),
    #binary classification
    Dense(1,activation='sigmoid')
])

In [44]:
#Step 4: Compile Model

model.compile(loss= 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [45]:
#Step 5: Let's train our model

history = model.fit(X_train, y_train, epochs = 5, batch_size =64, validation_split=0.2)

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 541ms/step - accuracy: 0.7137 - loss: 0.5357 - val_accuracy: 0.8596 - val_loss: 0.3297
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m168s[0m 537ms/step - accuracy: 0.9161 - loss: 0.2275 - val_accuracy: 0.8618 - val_loss: 0.3693
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 534ms/step - accuracy: 0.9543 - loss: 0.1342 - val_accuracy: 0.8734 - val_loss: 0.3716
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 540ms/step - accuracy: 0.9391 - loss: 0.1626 - val_accuracy: 0.8662 - val_loss: 0.4005
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 536ms/step - accuracy: 0.9749 - loss: 0.0758 - val_accuracy: 0.8676 - val_loss: 0.5117


In [46]:
#Step 6 : Now it's time to evaluate on test data

loss, acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {acc*100:.2f}%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 105ms/step - accuracy: 0.8443 - loss: 0.5916
Test Accuracy: 84.43%


In [47]:
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

#Example review(covert words -> integers)

def encode_review(text):
  tokens = [word_index.get(word, 2) for word in text.lower().split()]
  return pad_sequences ([tokens], maxlen= maxlen)

sample_review = "This movie was bad  "
encoded = encode_review(sample_review)
prediction = model.predict(encoded)
print("Positive" if prediction [0][0]> 0.5 else "Negative")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step
Positive


In [48]:
from tensorflow.keras.datasets import imdb

word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

def encode_review(text):
    tokens = [word_index.get(word, 2) for word in text.lower().split()]  # 2 = <UNK>
    return pad_sequences([tokens], maxlen=200)

review = "This movie was very good"
encoded = encode_review(review)
prediction = model.predict(encoded)
print("Positive" if prediction[0][0] > 0.5 else "Negative")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Positive


In [49]:
preds = model.predict(x_test[:20])
print(preds)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step
[[4.5153070e-02]
 [9.9912280e-01]
 [7.1210468e-01]
 [5.8373413e-03]
 [9.9988085e-01]
 [8.4859401e-01]
 [9.9799532e-01]
 [1.2022696e-03]
 [9.9416435e-01]
 [9.9957258e-01]
 [9.9607146e-01]
 [1.4120619e-03]
 [4.3119340e-05]
 [8.5139228e-04]
 [9.9816543e-01]
 [3.1683318e-04]
 [9.9971741e-01]
 [4.1807536e-02]
 [1.8256143e-05]
 [2.6391740e-03]]


In [50]:
word_index = imdb.get_word_index()
print("bad" in word_index)   # True/False
print("good" in word_index)
print("boring" in word_index)


True
True
True


In [51]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000, oov_token="<UNK>")
tokenizer.fit_on_texts(["This movie was bad", "I love this movie"])  # or full dataset

seq = tokenizer.texts_to_sequences(["This movie was bad"])
padded = pad_sequences(seq, maxlen=200)
pred = model.predict(padded)
print(pred)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[[0.4513538]]
