## End to end Deep Learning Project Using Simple RNN

In [3]:
# !pip install -r requirements.txt

In [12]:
import re
import pickle
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [13]:
# --------------------------
# Step 1: Load raw IMDB dataset
# --------------------------
train_data, test_data = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    as_supervised=True
)

train_texts = [text.numpy().decode("utf-8") for text, label in train_data]
train_labels = [label.numpy() for text, label in train_data]

test_texts = [text.numpy().decode("utf-8") for text, label in test_data]
test_labels = [label.numpy() for text, label in test_data]



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.DA0KFW_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.DA0KFW_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.DA0KFW_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [14]:
# --------------------------
# Step 2: Tokenizer
# --------------------------
max_features = 10000   # vocabulary size
max_len = 200          # shorter than 500 helps RNNs learn faster

tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Save tokenizer
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Convert to sequences + pad
x_train_seq = tokenizer.texts_to_sequences(train_texts)
x_test_seq = tokenizer.texts_to_sequences(test_texts)

x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding="post", truncating="post")
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding="post", truncating="post")

y_train = np.array(train_labels)
y_test = np.array(test_labels)


In [15]:
# --------------------------
# Step 3: Build the model
# --------------------------
model = Sequential([
    Embedding(max_features, 128, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])




In [16]:
# --------------------------
# Step 4: Train
# --------------------------
early_stopping = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

history = model.fit(
    x_train_pad, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping]
)

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 0.5112 - loss: 0.6934 - val_accuracy: 0.5228 - val_loss: 0.6919
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.5523 - loss: 0.6853 - val_accuracy: 0.5726 - val_loss: 0.6551
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.6129 - loss: 0.6290 - val_accuracy: 0.7656 - val_loss: 0.5538
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.7603 - loss: 0.5111 - val_accuracy: 0.6898 - val_loss: 0.6135
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.8094 - loss: 0.4617 - val_accuracy: 0.7150 - val_loss: 0.6563


In [17]:
# --------------------------
# Step 5: Helper functions
# --------------------------
def preprocess_text(text, max_len=200):
    # Clean text (optional)
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len, padding="post", truncating="post")
    return padded

def predict_sentiment(review):
    processed = preprocess_text(review)
    prediction = model.predict(processed)
    sentiment = "Positive" if prediction[0][0] > 0.5 else "Negative"
    return sentiment, prediction[0][0]

In [18]:
# --------------------------
# Step 6: Test predictions
# --------------------------
examples = [
    "The movie was amazing and full of joy!",
    "The movie was terrible and I hated it",
    "It was just okay, not too bad, not too good",
]

for example in examples:
    sentiment, prob = predict_sentiment(example)
    print(f"Review: {example}")
    print(f"Sentiment: {sentiment}, Probability: {prob:.4f}\n")
    print("=====================================================")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
Review: The movie was amazing and full of joy!
Sentiment: Positive, Probability: 0.7626

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Review: The movie was terrible and I hated it
Sentiment: Negative, Probability: 0.1809

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step
Review: It was just okay, not too bad, not too good
Sentiment: Negative, Probability: 0.1809



In [38]:
model.summary()

In [39]:
model.save("sentiment_LSTM_model.h5")

