In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [3]:
# Preprocessing
# Load the IMDB dataset
max_words = 10000  # Vocabulary size
max_len = 500  # Maximum review length

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)

# Pad sequences to ensure uniform input size
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)


In [4]:
# define RNN
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=False),  # You can use GRU or simple RNN here
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])




In [5]:
# Train Model
history = model.fit(
    x_train, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2  # Use 20% of training data for validation
)


Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 129ms/step - accuracy: 0.7071 - loss: 0.5398 - val_accuracy: 0.8742 - val_loss: 0.3197
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 128ms/step - accuracy: 0.9053 - loss: 0.2498 - val_accuracy: 0.8656 - val_loss: 0.3220
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 128ms/step - accuracy: 0.9202 - loss: 0.2116 - val_accuracy: 0.8592 - val_loss: 0.3643
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 129ms/step - accuracy: 0.9407 - loss: 0.1606 - val_accuracy: 0.8716 - val_loss: 0.3500
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 132ms/step - accuracy: 0.9506 - loss: 0.1337 - val_accuracy: 0.8658 - val_loss: 0.3984


In [6]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc}")


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 17ms/step - accuracy: 0.8575 - loss: 0.4298
Test Accuracy: 0.8578799962997437


In [10]:
word_index = imdb.get_word_index()
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3

In [12]:
def predict_sentiment(model, review, word_index, max_len=500):
    """
    Predict the sentiment score of a given review using a trained model.
    
    Args:
        model: Trained sentiment analysis model.
        review: A string containing the review text.
        word_index: Dictionary mapping words to integer indices (IMDB word index).
        max_len: The maximum length for padding/truncating the review.
        
    Returns:
        score: Sentiment score (float) between 0 and 1.
    """
    # Tokenize the review
    def tokenize_review(review, word_index):
        tokens = []
        for word in review.lower().split():
            word = word.strip("!?.")
            tokens.append(word_index.get(word, word_index["<UNK>"]))
        return tokens

    # Tokenize and pad the review
    tokenized_review = tokenize_review(review, word_index)
    padded_review = pad_sequences([tokenized_review], maxlen=max_len, padding='post')
    
    # Predict the sentiment score
    score = model.predict(padded_review)[0][0]
    
    return score

In [14]:
sample_review = "This movie was fantastic! The acting was great and the story was compelling."
score = predict_sentiment(model, sample_review, word_index, max_len=500)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step


In [18]:
if score > 0.5:
    print("Positive", score)
else:
    print("Negative", score)

Positive 0.81742954
