In [30]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, InputLayer

In [2]:
vocab_size = 10000  # Number of words to consider as features
maxlen = 300        # Cut texts after this number of words
embedding_dim = 128

In [3]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

In [4]:
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [5]:
model = Sequential([
    InputLayer(input_shape=(maxlen,)),  # Explicitly define the input shape
    Embedding(vocab_size, embedding_dim),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])




In [6]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [7]:
model.summary()

In [8]:
history = model.fit(x_train, y_train, epochs=5, batch_size=64, validation_data=(x_test, y_test))

Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 525ms/step - accuracy: 0.7149 - loss: 0.5328 - val_accuracy: 0.8702 - val_loss: 0.3129
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 521ms/step - accuracy: 0.8942 - loss: 0.2667 - val_accuracy: 0.8719 - val_loss: 0.3126
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 404ms/step - accuracy: 0.9344 - loss: 0.1821

KeyboardInterrupt: 

In [9]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 91ms/step - accuracy: 0.8573 - loss: 0.3366
Test Loss: 0.3341296911239624
Test Accuracy: 0.8575199842453003


In [11]:
def preprocess_input(text, tokenizer, maxlen):
    # Tokenize the text
    sequences = tokenizer.texts_to_sequences([text])
    # Pad the sequences
    padded_sequences = pad_sequences(sequences, maxlen=maxlen)
    return padded_sequences

In [29]:
word_index = imdb.get_word_index()
reverse_word_index = {v: k for k, v in word_index.items()}

# Instantiate a tokenizer and fit on the training data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts([reverse_word_index.get(i - 3, "?") for i in range(3, vocab_size + 3)])

# Custom input text
custom_text = "I did not enjoy the movie. It was boring and too long.The music is also bad"

# Preprocess the custom input
custom_input = preprocess_input(custom_text, tokenizer, maxlen)

# Predict the sentiment of the custom input
prediction = model.predict(custom_input)
print(f"Predicted Sentiment: {'Positive' if prediction[0][0] > 0.7 else 'Negative'}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Predicted Sentiment: Negative
Prediction Probability: 0.42216676473617554
