<a href="https://colab.research.google.com/github/Sarahmahmoud16/NLP-Task/blob/main/Assignment_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import imdb

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.split()
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]

    return " ".join(text)

In [49]:
vocab_size = 10000
max_length = 200
embedding_dim = 128
batch_size = 64
epochs = 5

In [6]:
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}
def decode_review(encoded_review):
    return " ".join([reverse_word_index.get(i, "?") for i in encoded_review])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [39]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

In [40]:
x_train_texts = [preprocess_text(decode_review(review)) for review in x_train]
x_test_texts = [preprocess_text(decode_review(review)) for review in x_test]

In [41]:
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")  # Ensure unseen words get mapped
tokenizer.fit_on_texts(x_train_texts)
x_train_seq = tokenizer.texts_to_sequences(x_train_texts)
x_test_seq = tokenizer.texts_to_sequences(x_test_texts)

In [42]:
x_train_pad = pad_sequences(x_train_seq, maxlen=max_length, padding='post', truncating='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_length, padding='post', truncating='post')

In [43]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 128, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Helps prevent overfitting
    tf.keras.layers.Dense(1, activation='sigmoid')
])


In [47]:
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), metrics=['accuracy'])

In [50]:
model.fit(x_train_pad, y_train, epochs=epochs, batch_size=64, validation_data=(x_test_pad, y_test))

Epoch 1/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 60ms/step - accuracy: 0.9357 - loss: 0.1944 - val_accuracy: 0.8463 - val_loss: 0.3860
Epoch 2/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 57ms/step - accuracy: 0.9533 - loss: 0.1607 - val_accuracy: 0.8460 - val_loss: 0.3999
Epoch 3/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 58ms/step - accuracy: 0.9633 - loss: 0.1343 - val_accuracy: 0.8373 - val_loss: 0.4730
Epoch 4/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 67ms/step - accuracy: 0.9703 - loss: 0.1091 - val_accuracy: 0.8345 - val_loss: 0.5251
Epoch 5/5
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 57ms/step - accuracy: 0.9770 - loss: 0.0973 - val_accuracy: 0.8328 - val_loss: 0.5839


<keras.src.callbacks.history.History at 0x78439c6cfb90>

In [51]:
loss, accuracy = model.evaluate(x_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.8348 - loss: 0.5760
Test Accuracy: 0.8328


In [65]:
def predict_sentiment(text):
    # Preprocess the input text
    cleaned_text = preprocess_text(text)

    # Convert to sequence
    seq = tokenizer.texts_to_sequences([cleaned_text])

    # Pad the sequence
    pad_seq = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')

    # Make prediction
    prediction = model.predict(pad_seq)[0][0]

    # Interpret result
    sentiment = "Positive" if prediction > 0.5 else "Negative "
    print(f"Review: {text}\nSentiment: {sentiment} (Score: {prediction:.4f})")

# Test with some examples
predict_sentiment("I love this movie! It's fantastic.")
predict_sentiment("This movie is bad! I hate it.")
predict_sentiment("The movie is very beautiful! I will watch it again.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Review: I love this movie! It's fantastic.
Sentiment: Positive (Score: 0.6459)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Review: This movie is bad! I hate it.
Sentiment: Negative  (Score: 0.2078)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Review: The movie is very beautiful! I will watch it again.
Sentiment: Positive (Score: 0.5443)
