<a href="https://colab.research.google.com/github/Mrbilalali/sentiment-analyzer/blob/main/GRU_Sentiment_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import re
from textblob import TextBlob
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense


# **Drive Mount**

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# **Load the full dataset**

In [10]:
df_full = pd.read_csv('/content/drive/MyDrive/IMPORTANT PORTFOLIO/Artificial Intelligence/LAB/IMDB Dataset.csv')

# Take a random subset of 5000 reviews
df = df_full.sample(n=50000, random_state=42).reset_index(drop=True)
df = df[['review', 'sentiment']]  # only relevant columns
print(f"Dataset subset size: {len(df)} reviews")
print(df.head(2))

Dataset subset size: 50000 reviews
                                              review sentiment
0  I really liked this Summerslam due to the look...  positive
1  Not many television shows appeal to quite as m...  positive


# **Cleaning Data**

In [11]:
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)
    # Lowercase
    text = text.lower()
    # Keep only letters and numbers (replace others with space)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


df['cleaned'] = df['review'].apply(clean_text)

# Show before/after
orig = df.loc[0, 'review']
cleaned = df.loc[0, 'cleaned']
print("Original:", orig[:60], "...")
print("Cleaned :", cleaned[:60], "...")

Original: I really liked this Summerslam due to the look of the arena, ...
Cleaned : i really liked this summerslam due to the look of the arena  ...


# **Data tokenizer and Pad sequences to fixed length**

In [14]:
max_vocab = 50000  # only keep the top 50k most frequent words
tokenizer = Tokenizer(num_words=max_vocab, oov_token='<UNK>')
tokenizer.fit_on_texts(df['cleaned'])

# Convert texts to integer sequences
sequences = tokenizer.texts_to_sequences(df['cleaned'])

# Pad sequences to fixed length (100 words per review)
max_len = 100
X = pad_sequences(sequences, maxlen=max_len, padding='post')
print(f"Padded sequences shape: {X.shape}")



Padded sequences shape: (50000, 100)


# **Compute polarity and assign categories**

In [15]:
def categorize_sentiment(text, thresh=0.05):
    score = TextBlob(text).sentiment.polarity
    if score > thresh:
        return 'positive'
    elif score < -thresh:
        return 'negative'
    else:
        return 'neutral'

df['sentiment3'] = df['cleaned'].apply(lambda x: categorize_sentiment(x))
print(df['sentiment3'].value_counts())

sentiment3
positive    31966
neutral     10314
negative     7720
Name: count, dtype: int64


# **Encode classes to 0/1/2**

In [16]:
le = LabelEncoder()
y_int = le.fit_transform(df['sentiment3'])  # e.g. {'negative':0,'neutral':1,'positive':2}
num_classes = len(le.classes_)
Y = to_categorical(y_int, num_classes=num_classes)
print("Class mapping:", dict(zip(le.classes_, le.transform(le.classes_))))

Class mapping: {'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}


# **Data sequential**

In [29]:
vocab_size = max_vocab
embed_dim = 100
gru_units = 64

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_len),
    GRU(units=gru_units, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [30]:
history = model.fit(X, Y, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 179ms/step - accuracy: 0.6509 - loss: 0.8373 - val_accuracy: 0.7419 - val_loss: 0.5760
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 179ms/step - accuracy: 0.7829 - loss: 0.5047 - val_accuracy: 0.7459 - val_loss: 0.5706
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 180ms/step - accuracy: 0.8572 - loss: 0.3525 - val_accuracy: 0.7299 - val_loss: 0.6439
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 181ms/step - accuracy: 0.9151 - loss: 0.2269 - val_accuracy: 0.7135 - val_loss: 0.8220
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 178ms/step - accuracy: 0.9522 - loss: 0.1381 - val_accuracy: 0.7170 - val_loss: 1.0013


# **Example texts for prediction**

In [31]:
sample_texts = [
    "I absolutely loved this movie, it was fantastic!",   # likely Positive
    "The movie was okay, not great but not terrible.",    # maybe Neutral
    "I hated the film; it was the worst I've seen.",     # likely Negative
    "An utterly boring and dull movie that I disliked."  # likely Negative
]

# Preprocess texts (same cleaning and tokenization as training data)
sample_clean = [clean_text(t) for t in sample_texts]
seqs = tokenizer.texts_to_sequences(sample_clean)
pads = pad_sequences(seqs, maxlen=max_len, padding='post')

# Predict
pred_probs = model.predict(pads)
pred_indices = pred_probs.argmax(axis=1)
pred_classes = [le.classes_[i] for i in pred_indices]

# Print results
for text, pred in zip(sample_texts, pred_classes):
    print(f"Review: {text}\nPredicted sentiment: {pred}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 587ms/step
Review: I absolutely loved this movie, it was fantastic!
Predicted sentiment: positive

Review: The movie was okay, not great but not terrible.
Predicted sentiment: positive

Review: I hated the film; it was the worst I've seen.
Predicted sentiment: negative

Review: An utterly boring and dull movie that I disliked.
Predicted sentiment: negative



 # **saves architecture, weights, and optimizer state**

In [32]:
from tensorflow.keras.models import load_model
# ... after training your GRU model ...
model.save('gru_model.h5')




# **Assume tokenizer is your fitted Keras Tokenizer object**

In [33]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
