In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip "/content/drive/MyDrive/Colab Notebooks/lab05/Lab 5.zip" -d "content/"

Archive:  /content/drive/MyDrive/Colab Notebooks/lab05/Lab 5.zip
   creating: content/Lab 5/
  inflating: content/Lab 5/GOOG.csv  
  inflating: content/Lab 5/IMDB Dataset.csv  
  inflating: content/Lab 5/Q1.ipynb  
  inflating: content/Lab 5/Q2.ipynb  
  inflating: content/Lab 5/Q3.ipynb  


In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import re

In [3]:
# 1. Load and Preprocess the Dataset
def load_data(file_path):
    # Load the dataset (e.g., IMDB movie reviews dataset)
    df = pd.read_csv(file_path, engine='python', on_bad_lines='skip')  # Using 'python' engine and skipping bad lines
    df.dropna(inplace=True)  # Drop any rows with missing values
    return df['review'], df['sentiment']  # Assuming 'review' and 'sentiment' columns

In [5]:
# Clean the text
def clean_text(text):
    # Remove unwanted characters, numbers, and symbols
    text = re.sub(r"[^A-Za-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [6]:
# Tokenize and Pad Sequences
def preprocess_text(reviews, max_words=5000, max_len=200):
    reviews = [clean_text(review) for review in reviews]  # Clean the reviews
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(reviews)
    sequences = tokenizer.texts_to_sequences(reviews)
    padded_sequences = pad_sequences(sequences, maxlen=max_len)
    return padded_sequences, tokenizer



In [7]:
# Encode Sentiments
def encode_labels(sentiments):
    sentiments = sentiments.map({'positive': 1, 'negative': 0}).values
    return sentiments

In [8]:
import os
print(os.listdir("content/Lab 5"))
file_path = 'content/Lab 5/IMDB Dataset.csv'
reviews, sentiments = load_data(file_path)


['IMDB Dataset.csv', 'Q2.ipynb', 'GOOG.csv', 'Q3.ipynb', 'Q1.ipynb']


In [9]:
# Preprocess Text Data
max_words = 5000  # Consider the top 5000 words
max_len = 200  # Pad or truncate reviews to 200 words
X, tokenizer = preprocess_text(reviews, max_words=max_words, max_len=max_len)

In [10]:
# Encode Sentiments (positive -> 1, negative -> 0)
y = encode_labels(sentiments)

# Split into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# 2. Define the LSTM Model
model = Sequential()

# Modify the embedding dimensions and experiment with LSTM configurations ---
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))  # <-- Modify 'output_dim'
model.add(Bidirectional(LSTM(units=64, return_sequences=False)))  # <-- Experiment with 'units' and add Dropout if necessary

model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Train the Model
#  Modify 'epochs' and 'batch_size' to see how they impact training time and model accuracy ---
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)  # <-- Experiment with 'epochs' and 'batch_size'


Epoch 1/10




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 251ms/step - accuracy: 0.7462 - loss: 0.4907 - val_accuracy: 0.8733 - val_loss: 0.3000
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 261ms/step - accuracy: 0.8871 - loss: 0.2770 - val_accuracy: 0.8796 - val_loss: 0.2975
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 256ms/step - accuracy: 0.9137 - loss: 0.2191 - val_accuracy: 0.8743 - val_loss: 0.3099
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 264ms/step - accuracy: 0.9313 - loss: 0.1783 - val_accuracy: 0.8860 - val_loss: 0.3139
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 257ms/step - accuracy: 0.9473 - loss: 0.1422 - val_accuracy: 0.8862 - val_loss: 0.3129
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 256ms/step - accuracy: 0.9570 - loss: 0.1201 - val_accuracy: 0.8741 - val_loss: 0.3715
Epo

<keras.src.callbacks.history.History at 0x7e5ad201cce0>

In [13]:
# 4. Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype("int32")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 66ms/step


In [14]:
# Calculate Accuracy and F1-Score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'F1-Score: {f1:.4f}')


#  Analyze the accuracy and F1-score. Consider modifying the model architecture or hyperparameters to improve performance ---

Accuracy: 0.8758
F1-Score: 0.8781


In [17]:
# ===============================
# Unidirectional LSTM Model
# ===============================
model_uni = Sequential()
model_uni.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_uni.add(LSTM(units=64, return_sequences=False))  # Unidirectional
model_uni.add(Dense(1, activation='sigmoid'))

model_uni.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the Unidirectional model
history_uni = model_uni.fit(
    X_train, y_train,
    epochs=10, batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1
)

# Evaluate Unidirectional model
y_pred_uni = (model_uni.predict(X_test) > 0.5).astype("int32")

acc_uni = accuracy_score(y_test, y_pred_uni)
f1_uni = f1_score(y_test, y_pred_uni)

print(f'Unidirectional LSTM Accuracy: {acc_uni:.4f}')
print(f'Unidirectional LSTM F1-Score: {f1_uni:.4f}')




Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 150ms/step - accuracy: 0.7579 - loss: 0.4916 - val_accuracy: 0.8716 - val_loss: 0.3113
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 152ms/step - accuracy: 0.8878 - loss: 0.2808 - val_accuracy: 0.8769 - val_loss: 0.2849
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 148ms/step - accuracy: 0.9111 - loss: 0.2292 - val_accuracy: 0.8689 - val_loss: 0.3181
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 146ms/step - accuracy: 0.9285 - loss: 0.1852 - val_accuracy: 0.8857 - val_loss: 0.3001
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 152ms/step - accuracy: 0.9452 - loss: 0.1448 - val_accuracy: 0.8750 - val_loss: 0.3291
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 149ms/step - accuracy: 0.9546 - loss: 0.1224 - val_accuracy: 0.8801 - val_loss:

## Comparison: Bidirectional vs Unidirectional LSTM

### Results
- **Bidirectional LSTM**
  - Accuracy: 0.8758
  - F1-score: 0.8781

- **Unidirectional LSTM**
  - Accuracy: 0.8692
  - F1-score: 0.8701

### Analysis
- The **Bidirectional LSTM** performed slightly better in both accuracy and F1-score.
- This improvement happens because it processes the text **from both directions (past + future context)**, which helps capture important sentiment cues (e.g., negations like *"not good"*).
- The **Unidirectional LSTM** only reads text in the forward direction, so it sometimes misses context when important words appear later.
- However, the performance gap is **small**, and the unidirectional model trains **faster** and is **less resource-intensive**.
- For tasks where **efficiency is more important than slight performance gains**, the unidirectional model is still a strong option.
