## Talha Aslam
## 24i-8067
## MS(DS)-Section-A

In [1]:
!pip install gensim
!pip install python-Levenshtein

Collecting python-Levenshtein
  Using cached python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Using cached levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packa

# Importing Libraries

In [2]:
import gensim
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
from tqdm import tqdm
import urllib.request
import zipfile


# Setting parameters

In [3]:
# Parameters
SEED = 42         # single seed shared by Python, NumPy and TensorFlow RNGs
MAX_VOCAB = 20000 # Tokenizer `num_words` and upper bound on embedding-matrix rows
MAX_LEN = 200     # every review is padded/truncated to this many token ids
EMBED_DIM = 100   # keep 100 to match common GloVe dims
W2V_DIM = 100     # Word2Vec `vector_size`
BATCH_SIZE = 128  # mini-batch size passed to `fit`
EPOCHS = 5        # training epochs for the frozen-embedding models
LSTM_UNITS = 128  # hidden units per direction of the bidirectional LSTM

# Seed the RNGs so weight initialisation, dropout masks and batch shuffling
# repeat across "Restart & Run All".
# NOTE(review): GPU kernels may still be non-deterministic, and gensim's
# Word2Vec uses its own seed/worker threads - confirm if exact repeatability
# is required.
keras.utils.set_random_seed(SEED)


# Loading IMDB dataset

In [4]:
# Load the IMDB reviews through TensorFlow Datasets and materialize each split
# into a list of decoded review strings plus a parallel list of int labels.
import tensorflow_datasets as tfds


def _split_to_lists(split):
    """Return (texts, labels) Python lists for one supervised TFDS split."""
    texts, labels = [], []
    for raw_text, raw_label in tfds.as_numpy(split):
        texts.append(raw_text.decode('utf-8'))
        labels.append(int(raw_label))
    return texts, labels


ds_train, ds_test = tfds.load('imdb_reviews', split=['train','test'], as_supervised=True)
train_texts, train_labels = _split_to_lists(ds_train)
test_texts, test_labels = _split_to_lists(ds_test)
print("Train examples:", len(train_texts), "Test examples:", len(test_texts))




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.SH33MD_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.SH33MD_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.SH33MD_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
Train examples: 25000 Test examples: 25000


# Simple preprocessing + tokenization

In [5]:

import re
def clean_text(s):
    """Normalize one review to lowercase ASCII alphanumerics separated by single spaces.

    The steps run in order: lowercase, replace HTML line breaks (`<br>`,
    `<br/>`, `<br />`) with a space, replace every character that is not
    a-z, 0-9 or whitespace with a space, then collapse whitespace runs and
    trim both ends.

    Args:
        s: Raw review text.

    Returns:
        The cleaned string (possibly empty).
    """
    text = s.lower()
    # Order matters: the <br> pattern must run before punctuation is stripped,
    # otherwise the angle brackets are gone and "br" survives as a token.
    for pattern in (r"<br\s*/?>", r"[^a-z0-9\s]", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()

# Normalize both corpora with clean_text.
train_texts_clean = list(map(clean_text, train_texts))
test_texts_clean = list(map(clean_text, test_texts))

# The vocabulary is fitted on the training reviews only.
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts_clean)

# Text -> token-id sequences (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (train_texts_clean, test_texts_clean)
)

# Pad/truncate at the end so every row has exactly MAX_LEN ids.
X_train, X_test = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)
y_train, y_test = np.array(train_labels), np.array(test_labels)

word_index = tokenizer.word_index
print("Vocab size (word_index):", len(word_index))


Vocab size (word_index): 74482


# Train Word2Vec on the training corpus (gensim)

In [6]:

# Whitespace-tokenize the cleaned training reviews; Word2Vec only ever sees
# the training split.
sentences = list(map(str.split, train_texts_clean))
w2v_model = Word2Vec(
    sentences,
    vector_size=W2V_DIM,
    window=5,
    min_count=2,
    workers=4,
    epochs=5,
)
w2v_vocab = w2v_model.wv.key_to_index
print("Word2Vec vocab size:", len(w2v_vocab))


Word2Vec vocab size: 46938


# Prepare embedding matrix for Word2Vec

In [7]:

# Row i of the matrix holds the Word2Vec vector for tokenizer id i. Ids at or
# beyond num_words are skipped, and words missing from the Word2Vec model keep
# their all-zero row.
num_words = min(MAX_VOCAB, len(word_index) + 1)
embedding_matrix_w2v = np.zeros((num_words, W2V_DIM))
for token, row in word_index.items():
    if row < num_words and token in w2v_model.wv:
        embedding_matrix_w2v[row] = w2v_model.wv[token]


# Download and extract GloVe (100d) if not already present

In [8]:

GLOVE_ZIP = "glove.6B.zip"
GLOVE_URL = "https://nlp.stanford.edu/data/glove.6B.zip"
glove_path = "glove.6B.100d.txt"

# Guard on the file that is actually consumed downstream: an archive left over
# from an earlier run must still be unpacked if the .txt is missing.
if not os.path.exists(glove_path):
    if not os.path.exists(GLOVE_ZIP):
        print("Downloading GloVe...")
        # Download to a temporary name and rename on success, so an interrupted
        # transfer never leaves a truncated file under the final archive name.
        partial_zip = GLOVE_ZIP + ".part"
        urllib.request.urlretrieve(GLOVE_URL, partial_zip)
        os.replace(partial_zip, GLOVE_ZIP)
    with zipfile.ZipFile(GLOVE_ZIP, 'r') as z:
        # Only the 100-d vectors are used; skip the other files in the archive.
        z.extract(glove_path)
print("GloVe file exists:", os.path.exists(glove_path))


Downloading GloVe...
GloVe file exists: True


# Load GloVe embeddings into dictionary

In [9]:

# Parse the GloVe text file into {word: float32 vector}. Each line is
# whitespace-separated: the word first, then its vector components.
embeddings_index = {}
with open(glove_path, 'r', encoding='utf8') as glove_file:
    for row in tqdm(glove_file, desc="Loading GloVe"):
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
print("Loaded GloVe vectors:", len(embeddings_index))


Loading GloVe: 400000it [00:14, 26695.70it/s]

Loaded GloVe vectors: 400000





# Prepare embedding matrix for GloVe

In [10]:

# Row i of the matrix holds the GloVe vector for tokenizer id i. Ids at or
# beyond num_words are skipped, and words absent from GloVe keep their
# all-zero row.
EMBED_DIM = 100
num_words = min(MAX_VOCAB, len(word_index) + 1)
embedding_matrix_glove = np.zeros((num_words, EMBED_DIM))
for token, row in word_index.items():
    if row < num_words:
        glove_vec = embeddings_index.get(token)
        if glove_vec is not None:
            embedding_matrix_glove[row] = glove_vec


# Define function to build an LSTM model with a provided embedding matrix

In [11]:

def build_model(embedding_matrix, trainable=False, mask_zero=False):
    """Build and compile a bidirectional-LSTM binary classifier on pre-trained embeddings.

    Architecture: Input(MAX_LEN) -> Embedding -> Bidirectional(LSTM) ->
    Dropout(0.3) -> Dense(1, sigmoid), compiled with binary cross-entropy,
    Adam and an accuracy metric.

    Args:
        embedding_matrix: 2-D array of shape (vocab_size, embed_dim); row i is
            the initial embedding for token id i.
        trainable: If True the embedding weights are updated during training;
            if False (default) they stay frozen.
        mask_zero: If True, token id 0 is treated as padding and masked so the
            LSTM ignores padded timesteps. Defaults to False, which keeps the
            previous behaviour of feeding padding through the LSTM.

    Returns:
        The compiled `Sequential` model.
    """
    vocab_size, embed_dim = embedding_matrix.shape
    model = Sequential()
    # Declare the input explicitly instead of the deprecated `input_length`
    # Embedding argument; this also builds the model up front so that
    # `model.summary()` can report shapes and parameter counts before `fit`.
    model.add(keras.Input(shape=(MAX_LEN,), dtype="int32"))
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embed_dim,
                        weights=[embedding_matrix],
                        mask_zero=mask_zero,
                        trainable=trainable))
    # return_sequences=False: only the final state of each direction is kept.
    model.add(Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Train model using Word2Vec embeddings

In [None]:

# Frozen Word2Vec embeddings; the last 10% of the training arrays is held out
# for validation.
model_w2v = build_model(embedding_matrix_w2v, trainable=False)
model_w2v.summary()
history_w2v = model_w2v.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
)




Epoch 1/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 1s/step - accuracy: 0.6654 - loss: 0.5993 - val_accuracy: 0.8020 - val_loss: 0.4386
Epoch 2/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 1s/step - accuracy: 0.7986 - loss: 0.4509 - val_accuracy: 0.8408 - val_loss: 0.3876
Epoch 3/5
[1m 98/176[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m1:37[0m 1s/step - accuracy: 0.8394 - loss: 0.3831

# Evaluate Word2Vec model and report metrics

In [None]:

# Predict probabilities on the test set, then threshold at 0.5 and flatten to
# a 1-D vector of 0/1 labels for scikit-learn.
y_pred_prob = model_w2v.predict(X_test, batch_size=512)
y_pred = (y_pred_prob >= 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred, digits=4))


# Train model using GloVe embeddings

In [None]:

# Frozen GloVe embeddings; same training setup as the Word2Vec model so the
# two runs are comparable.
model_glove = build_model(embedding_matrix_glove, trainable=False)
model_glove.summary()
history_glove = model_glove.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
)


# Evaluate GloVe model and report metrics

In [None]:

# Predict probabilities on the test set, then threshold at 0.5 and flatten to
# a 1-D vector of 0/1 labels for scikit-learn.
y_pred_prob_g = model_glove.predict(X_test, batch_size=512)
y_pred_g = (y_pred_prob_g >= 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred_g, digits=4))


#  Quick comparison table

In [None]:

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def get_metrics(y_true, y_pred):
    """Compute the headline binary-classification scores for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with `y_true`.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall' and 'f1' (in that
        order), each mapped to the corresponding scikit-learn score.
    """
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

import pandas as pd

# One column per embedding type, one row per metric.
metrics_w2v = get_metrics(y_test, y_pred)
metrics_glove = get_metrics(y_test, y_pred_g)
df = pd.DataFrame(
    [metrics_w2v, metrics_glove],
    index=['Word2Vec','GloVe'],
).transpose()
df


# (Optional) Make embedding layer trainable and fine-tune for GloVe

In [None]:

# Same architecture, but the GloVe embedding weights are updated during
# training; trained for 2 epochs only.
model_glove_ft = build_model(embedding_matrix_glove, trainable=True)
history_glove_ft = model_glove_ft.fit(
    X_train,
    y_train,
    epochs=2,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
)

# Evaluate on the test set with the same 0.5 threshold as the frozen models.
y_pred_prob_gft = model_glove_ft.predict(X_test, batch_size=512)
y_pred_gft = (y_pred_prob_gft >= 0.5).astype(int).ravel()
print("Fine-tuned GloVe metrics:")
print(classification_report(y_test, y_pred_gft, digits=4))


# Save models and embedding matrices

In [None]:

# Persist the two frozen-embedding classifiers, then the embedding matrices
# they were initialised from. The fine-tuned GloVe model is not saved here.
for _path, _model in (("lstm_w2v.h5", model_w2v), ("lstm_glove.h5", model_glove)):
    _model.save(_path)
for _path, _matrix in (
    ("embedding_matrix_w2v.npy", embedding_matrix_w2v),
    ("embedding_matrix_glove.npy", embedding_matrix_glove),
):
    np.save(_path, _matrix)
print("Saved models and embeddings.")
