In [1]:
!pip install gensim
!pip install python-Levenshtein

Collecting python-Levenshtein
  Using cached python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Using cached levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Using cached rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Using cached python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Using cached levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
Using cached rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.27.1 python-Levenshtein-0.27.1 rapidfuzz-3.14.1


In [2]:
import gensim
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
from tqdm import tqdm
import urllib.request
import zipfile


In [3]:
# Parameters
MAX_VOCAB = 20000
MAX_LEN = 200
EMBED_DIM = 100   # keep 100 to match common GloVe dims
W2V_DIM = 100
BATCH_SIZE = 128
EPOCHS = 5
LSTM_UNITS = 128


In [6]:
# Load IMDB dataset (Keras) - returns (train_data, train_labels), (test_data, test_labels)
import tensorflow_datasets as tfds
ds_train, ds_test = tfds.load('imdb_reviews', split=['train','test'], as_supervised=True)
train_texts = []
train_labels = []
for text, label in tfds.as_numpy(ds_train):
    train_texts.append(text.decode('utf-8'))
    train_labels.append(int(label))
test_texts = []
test_labels = []
for text, label in tfds.as_numpy(ds_test):
    test_texts.append(text.decode('utf-8'))
    test_labels.append(int(label))
print("Train examples:", len(train_texts), "Test examples:", len(test_texts))


Train examples: 25000 Test examples: 25000


In [8]:
# Simple preprocessing + tokenization
import re
def clean_text(s):
    s = s.lower()
    s = re.sub(r"<br\s*/?>", " ", s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

train_texts_clean = [clean_text(t) for t in train_texts]
test_texts_clean = [clean_text(t) for t in test_texts]

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts_clean)
X_train_seq = tokenizer.texts_to_sequences(train_texts_clean)
X_test_seq = tokenizer.texts_to_sequences(test_texts_clean)
X_train = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')
y_train = np.array(train_labels)
y_test = np.array(test_labels)

word_index = tokenizer.word_index
print("Vocab size (word_index):", len(word_index))


Vocab size (word_index): 74482


In [22]:
# Train FastText on the training corpus (gensim)
from gensim.models import FastText

sentences = [t.split() for t in train_texts_clean]
ft_model = FastText(sentences, vector_size=W2V_DIM, window=5, min_count=2, workers=4, epochs=5)
ft_vocab = ft_model.wv.key_to_index
print("FastText vocab size:", len(ft_vocab))

FastText vocab size: 46938


In [23]:
# Prepare embedding matrix for FastText
num_words = min(MAX_VOCAB, len(word_index)+1)
embedding_matrix_ft = np.zeros((num_words, W2V_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in ft_model.wv:
        embedding_matrix_ft[i] = ft_model.wv[word]

In [11]:
# Download GloVe (100d) and load into memory (if not exists)
GLOVE_ZIP = "glove.6B.zip"
GLOVE_URL = "http://nlp.stanford.edu/data/glove.6B.zip"
if not os.path.exists(GLOVE_ZIP):
    print("Downloading GloVe...")
    urllib.request.urlretrieve(GLOVE_URL, GLOVE_ZIP)
    with zipfile.ZipFile(GLOVE_ZIP, 'r') as z:
        z.extractall()
glove_path = "glove.6B.100d.txt"
print("GloVe file exists:", os.path.exists(glove_path))


Downloading GloVe...
GloVe file exists: True


In [12]:
# Load GloVe embeddings into dict
embeddings_index = {}
with open(glove_path, 'r', encoding='utf8') as f:
    for line in tqdm(f, desc="Loading GloVe"):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print("Loaded GloVe vectors:", len(embeddings_index))


Loading GloVe: 400000it [00:08, 44681.03it/s]

Loaded GloVe vectors: 400000





In [13]:
# Prepare embedding matrix for GloVe
EMBED_DIM = 100
num_words = min(MAX_VOCAB, len(word_index)+1)
embedding_matrix_glove = np.zeros((num_words, EMBED_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix_glove[i] = vec


In [14]:
# Define function to build an LSTM model with a provided embedding matrix
def build_model(embedding_matrix, trainable=False):
    vocab_size, embed_dim = embedding_matrix.shape
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embed_dim,
                        weights=[embedding_matrix],
                        input_length=MAX_LEN,
                        trainable=trainable))
    model.add(Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [24]:
# Train model using FastText embeddings
model_ft = build_model(embedding_matrix_ft, trainable=False)
model_ft.summary()
history_ft = model_ft.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)



Epoch 1/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.6045 - loss: 0.6555 - val_accuracy: 0.7512 - val_loss: 0.5118
Epoch 2/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.7779 - loss: 0.4760 - val_accuracy: 0.7780 - val_loss: 0.4712
Epoch 3/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step - accuracy: 0.8122 - loss: 0.4186 - val_accuracy: 0.8152 - val_loss: 0.4303
Epoch 4/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.8327 - loss: 0.3854 - val_accuracy: 0.8264 - val_loss: 0.4012
Epoch 5/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.8555 - loss: 0.3410 - val_accuracy: 0.8304 - val_loss: 0.3857


In [25]:
# Evaluate FastText model and report metrics
y_pred_prob_ft = model_ft.predict(X_test, batch_size=512)
y_pred_ft = (y_pred_prob_ft >= 0.5).astype(int).reshape(-1)
print(classification_report(y_test, y_pred_ft, digits=4))

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step
              precision    recall  f1-score   support

           0     0.8646    0.7759    0.8179     12500
           1     0.7968    0.8785    0.8356     12500

    accuracy                         0.8272     25000
   macro avg     0.8307    0.8272    0.8267     25000
weighted avg     0.8307    0.8272    0.8267     25000



In [17]:
# Train model using GloVe embeddings
model_glove = build_model(embedding_matrix_glove, trainable=False)
model_glove.summary()
history_glove = model_glove.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)




Epoch 1/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 0.5680 - loss: 0.6777 - val_accuracy: 0.5808 - val_loss: 0.7595
Epoch 2/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.6472 - loss: 0.6298 - val_accuracy: 0.7832 - val_loss: 0.4628
Epoch 3/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.7785 - loss: 0.4726 - val_accuracy: 0.8120 - val_loss: 0.4155
Epoch 4/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.8071 - loss: 0.4286 - val_accuracy: 0.8140 - val_loss: 0.4192
Epoch 5/5
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.8266 - loss: 0.3896 - val_accuracy: 0.8288 - val_loss: 0.3823


In [18]:
# Evaluate GloVe model and report metrics
y_pred_prob_g = model_glove.predict(X_test, batch_size=512)
y_pred_g = (y_pred_prob_g >= 0.5).astype(int).reshape(-1)
print(classification_report(y_test, y_pred_g, digits=4))


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step
              precision    recall  f1-score   support

           0     0.7892    0.8848    0.8343     12500
           1     0.8689    0.7637    0.8129     12500

    accuracy                         0.8242     25000
   macro avg     0.8291    0.8242    0.8236     25000
weighted avg     0.8291    0.8242    0.8236     25000



In [26]:
#  Quick comparison table
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def get_metrics(y_true, y_pred):
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred)
    }

metrics_ft = get_metrics(y_test, y_pred_ft)
metrics_glove = get_metrics(y_test, y_pred_g) # Assuming y_pred_g from previous GloVe evaluation is still available
import pandas as pd
df = pd.DataFrame([metrics_ft, metrics_glove], index=['FastText','GloVe']).T
df

Unnamed: 0,FastText,GloVe
accuracy,0.8272,0.82424
precision,0.796764,0.868924
recall,0.87848,0.76368
f1,0.835629,0.81291


In [20]:
# (Optional) Make embedding layer trainable and fine-tune for GloVe
model_glove_ft = build_model(embedding_matrix_glove, trainable=True)
history_glove_ft = model_glove_ft.fit(X_train, y_train, epochs=2, batch_size=BATCH_SIZE, validation_split=0.1)
y_pred_prob_gft = model_glove_ft.predict(X_test, batch_size=512)
y_pred_gft = (y_pred_prob_gft >= 0.5).astype(int).reshape(-1)
print("Fine-tuned GloVe metrics:")
print(classification_report(y_test, y_pred_gft, digits=4))


Epoch 1/2




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 35ms/step - accuracy: 0.5878 - loss: 0.6689 - val_accuracy: 0.7132 - val_loss: 0.5748
Epoch 2/2
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.6932 - loss: 0.5926 - val_accuracy: 0.7456 - val_loss: 0.5546
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step
Fine-tuned GloVe metrics:
              precision    recall  f1-score   support

           0     0.6934    0.7748    0.7319     12500
           1     0.7449    0.6574    0.6984     12500

    accuracy                         0.7161     25000
   macro avg     0.7191    0.7161    0.7151     25000
weighted avg     0.7191    0.7161    0.7151     25000



In [27]:
# Save models and embedding matrices
model_ft.save("lstm_ft.h5")
model_glove.save("lstm_glove.h5")
np.save("embedding_matrix_ft.npy", embedding_matrix_ft)
np.save("embedding_matrix_glove.npy", embedding_matrix_glove)
print("Saved models and embeddings.")



Saved models and embeddings.
