In [29]:
!pip install gensim
!pip install python-Levenshtein



In [30]:
import gensim
import pandas as pd
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from sklearn.metrics import classification_report, confusion_matrix
from gensim.models import Word2Vec
from tqdm import tqdm
import urllib.request
import zipfile


In [31]:
# Parameters
MAX_VOCAB = 20000
MAX_LEN = 200
EMBED_DIM = 100   # keep 100 to match common GloVe dims
W2V_DIM = 100
BATCH_SIZE = 64  # Changed batch size
EPOCHS = 10 # Changed number of epochs
LSTM_UNITS = 128

In [32]:
# Load IMDB dataset (Keras) - returns (train_data, train_labels), (test_data, test_labels)
import tensorflow_datasets as tfds
ds_train, ds_test = tfds.load('imdb_reviews', split=['train','test'], as_supervised=True)
train_texts = []
train_labels = []
for text, label in tfds.as_numpy(ds_train):
    train_texts.append(text.decode('utf-8'))
    train_labels.append(int(label))
test_texts = []
test_labels = []
for text, label in tfds.as_numpy(ds_test):
    test_texts.append(text.decode('utf-8'))
    test_labels.append(int(label))
print("Train examples:", len(train_texts), "Test examples:", len(test_texts))


Train examples: 25000 Test examples: 25000


In [33]:
# Simple preprocessing + tokenization
import re
def clean_text(s):
    s = s.lower()
    s = re.sub(r"<br\s*/?>", " ", s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

train_texts_clean = [clean_text(t) for t in train_texts]
test_texts_clean = [clean_text(t) for t in test_texts]

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts_clean)
X_train_seq = tokenizer.texts_to_sequences(train_texts_clean)
X_test_seq = tokenizer.texts_to_sequences(test_texts_clean)
X_train = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')
y_train = np.array(train_labels)
y_test = np.array(test_labels)

word_index = tokenizer.word_index
print("Vocab size (word_index):", len(word_index))


Vocab size (word_index): 74482


In [34]:
# Train FastText on the training corpus (gensim)
from gensim.models import FastText

sentences = [t.split() for t in train_texts_clean]
ft_model = FastText(sentences, vector_size=W2V_DIM, window=5, min_count=2, workers=4, epochs=5)
ft_vocab = ft_model.wv.key_to_index
print("FastText vocab size:", len(ft_vocab))

FastText vocab size: 46938


In [35]:
# Prepare embedding matrix for FastText
num_words = min(MAX_VOCAB, len(word_index)+1)
embedding_matrix_ft = np.zeros((num_words, W2V_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in ft_model.wv:
        embedding_matrix_ft[i] = ft_model.wv[word]

In [36]:
# Download GloVe (100d) and load into memory (if not exists)
GLOVE_ZIP = "glove.6B.zip"
GLOVE_URL = "http://nlp.stanford.edu/data/glove.6B.zip"
if not os.path.exists(GLOVE_ZIP):
    print("Downloading GloVe...")
    urllib.request.urlretrieve(GLOVE_URL, GLOVE_ZIP)
    with zipfile.ZipFile(GLOVE_ZIP, 'r') as z:
        z.extractall()
glove_path = "glove.6B.100d.txt"
print("GloVe file exists:", os.path.exists(glove_path))


GloVe file exists: True


In [37]:
# Load GloVe embeddings into dict
embeddings_index = {}
with open(glove_path, 'r', encoding='utf8') as f:
    for line in tqdm(f, desc="Loading GloVe"):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print("Loaded GloVe vectors:", len(embeddings_index))


Loading GloVe: 400000it [00:10, 38549.39it/s]

Loaded GloVe vectors: 400000





In [38]:
# Prepare embedding matrix for GloVe
EMBED_DIM = 100
num_words = min(MAX_VOCAB, len(word_index)+1)
embedding_matrix_glove = np.zeros((num_words, EMBED_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix_glove[i] = vec


In [39]:
# Define function to build an LSTM model with a provided embedding matrix
def build_model(embedding_matrix, trainable=False):
    vocab_size, embed_dim = embedding_matrix.shape
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=embed_dim,
                        weights=[embedding_matrix],
                        input_length=MAX_LEN,
                        trainable=trainable))
    model.add(Bidirectional(LSTM(LSTM_UNITS, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [40]:
# Train model using FastText embeddings
model_ft = build_model(embedding_matrix_ft, trainable=False)
model_ft.summary()
history_ft = model_ft.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)



Epoch 1/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step - accuracy: 0.6231 - loss: 0.6416 - val_accuracy: 0.7956 - val_loss: 0.4551
Epoch 2/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.7801 - loss: 0.4801 - val_accuracy: 0.8136 - val_loss: 0.4305
Epoch 3/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.8083 - loss: 0.4306 - val_accuracy: 0.8232 - val_loss: 0.4177
Epoch 4/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.8397 - loss: 0.3733 - val_accuracy: 0.8384 - val_loss: 0.3879
Epoch 5/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.8517 - loss: 0.3459 - val_accuracy: 0.8340 - val_loss: 0.3842
Epoch 6/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 28ms/step - accuracy: 0.8706 - loss: 0.3087 - val_accuracy: 0.8464 - val_loss: 0.3760
Epoch 7/10
[1m352/3

In [41]:
# Evaluate FastText model and report metrics
y_pred_prob_ft = model_ft.predict(X_test, batch_size=512)
y_pred_ft = (y_pred_prob_ft >= 0.5).astype(int).reshape(-1)
print(classification_report(y_test, y_pred_ft, digits=4))

[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step
              precision    recall  f1-score   support

           0     0.8639    0.8282    0.8456     12500
           1     0.8350    0.8695    0.8519     12500

    accuracy                         0.8488     25000
   macro avg     0.8494    0.8488    0.8488     25000
weighted avg     0.8494    0.8488    0.8488     25000



In [42]:
# Train model using GloVe embeddings
model_glove = build_model(embedding_matrix_glove, trainable=False)
model_glove.summary()
history_glove = model_glove.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)




Epoch 1/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - accuracy: 0.5738 - loss: 0.6770 - val_accuracy: 0.5980 - val_loss: 0.6723
Epoch 2/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.6649 - loss: 0.6111 - val_accuracy: 0.7400 - val_loss: 0.5046
Epoch 3/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.7697 - loss: 0.4958 - val_accuracy: 0.8052 - val_loss: 0.4318
Epoch 4/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.8191 - loss: 0.4048 - val_accuracy: 0.8368 - val_loss: 0.3709
Epoch 5/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 21ms/step - accuracy: 0.8391 - loss: 0.3682 - val_accuracy: 0.8408 - val_loss: 0.3591
Epoch 6/10
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 23ms/step - accuracy: 0.8507 - loss: 0.3445 - val_accuracy: 0.8348 - val_loss: 0.3658
Epoch 7/10
[1m352/35

In [43]:
# Evaluate GloVe model and report metrics
y_pred_prob_g = model_glove.predict(X_test, batch_size=512)
y_pred_g = (y_pred_prob_g >= 0.5).astype(int).reshape(-1)
print(classification_report(y_test, y_pred_g, digits=4))


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step
              precision    recall  f1-score   support

           0     0.8748    0.8120    0.8423     12500
           1     0.8246    0.8838    0.8532     12500

    accuracy                         0.8479     25000
   macro avg     0.8497    0.8479    0.8477     25000
weighted avg     0.8497    0.8479    0.8477     25000



In [44]:
#  Quick comparison table
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def get_metrics(y_true, y_pred):
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred)
    }

metrics_ft = get_metrics(y_test, y_pred_ft)
metrics_glove = get_metrics(y_test, y_pred_g) # Assuming y_pred_g from previous GloVe evaluation is still available
import pandas as pd
df = pd.DataFrame([metrics_ft, metrics_glove], index=['FastText','GloVe']).T
df

Unnamed: 0,FastText,GloVe
accuracy,0.84884,0.84792
precision,0.834985,0.824601
recall,0.86952,0.88384
f1,0.851903,0.853193


In [45]:
# (Optional) Make embedding layer trainable and fine-tune for GloVe
model_glove_ft = build_model(embedding_matrix_glove, trainable=True)
history_glove_ft = model_glove_ft.fit(X_train, y_train, epochs=2, batch_size=BATCH_SIZE, validation_split=0.1)
y_pred_prob_gft = model_glove_ft.predict(X_test, batch_size=512)
y_pred_gft = (y_pred_prob_gft >= 0.5).astype(int).reshape(-1)
print("Fine-tuned GloVe metrics:")
print(classification_report(y_test, y_pred_gft, digits=4))


Epoch 1/2




[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - accuracy: 0.6287 - loss: 0.6349 - val_accuracy: 0.8152 - val_loss: 0.4575
Epoch 2/2
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.8357 - loss: 0.3919 - val_accuracy: 0.8664 - val_loss: 0.3254
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step
Fine-tuned GloVe metrics:
              precision    recall  f1-score   support

           0     0.8388    0.8554    0.8470     12500
           1     0.8524    0.8356    0.8439     12500

    accuracy                         0.8455     25000
   macro avg     0.8456    0.8455    0.8455     25000
weighted avg     0.8456    0.8455    0.8455     25000



In [46]:
# Save models and embedding matrices
model_ft.save("lstm_ft.h5")
model_glove.save("lstm_glove.h5")
np.save("embedding_matrix_ft.npy", embedding_matrix_ft)
np.save("embedding_matrix_glove.npy", embedding_matrix_glove)
print("Saved models and embeddings.")



Saved models and embeddings.
