In [None]:
!pip install datasets transformers gensim scikit-learn torch tensorflow tqdm --quiet

import re, string, time
import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
import torch



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# 1. Load Dataset
# Pull the IMDB dataset and materialise its two labelled splits as DataFrames.
dataset = load_dataset("imdb")
train_df, test_df = (pd.DataFrame(dataset[split]) for split in ("train", "test"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# 2. Preprocessing

def clean_text(text):
    """Normalise a raw review string before feature extraction.

    Steps, in order:
      1. lowercase the text;
      2. replace every HTML tag with a single space;
      3. delete all ASCII punctuation characters;
      4. collapse runs of whitespace and strip the ends.

    Args:
        text: Raw review text (``str``).

    Returns:
        The cleaned string; an empty input yields an empty string.
    """
    text = text.lower()
    # Substitute a space, not "": tags such as "<br /><br />" sit directly between
    # words, so deleting them would fuse "great.<br /><br />the" into "greatthe".
    text = re.sub(r"<.*?>", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Tag replacement can leave doubled or leading/trailing spaces; normalise them.
    return " ".join(text.split())

# Run the cleaner over the text column of both labelled splits.
for frame in (train_df, test_df):
    frame["text"] = frame["text"].apply(clean_text)

# Hold out 20% of the training reviews as a validation set (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    train_df["text"],
    train_df["label"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# 3. TF–IDF Model

# The vectoriser is fitted on the training split only and then reused for validation.
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val[:5000])

lr_tfidf = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train)
pred_tfidf = lr_tfidf.predict(X_val_tfidf)

# Each metric is scored against the first 5000 validation labels.
acc_tfidf, p_tfidf, r_tfidf, f1_tfidf = (
    metric(y_val[:5000], pred_tfidf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(" TF–IDF Model Report\n")
print(classification_report(y_val[:5000], pred_tfidf))
print("-" * 65)

 TF–IDF Model Report

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2515
           1       0.88      0.90      0.89      2485

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

-----------------------------------------------------------------


In [None]:
# 4. Word2Vec Model

# Whitespace-tokenise every review; validation is limited to its first 5000 rows.
train_tokens = list(map(str.split, X_train))
val_tokens = list(map(str.split, X_val[:5000]))

# Embeddings are trained on the training tokens only.
w2v_model = Word2Vec(
    sentences=train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

def get_w2v_features(tokens_list, model):
    """Embed each tokenised document as the mean of its in-vocabulary word vectors.

    Args:
        tokens_list: Iterable of token lists, one list per document.
        model: Object exposing its word vectors as ``model.wv``; ``model.wv`` must
            support ``word in wv``, ``wv[word]`` and a ``vector_size`` attribute.

    Returns:
        ``np.ndarray`` of shape ``(n_documents, model.wv.vector_size)``. A document
        with no in-vocabulary tokens is represented by an all-zero row.
    """
    wv = model.wv
    # Take the width from the model rather than hard-coding 100, so the zero
    # fallback always matches the real vectors.
    dim = wv.vector_size
    features = []
    for tokens in tokens_list:
        vectors = [wv[word] for word in tokens if word in wv]
        features.append(np.mean(vectors, axis=0) if vectors else np.zeros(dim))
    # reshape keeps the result 2-D even when tokens_list is empty.
    return np.array(features).reshape(-1, dim)

# Mean-pooled document embeddings for the training and validation tokens.
X_train_w2v, X_val_w2v = (
    get_w2v_features(tokens, w2v_model) for tokens in (train_tokens, val_tokens)
)

lr_w2v = LogisticRegression(max_iter=200).fit(X_train_w2v, y_train)
pred_w2v = lr_w2v.predict(X_val_w2v)

# Each metric is scored against the first 5000 validation labels.
acc_w2v, p_w2v, r_w2v, f1_w2v = (
    metric(y_val[:5000], pred_w2v)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(" Word2Vec Model Report\n")
print(classification_report(y_val[:5000], pred_w2v))
print("-" * 65)


 Word2Vec Model Report

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      2515
           1       0.82      0.84      0.83      2485

    accuracy                           0.83      5000
   macro avg       0.83      0.83      0.83      5000
weighted avg       0.83      0.83      0.83      5000

-----------------------------------------------------------------


In [None]:
# 5. Fine-Tuned BERT (7 Epochs + Progress Bar + Scheduler)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_batch(texts, labels, max_len=256):
    """Tokenise ``texts`` and bundle them with ``labels`` into a ``TensorDataset``.

    Args:
        texts: Sequence of review strings (pandas Series, array or list).
        labels: Integer class labels aligned one-to-one with ``texts``.
        max_len: Every sequence is truncated or padded to exactly this many tokens.

    Returns:
        ``TensorDataset`` whose items are ``(input_ids, attention_mask, label)``.
    """
    # Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
    # `__call__`, which takes the same keyword arguments.
    enc = tokenizer(
        list(texts),
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Explicit int64, the dtype torch.tensor() previously inferred from a list of
    # Python ints; np.asarray also lets plain lists be passed, not only Series.
    label_tensor = torch.as_tensor(np.asarray(labels), dtype=torch.long)
    return TensorDataset(enc["input_ids"], enc["attention_mask"], label_tensor)

# Seed torch's global RNG before anything random happens in this cell: the
# DataLoader shuffle order and the newly initialised classifier head both draw
# from it. (GPU kernels may still cause small run-to-run differences.)
torch.manual_seed(42)

# Fine-tune on the first 8000 training reviews; evaluate on the first 5000 validation reviews.
train_ds = encode_batch(X_train[:8000], y_train[:8000])
val_ds = encode_batch(X_val[:5000], y_val[:5000])

# Only the training loader is shuffled.
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 7
# One scheduler step is taken per batch, so the schedule spans batches x epochs.
total_steps = len(train_loader) * epochs

# Linear schedule with warm-up over the first 10% of the optimiser steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
)

# Training with tqdm progress bar
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training", leave=False)

    for batch in progress_bar:
        # Move the (input_ids, attention_mask, labels) triple onto the target device.
        input_ids, mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # One optimiser update per batch, then one learning-rate scheduler tick.
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

    print(f"Average loss: {total_loss/len(train_loader):.4f}")

# Evaluation
model.eval()
preds, true_labels = [], []
progress_bar = tqdm(val_loader, desc="Evaluating")

# Gradient tracking is switched off for the whole validation pass.
with torch.no_grad():
    for batch in progress_bar:
        input_ids, mask, labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=mask)
        # Predicted class = index of the largest logit in each row.
        y_pred = outputs.logits.argmax(dim=1)
        preds.extend(y_pred.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

acc_bert, p_bert, r_bert, f1_bert = (
    metric(true_labels, preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n🔹 BERT Model Report (7 Epochs + Progress Bar)\n")
print(classification_report(true_labels, preds))
print(f"Final Accuracy: {acc_bert:.4f}")
print("-" * 65)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/7


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Average loss: 0.4082

Epoch 2/7


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Average loss: 0.1993

Epoch 3/7


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Average loss: 0.0944

Epoch 4/7


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Average loss: 0.0406

Epoch 5/7


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Average loss: 0.0215

Epoch 6/7


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Average loss: 0.0128

Epoch 7/7


Training:   0%|          | 0/500 [00:00<?, ?it/s]

Average loss: 0.0103


Evaluating:   0%|          | 0/313 [00:00<?, ?it/s]


🔹 BERT Model Report (7 Epochs + Progress Bar)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      2515
           1       0.91      0.91      0.91      2485

    accuracy                           0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000

Final Accuracy: 0.9088
-----------------------------------------------------------------


In [None]:
# 6. Results Comparison Table
# One row per model: (name, accuracy, precision, recall, F1).
results = pd.DataFrame(
    [
        ("TF–IDF", acc_tfidf, p_tfidf, r_tfidf, f1_tfidf),
        ("Word2Vec", acc_w2v, p_w2v, r_w2v, f1_w2v),
        ("BERT (7 Epochs)", acc_bert, p_bert, r_bert, f1_bert),
    ],
    columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"],
)
print("\n✅ Comparison Table:\n")
print(results)


✅ Comparison Table:

             Model  Accuracy  Precision    Recall  F1 Score
0           TF–IDF    0.8868   0.878800  0.895775  0.887206
1         Word2Vec    0.8276   0.820371  0.836217  0.828218
2  BERT (7 Epochs)    0.9088   0.908249  0.908249  0.908249
