In [None]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

# Add Q2/functions to sys.path whether the notebook runs from repo root or /Q2
cwd = Path.cwd()
if (cwd / "functions").exists():
    # Launched from inside /Q2: the helper modules sit right next to the notebook.
    functions_dir = cwd / "functions"
else:
    # Otherwise assume the repo root is the working directory.
    functions_dir = cwd / "Q2" / "functions"

# Notebook cells get re-run: only register the directory once so that
# repeated executions do not stack duplicate entries on sys.path.
if str(functions_dir) not in sys.path:
    sys.path.insert(0, str(functions_dir))

from model_heads import SentimentMLPClassifier, LinearHead
from eval_utils import evaluate_model  # detailed plots + ROC

# Reproducibility: push one shared seed into both NumPy's and PyTorch's global RNG.
SEED = 42
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Device:", device)

In [None]:
# Look for the processed CSVs next to the notebook first, then one level up.
data_dir = cwd if (cwd / "train_df_processed.csv").exists() else cwd.parent
train_path = data_dir / "train_df_processed.csv"
test_path = data_dir / "test_df_processed.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Basic safety: a row without text or without a label cannot be used downstream.
required_cols = ["processed_text", "sentiment_class"]
train_df, test_df = (frame.dropna(subset=required_cols) for frame in (train_df, test_df))

for caption, frame in (("train_df:", train_df), ("test_df :", test_df)):
    print(caption, frame.shape)
train_df.head()


In [None]:
# -------------------- Train/Val split (stratified) --------------------
def _texts_and_labels(frame):
    """Return (texts cast to str, labels cast to int) as arrays taken from `frame`."""
    texts = frame["processed_text"].astype(str).values
    labels = frame["sentiment_class"].astype(int).values
    return texts, labels


X_text, y = _texts_and_labels(train_df)

# Hold out 20% of the training rows for validation; stratifying on the labels
# keeps the class proportions the same in both parts.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_text, y, test_size=0.20, random_state=SEED, stratify=y
)

X_test_text, y_test = _texts_and_labels(test_df)

for caption, split in (
    ("Train size:", X_train_text),
    ("Val size  :", X_val_text),
    ("Test size :", X_test_text),
):
    print(caption, len(split))


In [None]:
# ==================== 1) Vectorization: Frozen BERT embeddings ====================
from transformers import BertTokenizer, BertModel

# Checkpoint name handed to BertTokenizer/BertModel.from_pretrained() further down.
PRETRAINED_MODEL = "bert-base-uncased"
# Every text is truncated/padded to this many tokens by TextDataset.
MAX_LENGTH = 128
# Batch size used only for the embedding-extraction DataLoaders (the MLP defines its own).
BERT_BATCH_SIZE = 64

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup."""

    def __init__(self, texts, labels, tokenizer, max_length):
        """
        Keep the raw samples together with the tokenization settings.

        Args:
            texts (iterable): Text samples; copied into a list.
            labels (iterable): One label per text; each is passed through int() on access.
            tokenizer (callable): Called as
                tokenizer(text, truncation=True, padding="max_length",
                          max_length=..., return_tensors="pt")
                and expected to return a mapping with "input_ids" and
                "attention_mask" tensors (e.g. transformers.BertTokenizer).
            max_length (int): Length every sequence is truncated/padded to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = [*texts]
        self.labels = [*labels]

    def __len__(self):
        """Number of samples, taken from the stored text list."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize sample `idx`.

        Returns:
            dict: "input_ids" and "attention_mask" with their leading axis
            squeezed away, plus "label" as a 0-d torch.long tensor.
        """
        sample_text = self.texts[idx]
        sample_label = int(self.labels[idx])

        tokenizer_kwargs = {
            "truncation": True,
            "padding": "max_length",
            "max_length": self.max_length,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(sample_text, **tokenizer_kwargs)

        # return_tensors="pt" adds a batch axis of size 1 for a single text;
        # drop it so the DataLoader can stack samples into (batch, max_length).
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Extract embeddings for all data
def extract_embeddings(bert_model, dataloader, device):
    """
    Run every batch through a frozen encoder and collect its pooled outputs.

    Args:
        bert_model (transformers.BertModel): Encoder whose output exposes
            `pooler_output`. It is switched to eval mode here.
        dataloader (iterable): Yields dicts holding "input_ids" and
            "attention_mask" tensors (e.g. a DataLoader over TextDataset).
        device (torch.device): Device the input tensors are moved to; the
            model is expected to live there already.

    Returns:
        torch.Tensor: CPU tensor with the per-batch pooled outputs
        concatenated along dim 0, i.e. (number_of_samples, embedding_size).
    """
    bert_model.eval()  # inference behaviour (e.g. dropout off)

    def _pooled_on_cpu(batch):
        # One forward pass; `pooler_output` is BERT's pooled [CLS] representation.
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        encoded = bert_model(input_ids=ids, attention_mask=mask)
        return encoded.pooler_output.cpu()

    # The encoder is frozen, so no autograd graph is needed.
    with torch.no_grad():
        chunks = [_pooled_on_cpu(batch) for batch in dataloader]

    # Stitch the per-batch results into one matrix.
    return torch.cat(chunks, dim=0)


In [None]:
# -------------------- Compute or load cached embeddings --------------------
# Caching avoids recomputing BERT embeddings every run.

# Reuse ./outputs when it is already there. Otherwise create the folder next to
# the functions/ directory, which resolves to Q2/outputs from either launch
# location (the previous fallback cwd/"Q2"/"outputs" produced Q2/Q2/outputs on
# a first run from inside /Q2).
out_dir = (cwd / "outputs") if (cwd / "outputs").exists() else (functions_dir.parent / "outputs")
out_dir.mkdir(parents=True, exist_ok=True)

train_emb_path = out_dir / "bert_train_embeddings.npy"
val_emb_path = out_dir / "bert_val_embeddings.npy"
test_emb_path = out_dir / "bert_test_embeddings.npy"

cache_paths = (train_emb_path, val_emb_path, test_emb_path)
expected_rows = (len(X_train_text), len(X_val_text), len(X_test_text))

# A cache written for a different dataset or split would silently pair
# embeddings with the wrong labels. The row count is a cheap check that is
# necessary but not sufficient: delete the .npy files by hand whenever the
# texts change but the split sizes stay the same.
cached = None
if all(path.exists() for path in cache_paths):
    cached = [np.load(path) for path in cache_paths]
    if any(arr.shape[0] != n_rows for arr, n_rows in zip(cached, expected_rows)):
        print("Cached embeddings do not match the current split sizes; recomputing.")
        cached = None

if cached is not None:
    print("Loading cached embeddings from:", out_dir)
    X_train_emb, X_val_emb, X_test_emb = cached
else:
    print("Computing BERT embeddings (this may take a while)...")

    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
    bert = BertModel.from_pretrained(PRETRAINED_MODEL).to(device)

    train_ds = TextDataset(X_train_text, y_train, tokenizer, MAX_LENGTH)
    val_ds = TextDataset(X_val_text, y_val, tokenizer, MAX_LENGTH)
    test_ds = TextDataset(X_test_text, y_test, tokenizer, MAX_LENGTH)

    # shuffle=False everywhere: row i of each matrix must stay aligned with label i.
    # These loaders get their own names so they are not confused with the MLP
    # loaders that are built later from the scaled embeddings.
    bert_train_loader = DataLoader(train_ds, batch_size=BERT_BATCH_SIZE, shuffle=False)
    bert_val_loader = DataLoader(val_ds, batch_size=BERT_BATCH_SIZE, shuffle=False)
    bert_test_loader = DataLoader(test_ds, batch_size=BERT_BATCH_SIZE, shuffle=False)

    X_train_emb = extract_embeddings(bert, bert_train_loader, device).numpy()
    X_val_emb = extract_embeddings(bert, bert_val_loader, device).numpy()
    X_test_emb = extract_embeddings(bert, bert_test_loader, device).numpy()

    np.save(train_emb_path, X_train_emb)
    np.save(val_emb_path, X_val_emb)
    np.save(test_emb_path, X_test_emb)

    print("Saved embeddings to:", out_dir)

print("X_train_emb:", X_train_emb.shape)
print("X_val_emb  :", X_val_emb.shape)
print("X_test_emb :", X_test_emb.shape)


In [None]:
# ==================== 2) After vectorization cleaning ====================
# 1) Sanity checks on every split, not only train: a non-finite value in val/test
#    would otherwise surface later as a confusing metric, and a row-count
#    mismatch means embeddings and labels are no longer aligned (e.g. a stale cache).
for split_name, emb, labels in (
    ("train", X_train_emb, y_train),
    ("val", X_val_emb, y_val),
    ("test", X_test_emb, y_test),
):
    assert emb.shape[0] == len(labels), (
        f"{split_name} embeddings have {emb.shape[0]} rows but there are {len(labels)} labels"
    )
    assert not np.isnan(emb).any(), f"NaNs found in {split_name} embeddings"
    assert not np.isinf(emb).any(), f"Infs found in {split_name} embeddings"

# 2) Standardization (recommended for MLP stability)
# Statistics come from the training split only and are then applied unchanged
# to val/test, so nothing about the held-out data leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_emb)
X_val_scaled = scaler.transform(X_val_emb)
X_test_scaled = scaler.transform(X_test_emb)

print("Scaled embeddings computed.")
print("Train mean (first 5 dims):", X_train_scaled.mean(axis=0)[:5])
print("Train std  (first 5 dims):", X_train_scaled.std(axis=0)[:5])


In [None]:
class EmbeddingDataset(Dataset):
    """Map-style dataset that serves precomputed embeddings with their labels."""

    def __init__(self, X, y):
        """
        Args:
            X (array-like): Feature matrix, one row per sample; copied into a
                float32 tensor.
            y (array-like): One integer label per row of X; copied into a
                long tensor.

        Raises:
            ValueError: If X and y do not hold the same number of samples.
        """
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

        # __len__ follows y while __getitem__ indexes both, so a mismatch would
        # either drop rows silently or fail mid-epoch; reject it up front.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X has {len(self.X)} rows but y has {len(self.y)} labels"
            )

    def __len__(self):
        """Number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return {"x": feature vector, "label": class index} for sample `idx`."""
        return {
            "x": self.X[idx],
            "label": self.y[idx],
        }

# DataLoaders for the MLP
MLP_BATCH_SIZE = 128


def _mlp_loader(features, labels, shuffle):
    """Wrap one split's features and labels in a DataLoader of MLP_BATCH_SIZE."""
    return DataLoader(
        EmbeddingDataset(features, labels),
        batch_size=MLP_BATCH_SIZE,
        shuffle=shuffle,
    )


# Only the training split is shuffled; val/test keep their original order.
train_loader = _mlp_loader(X_train_scaled, y_train, shuffle=True)
val_loader = _mlp_loader(X_val_scaled, y_val, shuffle=False)
test_loader = _mlp_loader(X_test_scaled, y_test, shuffle=False)

print("MLP_BATCH_SIZE:", MLP_BATCH_SIZE)


In [None]:
# ==================== 3) Modeling ====================
# Output width of every head = number of distinct training labels;
# input width = embedding dimensionality after scaling.
NUM_CLASSES = len(np.unique(y_train))
INPUT_DIM = X_train_scaled.shape[1]

# Counting distinct labels only gives the right output width when the labels
# are exactly 0..NUM_CLASSES-1, which is also what nn.CrossEntropyLoss expects
# as class-index targets. Fail here with a readable message instead of an
# out-of-bounds target error in the middle of training.
assert np.array_equal(np.unique(y_train), np.arange(NUM_CLASSES)), (
    f"Labels must be the contiguous integers 0..{NUM_CLASSES - 1}, "
    f"got {np.unique(y_train).tolist()}"
)

print("INPUT_DIM:", INPUT_DIM)
print("NUM_CLASSES:", NUM_CLASSES)

# Define multiple MLP structures to justify/compare (same training loop)
def _experiment(name, hidden_layers, dropout):
    """Build one experiment config; every variant uses the ReLU activation."""
    return {
        "name": name,
        "hidden_layers": hidden_layers,
        "dropout": dropout,
        "activation": "relu",
    }


experiments = [
    # No hidden layer: plain linear classifier on top of the embeddings.
    _experiment("LinearHead_baseline", (), 0.0),
    # Two-hidden-layer MLPs of different widths / dropout rates.
    _experiment("MLP_256_64_drop0.2", (256, 64), 0.2),
    _experiment("MLP_128_32_drop0.2", (128, 32), 0.2),
    _experiment("MLP_512_128_drop0.3", (512, 128), 0.3),
]

experiments


In [None]:
from copy import deepcopy

def train_one_epoch(model, loader, optimizer, criterion, device=None):
    """
    Run one optimisation pass over `loader` and return the mean training loss.

    Args:
        model (nn.Module): Network to train; put into train mode here.
        loader (iterable): Yields dicts with "x" (features) and "label" (targets).
        optimizer (torch.optim.Optimizer): Steps once per batch.
        criterion (callable): Loss called as criterion(logits, labels).
            NOTE(review): the averaging below assumes it returns the batch
            mean (the default reduction of nn.CrossEntropyLoss).
        device (torch.device, optional): Where batches are moved. Defaults to
            the device of the model's first parameter (CPU for a
            parameter-free model), so the function no longer depends on a
            notebook-level global.

    Returns:
        float: Loss averaged over the samples actually processed, or NaN when
        the loader yielded no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    running_loss = 0.0
    n_seen = 0

    for batch in loader:
        x = batch["x"].to(device)
        yb = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a short final batch does not skew the mean.
        batch_size = x.size(0)
        running_loss += loss.item() * batch_size
        n_seen += batch_size

    # Divide by the samples seen rather than len(loader.dataset): the two differ
    # with drop_last=True or a subsampling sampler, and the former is 0-safe.
    return running_loss / n_seen if n_seen else float("nan")


def predict_proba(model, loader, device=None):
    """
    Score every sample in `loader` and return labels, predictions and probabilities.

    Args:
        model (nn.Module): Classifier producing one logit per class; put into
            eval mode here.
        loader (iterable): Yields dicts with "x" (features) and "label" (targets).
        device (torch.device, optional): Where batches are moved. Defaults to
            the device of the model's first parameter (CPU for a
            parameter-free model), so the function no longer depends on a
            notebook-level global.

    Returns:
        tuple: (y_true, y_pred, probs) as NumPy arrays, where probs is the
        row-wise softmax of the logits and y_pred its argmax.

    Raises:
        ValueError: If the loader yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_logits = []
    all_y = []

    with torch.no_grad():
        for batch in loader:
            x = batch["x"].to(device)
            yb = batch["label"].cpu().numpy()

            logits = model(x).cpu().numpy()
            all_logits.append(logits)
            all_y.append(yb)

    if not all_logits:
        # np.vstack([]) would raise the same exception type with an opaque message.
        raise ValueError("predict_proba received an empty loader: nothing to predict")

    logits = np.vstack(all_logits)
    y_true = np.concatenate(all_y)

    # Softmax -> probabilities; subtracting the row max keeps exp() from overflowing.
    exp = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = exp / exp.sum(axis=1, keepdims=True)

    y_pred = probs.argmax(axis=1)
    return y_true, y_pred, probs


def macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 score of `y_pred` against `y_true` as a built-in float."""
    score = f1_score(y_true, y_pred, average="macro")
    return float(score)


In [None]:
# Train each experiment with the same loop, select best by validation macro-F1

EPOCHS = 30
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-2
PATIENCE = 5  # early stopping patience
MIN_DELTA = 1e-4  # smallest val-F1 gain that counts as an improvement
LR_PATIENCE = 2  # epochs without val-F1 gain before the LR is reduced
LR_FACTOR = 0.5  # multiplicative LR reduction applied by the scheduler

results = []
best_models = {}
# Per-experiment epoch logs. The list built inside the loop used to be thrown
# away when the next experiment started; keep it for learning-curve inspection.
histories = {}

criterion = nn.CrossEntropyLoss()

for cfg in experiments:
    name = cfg["name"]
    hidden_layers = cfg["hidden_layers"]
    dropout = cfg["dropout"]
    activation = cfg["activation"]

    # An empty hidden-layer tuple selects the plain linear baseline.
    if len(hidden_layers) == 0:
        model = LinearHead(INPUT_DIM, NUM_CLASSES).to(device)
    else:
        model = SentimentMLPClassifier(
            input_dim=INPUT_DIM,
            num_classes=NUM_CLASSES,
            hidden_layers=hidden_layers,
            dropout=dropout,
            activation=activation,
        ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    # mode="max": the monitored quantity (validation macro-F1) should increase.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", patience=LR_PATIENCE, factor=LR_FACTOR
    )

    best_val_f1 = -1.0
    best_state = None
    epochs_no_improve = 0

    history = []

    print("\n" + "="*100)
    print("Training:", name)
    print("hidden_layers:", hidden_layers, "dropout:", dropout)

    for epoch in range(1, EPOCHS + 1):
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion)

        yv_true, yv_pred, yv_probs = predict_proba(model, val_loader)
        val_f1 = macro_f1(yv_true, yv_pred)

        scheduler.step(val_f1)

        # Read the LR after scheduler.step() so the log shows the value the
        # next epoch will train with.
        current_lr = optimizer.param_groups[0]["lr"]
        history.append({
            "epoch": epoch,
            "train_loss": train_loss,
            "val_f1_macro": val_f1,
            "lr": current_lr,
        })

        print(f"Epoch {epoch:02d} | train_loss={train_loss:.4f} | val_f1_macro={val_f1:.4f} | lr={current_lr:.2e}")

        if val_f1 > best_val_f1 + MIN_DELTA:
            best_val_f1 = val_f1
            # Snapshot the weights: state_dict() returns live references that
            # later optimizer steps would otherwise overwrite.
            best_state = deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= PATIENCE:
            print(f"Early stopping triggered (patience={PATIENCE}).")
            break

    # Restore best model for this configuration
    if best_state is not None:
        model.load_state_dict(best_state)

    # Final validation metrics for logging (computed on the restored weights)
    yv_true, yv_pred, yv_probs = predict_proba(model, val_loader)

    results.append({
        "model": name,
        "val_f1_macro": macro_f1(yv_true, yv_pred),
    })

    best_models[name] = model
    histories[name] = pd.DataFrame(history)

# Results table
results_df = pd.DataFrame(results).sort_values("val_f1_macro", ascending=False)
results_df

In [None]:
# ==================== 4) Results ====================
# Pick the best configuration by validation macro-F1, then evaluate on test.

# results_df is sorted by val_f1_macro in descending order, so row 0 is the winner.
best_name = results_df.iloc[0]["model"]
print("Best model by val_f1_macro:", best_name)

best_model = best_models[best_name]

# The test split is touched only here, after model selection is finished.
yt_true, yt_pred, yt_probs = predict_proba(best_model, test_loader)

print("Test macro-F1:", macro_f1(yt_true, yt_pred))

# Export full report plot (confusion matrix + ROC, etc.)
# It writes under Q2/reports (or ./reports if running from /Q2).
# Reuse ./reports when it is already there. Otherwise create it next to the
# functions/ directory; the previous fallback cwd/"Q2"/"reports" produced
# Q2/Q2/reports on a first run from inside /Q2.
report_dir = (cwd / "reports") if (cwd / "reports").exists() else (functions_dir.parent / "reports")
report_dir.mkdir(parents=True, exist_ok=True)

evaluate_model(
    y_true=yt_true,
    y_pred=yt_pred,
    y_probs=yt_probs,
    model_name=f"Q2_{best_name}",
    output_path=str(report_dir),
)
