In [1]:
import warnings

# Silence library warnings before the heavy imports below get a chance to emit them.
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# from transformers.utils import logging as hf_logging
# hf_logging.set_verbosity_error()

import random
import sys
from itertools import product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Make <project_root>/src importable for the project-local modules below.
project_root = Path().resolve().parent
sys.path.append(str(project_root / "src"))

from dataset.custom_data import IMDBDataset
from models.lstm import GloVeLSTM
from utils.embeddings import load_glove_embeddings


In [2]:
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then asks cuDNN for deterministic kernels
    with autotuning switched off.

    Args:
        seed: value handed to each generator.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)

    # Favour run-to-run repeatability over cuDNN's benchmark-driven kernel choice.
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

In [3]:
# Fix all RNG seeds up front, then pick the GPU when one is available.
set_seed(42)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Load the reviews and keep only rows where both the text and the label are present.
df = (
    pd.read_parquet(project_root / "data" / "imdb_reviews.parquet")[["review", "sentiment"]]
    .dropna()
    .reset_index(drop=True)
)

# Features as a list of Python strings, targets as an integer NumPy array.
X = df["review"].astype(str).tolist()
y = df["sentiment"].astype(int).to_numpy()

In [5]:
# Shared splitter (5 stratified, shuffled folds) and F1 scorer for the classical baselines.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scorer = make_scorer(f1_score)

# TF-IDF features feeding a linear classifier; both steps are tuned by the grid search.
pipe = Pipeline(
    steps=[
        ("vec", TfidfVectorizer(ngram_range=(1, 2), min_df=5, sublinear_tf=True)),
        ("clf", LinearSVC()),
    ]
)


In [6]:
# Search space: one sub-grid per classifier family. The "clf" entry swaps the
# pipeline's final step, so each family only sees its own hyper-parameters.
param_grid = [
    dict(  # LinearSVC branch
        clf=[LinearSVC()],
        vec__min_df=[3, 5, 10],
        vec__max_features=[40000, 60000, None],
        vec__ngram_range=[(1, 1), (1, 2)],
        clf__C=[0.5, 1.0, 2.0],
    ),
    dict(  # SGDClassifier branch
        clf=[SGDClassifier(random_state=42)],
        vec__min_df=[3, 5],
        vec__ngram_range=[(1, 1), (1, 2)],
        clf__loss=["hinge", "log_loss"],
        clf__alpha=[1e-5, 1e-4, 1e-3],
        clf__penalty=["l2", "l1"],
        clf__max_iter=[1000, 2000],
    ),
]

# Exhaustive search scored by F1 on the shared folds; the winner is refit on all data.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit=True,
)
gs.fit(X, y)

print("Best F1:", gs.best_score_)
print("Best params:", gs.best_params_)
best_model = gs.best_estimator_

Fitting 5 folds for each of 150 candidates, totalling 750 fits
Best F1: 0.9110504394179648
Best params: {'clf': LinearSVC(), 'clf__C': 0.5, 'vec__max_features': None, 'vec__min_df': 3, 'vec__ngram_range': (1, 2)}


In [7]:
def train_epoch(model, loader, optimizer, criterion, device=None):
    """Run one optimisation pass over ``loader`` and report mean loss and accuracy.

    Args:
        model: module whose forward pass yields one logit per sample.
        loader: iterable of ``(inputs, labels)`` batches.
        optimizer: optimiser that steps ``model``'s parameters.
        criterion: loss called as ``criterion(logits, float_labels)``.
        device: device each batch is moved to. When omitted, the device of the
            model's first parameter is used (CPU for a parameter-free model),
            so the function no longer depends on a notebook-level global.

    Returns:
        ``(mean_loss, accuracy)`` averaged over every sample seen.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total, loss_sum, correct = 0, 0.0, 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.float().to(device)
        optimizer.zero_grad()
        logits = model(xb)               # (B,)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so the epoch mean is per-sample.
        batch_size = yb.size(0)
        loss_sum += float(loss.item()) * batch_size
        preds = (logits.sigmoid() >= 0.5).long()
        correct += (preds == yb.long()).sum().item()
        total += batch_size

    if total == 0:
        raise ValueError("train_epoch: loader yielded no samples")
    return loss_sum / total, correct / total

@torch.no_grad()
def eval_epoch(model, loader, criterion, device=None):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: module whose forward pass yields one logit per sample.
        loader: iterable of ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(logits, float_labels)``.
        device: device each batch is moved to. When omitted, the device of the
            model's first parameter is used (CPU for a parameter-free model),
            so the function no longer depends on a notebook-level global.

    Returns:
        ``(mean_loss, accuracy, y_true, y_pred)`` where the last two are NumPy
        arrays concatenated over all batches; ``y_pred`` thresholds the
        sigmoid of the logits at 0.5.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total, loss_sum, correct = 0, 0.0, 0
    ys, ps = [], []
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.float().to(device)
        logits = model(xb)
        loss = criterion(logits, yb)

        # Weight the batch loss by its size so the epoch mean is per-sample.
        batch_size = yb.size(0)
        loss_sum += float(loss.item()) * batch_size

        pred = (logits.sigmoid() >= 0.5).long()
        correct += (pred == yb.long()).sum().item()
        total += batch_size

        ys.append(yb.cpu().numpy())
        ps.append(pred.cpu().numpy())

    if total == 0:
        # Fail with a clear message instead of an opaque np.concatenate error.
        raise ValueError("eval_epoch: loader yielded no samples")

    y = np.concatenate(ys)
    p = np.concatenate(ps)
    return loss_sum / total, correct / total, y, p


In [8]:
def train_lstm_one_fold(params, tr_idx, va_idx, texts, labels, glove_filename: str):
    """Train and validate a GloVe-initialised LSTM on a single CV fold.

    Args:
        params: hyper-parameter dict. Required keys: ``max_len``, ``batch_size``,
            ``emb_dim``, ``hidden_dim``, ``num_layers``, ``bidirectional``,
            ``dropout``, ``lr``, ``epochs``. Optional: ``min_freq`` (2),
            ``max_vocab_size`` (30000), ``patience`` (2).
        tr_idx: indices of the training samples in ``texts`` / ``labels``.
        va_idx: indices of the validation samples.
        texts: indexable collection of review strings.
        labels: indexable collection of labels aligned with ``texts``.
        glove_filename: GloVe file name forwarded to ``load_glove_embeddings``.

    Returns:
        Dict with ``val_acc`` and ``val_f1`` taken at the epoch with the highest
        validation accuracy, plus that epoch number as ``best_epoch`` (``None``
        if accuracy never rose above 0).
    """

    def _pick(indices):
        # Materialise the texts and labels selected by ``indices``.
        return [texts[i] for i in indices], [labels[i] for i in indices]

    def _build_dataset(fold_texts, fold_labels):
        # Both splits are constructed with identical preprocessing settings.
        return IMDBDataset(
            texts=fold_texts,
            labels=fold_labels,
            max_len=params["max_len"],
            preprocess=True,
            min_freq=params.get("min_freq", 2),
            max_vocab_size=params.get("max_vocab_size", 30000),
            language="english",
        )

    tr_texts, tr_labels = _pick(tr_idx)
    va_texts, va_labels = _pick(va_idx)

    # The training split defines the vocabulary; validation then reuses it.
    train_ds = _build_dataset(tr_texts, tr_labels)
    val_ds = _build_dataset(va_texts, va_labels)
    # NOTE(review): assumes IMDBDataset encodes tokens lazily, so swapping the
    # vocab after construction actually changes the ids it emits -- confirm.
    val_ds.vocab = train_ds.vocab

    batch_size = params["batch_size"]
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0)

    # Embedding matrix aligned with this fold's vocabulary.
    glove_tensor = load_glove_embeddings(glove_filename, train_ds.vocab, embedding_dim=params["emb_dim"])

    model = GloVeLSTM(
        vocab_size=len(train_ds.vocab),
        emb_dim=params["emb_dim"],
        hidden_dim=params["hidden_dim"],
        num_layers=params["num_layers"],
        bidirectional=params["bidirectional"],
        dropout=params["dropout"],
        pad_idx=0,
        pretrained_embeddings=glove_tensor,
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
    criterion = nn.BCEWithLogitsLoss()

    # Early stopping on validation accuracy.
    patience = params.get("patience", 2)
    best = {"val_acc": 0.0, "val_f1": 0.0, "best_epoch": None}
    stale_epochs = 0

    for epoch in range(1, params["epochs"] + 1):
        tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, criterion)
        va_loss, va_acc, y_true, y_pred = eval_epoch(model, val_loader, criterion)
        va_f1 = f1_score(y_true, y_pred)

        tqdm.write(f"[Fold] Ep{epoch:02d} | tr_loss={tr_loss:.4f} tr_acc={tr_acc:.4f} "
                   f"| va_loss={va_loss:.4f} va_acc={va_acc:.4f} va_f1={va_f1:.4f}")

        if va_acc > best["val_acc"]:
            # New best epoch: record it and reset the patience counter.
            best = {"val_acc": va_acc, "val_f1": va_f1, "best_epoch": epoch}
            stale_epochs = 0
            continue

        stale_epochs += 1
        if stale_epochs < patience:
            continue

        tqdm.write(f"Early stop @ epoch {epoch} (best acc={best['val_acc']:.4f} f1={best['val_f1']:.4f})")
        break

    return best

In [9]:
def kfold_lstm_cv(texts, labels, param_grid, glove_filename="glove.6B.100d.txt", n_splits=5, seed=42):
    """Cross-validate every hyper-parameter set in ``param_grid`` with the LSTM.

    Args:
        texts: sequence of review strings.
        labels: sequence of integer labels aligned with ``texts``.
        param_grid: iterable of hyper-parameter dicts, each passed to
            ``train_lstm_one_fold``.
        glove_filename: GloVe file name forwarded to ``train_lstm_one_fold``.
        n_splits: number of stratified folds.
        seed: seeds the fold split and, offset by the fold number, the RNGs
            before each fold's training run.

    Returns:
        List of dicts (``params``, ``mean_acc``, ``std_acc``, ``mean_f1``,
        ``std_f1``), sorted best-first by mean F1 and then mean accuracy.
        The std values are population standard deviations (``np.std``).
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    X = np.array(texts, dtype=object)
    y = np.array(labels, dtype=int)

    all_runs = []
    for params in param_grid:
        tqdm.write(f"\n=== Params: {params} ===")
        fold_metrics = []
        # The splitter has a fixed random_state, so every parameter set is
        # scored on exactly the same folds.
        for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y), start=1):
            tqdm.write(f"\n--- Fold {fold}/{n_splits} ---")
            # Derive the per-fold seed from `seed` rather than a hard-coded 42
            # so the argument really controls reproducibility (same for seed=42).
            set_seed(seed + fold)
            fold_metrics.append(
                train_lstm_one_fold(params, tr_idx, va_idx, texts, labels, glove_filename)
            )

        accs = [m["val_acc"] for m in fold_metrics]
        f1s = [m["val_f1"] for m in fold_metrics]
        mean_acc, std_acc = float(np.mean(accs)), float(np.std(accs))
        mean_f1, std_f1 = float(np.mean(f1s)), float(np.std(f1s))

        all_runs.append({
            "params": params,
            "mean_acc": mean_acc, "std_acc": std_acc,
            "mean_f1": mean_f1,   "std_f1": std_f1,
        })

        tqdm.write(f"\nRESULT | acc={mean_acc:.4f}±{std_acc:.4f} | f1={mean_f1:.4f}±{std_f1:.4f}")

    all_runs.sort(key=lambda run: (run["mean_f1"], run["mean_acc"]), reverse=True)
    return all_runs

In [10]:
# Plain-list views of the data; later cells read these as notebook globals.
texts = df["review"].astype(str).tolist()
labels = df["sentiment"].astype(int).tolist()

# Small param grid to start (expand once it runs fine). The two candidates
# differ only in hidden size, dropout and learning rate.
param_grid = [
    dict(
        emb_dim=100,          # must match your GloVe file (e.g., glove.6B.100d.txt)
        hidden_dim=hidden_dim,
        num_layers=2,
        bidirectional=True,
        dropout=dropout,
        lr=lr,
        batch_size=64,
        epochs=8,
        max_len=256,
        min_freq=2,
        max_vocab_size=30000,
        patience=2,
    )
    for hidden_dim, dropout, lr in ((128, 0.4, 1e-3), (256, 0.5, 5e-4))
]

# Run CV
results = kfold_lstm_cv(
    X,
    y,
    param_grid,
    glove_filename="glove.6B.100d.txt",  # file must be in <project_root>/data
    n_splits=5,
    seed=42,
)

# Leaderboard
pd.DataFrame(results)


=== Params: {'emb_dim': 100, 'hidden_dim': 128, 'num_layers': 2, 'bidirectional': True, 'dropout': 0.4, 'lr': 0.001, 'batch_size': 64, 'epochs': 8, 'max_len': 256, 'min_freq': 2, 'max_vocab_size': 30000, 'patience': 2} ===

--- Fold 1/5 ---
[Fold] Ep01 | tr_loss=0.5953 tr_acc=0.6805 | va_loss=0.4347 va_acc=0.8027 va_f1=0.7934
[Fold] Ep02 | tr_loss=0.3932 tr_acc=0.8293 | va_loss=0.3630 va_acc=0.8424 va_f1=0.8362
[Fold] Ep03 | tr_loss=0.3396 tr_acc=0.8545 | va_loss=0.3365 va_acc=0.8572 va_f1=0.8631
[Fold] Ep04 | tr_loss=0.3105 tr_acc=0.8683 | va_loss=0.3240 va_acc=0.8604 va_f1=0.8540
[Fold] Ep05 | tr_loss=0.2861 tr_acc=0.8806 | va_loss=0.2977 va_acc=0.8752 va_f1=0.8769
[Fold] Ep06 | tr_loss=0.2682 tr_acc=0.8902 | va_loss=0.3197 va_acc=0.8705 va_f1=0.8643
[Fold] Ep07 | tr_loss=0.2474 tr_acc=0.8989 | va_loss=0.2774 va_acc=0.8843 va_f1=0.8866
[Fold] Ep08 | tr_loss=0.2270 tr_acc=0.9092 | va_loss=0.2980 va_acc=0.8819 va_f1=0.8868

--- Fold 2/5 ---
[Fold] Ep01 | tr_loss=0.5924 tr_acc=0.6938 |

Unnamed: 0,params,mean_acc,std_acc,mean_f1,std_f1
0,"{'emb_dim': 100, 'hidden_dim': 128, 'num_layer...",0.883829,0.00656,0.884136,0.006506
1,"{'emb_dim': 100, 'hidden_dim': 256, 'num_layer...",0.879271,0.004578,0.879814,0.006981


In [11]:
# Checkpoint name shared by the tokenizer created here and by the
# sequence-classification model that train_eval_one_fold loads for each fold.
model_name = "distilbert-base-uncased"  # faster than BERT-base
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
def tokenize_batch(batch):
    """Tokenize the ``"text"`` column of ``batch`` with the notebook-level tokenizer.

    Sequences are truncated to at most 256 tokens and padded with
    ``padding=True``; the tokenizer's output is returned unchanged.
    """
    encode_options = dict(truncation=True, padding=True, max_length=256)
    return tokenizer(batch["text"], **encode_options)

def train_eval_one_fold(train_idx, val_idx, hp):
    """Fine-tune the transformer on one fold and score it on the held-out part.

    Reads the notebook-level ``texts``, ``labels``, ``tokenizer`` and
    ``model_name``.

    Args:
        train_idx: indices of the training samples in ``texts`` / ``labels``.
        val_idx: indices of the validation samples.
        hp: dict with ``lr``, ``bsz`` (per-device batch size), ``epochs`` and
            ``wd`` (weight decay).

    Returns:
        Dict with ``eval_accuracy``, ``eval_f1``, ``y_true`` (validation labels
        as a NumPy array) and ``y_pred`` (argmax of the validation logits).
    """

    def _make_split(indices):
        # Build and tokenize the dataset for ``indices``; also return its labels.
        fold_texts = [texts[i] for i in indices]
        fold_labels = [labels[i] for i in indices]
        ds = Dataset.from_dict({"text": fold_texts, "label": fold_labels})
        return ds.map(tokenize_batch, batched=True), fold_labels

    train_ds, _ = _make_split(train_idx)
    val_ds, va_labels = _make_split(val_idx)

    # Fresh model per fold so no weights leak between folds.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    args = TrainingArguments(
        output_dir="tmp_out",
        learning_rate=hp["lr"],
        per_device_train_batch_size=hp["bsz"],
        per_device_eval_batch_size=hp["bsz"],
        num_train_epochs=hp["epochs"],
        weight_decay=hp["wd"],
        eval_strategy="epoch",
        disable_tqdm=False,
        save_strategy="no",       # nothing is checkpointed during the search
        log_level="error",
        log_level_replica="error",
        logging_steps=50,
        report_to=[],
        fp16=torch.cuda.is_available(),
    )

    def compute_metrics(eval_pred):
        logits, y_true = eval_pred
        y_pred = np.argmax(logits, axis=1)
        return {"accuracy": (y_pred == y_true).mean(), "f1": f1_score(y_true, y_pred)}

    # NOTE(review): newer transformers releases appear to rename `tokenizer=`
    # to `processing_class=` -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # One inference pass gives both the logits and the compute_metrics values.
    # The "eval" prefix keeps the metric keys identical to trainer.evaluate(),
    # so the previous separate evaluate() + predict() passes are not needed.
    pred_out = trainer.predict(val_ds, metric_key_prefix="eval")
    y_pred = np.argmax(pred_out.predictions, axis=1)
    y_true = np.array(va_labels)
    return {
        "eval_accuracy": pred_out.metrics["eval_accuracy"],
        "eval_f1": pred_out.metrics["eval_f1"],
        "y_true": y_true,
        "y_pred": y_pred,
    }


In [13]:
def expand_grid(grid_dict):
    """Expand ``{name: [candidate values]}`` into one dict per combination.

    Combinations follow ``itertools.product`` order: the last key varies
    fastest. An empty grid yields a single empty dict.
    """
    names = tuple(grid_dict)
    candidate_lists = (grid_dict[name] for name in names)
    return [dict(zip(names, choice)) for choice in product(*candidate_lists)]

# Fine-tuning search space; every combination is expanded (3 * 3 * 3 * 1 = 27 settings).
grid = dict(
    lr=[2e-5, 3e-5, 5e-5],
    bsz=[16, 32, 64],
    epochs=[2, 3, 4],
    wd=[0.01],
)
hp_list = expand_grid(grid)

In [None]:
# Same stratified folds for every hyper-parameter setting (fixed random_state).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_rows = []    # one row per (hp, fold)
agg_preds = {}   # (hp_idx) -> list of (y_true, y_pred) for confusion matrices

for hp_idx, hp in enumerate(hp_list):
    agg_preds[hp_idx] = []
    for fold_id, (tr_idx, va_idx) in enumerate(cv.split(X, y), start=1):
        fold_result = train_eval_one_fold(tr_idx, va_idx, hp)

        # Flatten identifiers, hyper-parameters and scores into one record.
        row = {"hp_idx": hp_idx, "fold": fold_id}
        row.update((name, hp[name]) for name in ("lr", "bsz", "epochs", "wd"))
        row.update(acc=fold_result["eval_accuracy"], f1=fold_result["eval_f1"])
        all_rows.append(row)

        agg_preds[hp_idx].append((fold_result["y_true"], fold_result["y_pred"]))

results_df = pd.DataFrame(all_rows)

Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2545,0.2828,0.8916,0.885723
2,0.1614,0.274318,0.910154,0.910533


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2627,0.22422,0.911667,0.91008
2,0.122,0.23545,0.918927,0.920016


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.265,0.233508,0.907322,0.90554
2,0.1928,0.258672,0.914381,0.91475


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2568,0.234258,0.905708,0.906993
2,0.1926,0.272737,0.913372,0.913677


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2269,0.259744,0.896934,0.894181
2,0.1474,0.293006,0.910549,0.910917


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2588,0.278648,0.897247,0.892928
2,0.1898,0.285794,0.908339,0.907387
3,0.108,0.36326,0.907734,0.908252


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2608,0.227579,0.910558,0.90924
2,0.1139,0.266331,0.9157,0.917862
3,0.1545,0.299093,0.91923,0.919988


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.265,0.245491,0.902783,0.899247
2,0.2094,0.250132,0.911053,0.913682
3,0.0912,0.338831,0.914078,0.914851


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2466,0.237161,0.905002,0.904675
2,0.1975,0.268524,0.910549,0.911451
3,0.1173,0.35176,0.912566,0.912415


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2345,0.247295,0.903792,0.903167
2,0.1615,0.289886,0.909238,0.908088
3,0.1092,0.360705,0.910448,0.911022


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.267,0.266386,0.898861,0.895117
2,0.1791,0.290193,0.908642,0.907267
3,0.0932,0.388416,0.908642,0.909
4,0.0422,0.431855,0.908339,0.909164


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2614,0.22925,0.911869,0.90958
2,0.1209,0.252833,0.9157,0.917244
3,0.1536,0.291754,0.917415,0.917481
4,0.0263,0.386452,0.917415,0.917779


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2677,0.247687,0.902279,0.898587
2,0.2068,0.24367,0.91307,0.915057
3,0.097,0.356659,0.913574,0.914085
4,0.0703,0.39746,0.914381,0.914802


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.257,0.242126,0.900464,0.902749
2,0.2025,0.262015,0.911355,0.910987
3,0.1203,0.367274,0.910246,0.910714
4,0.05,0.413232,0.910549,0.911256


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2459,0.247255,0.903691,0.903232
2,0.1761,0.28334,0.906212,0.904341
3,0.1134,0.385961,0.908633,0.910669
4,0.0653,0.433559,0.910044,0.910996


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2708,0.285453,0.881718,0.873993
2,0.1777,0.240032,0.908339,0.908817


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2676,0.228626,0.909045,0.907525
2,0.2088,0.216253,0.917011,0.918052


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.238,0.233925,0.906515,0.905166
2,0.1716,0.233192,0.914582,0.91519


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2701,0.235636,0.905305,0.90639
2,0.1714,0.245688,0.91186,0.912178


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2404,0.283438,0.885841,0.879446
2,0.1528,0.251325,0.906918,0.907283


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2768,0.347695,0.859433,0.843476
2,0.1898,0.247052,0.909247,0.911417
3,0.0988,0.281545,0.911163,0.912556


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2776,0.2345,0.90723,0.904167
2,0.2043,0.217394,0.917818,0.918459
3,0.1413,0.247125,0.918725,0.919464


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2426,0.230585,0.907322,0.906311
2,0.1726,0.228972,0.913574,0.913722
3,0.1011,0.279462,0.912364,0.913178


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.268,0.240896,0.903893,0.900884
2,0.1743,0.24786,0.911053,0.912932
3,0.1009,0.289125,0.912666,0.912348


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2441,0.28727,0.884328,0.87705
2,0.1514,0.258749,0.90954,0.909256
3,0.1055,0.3002,0.907725,0.907994


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2696,0.338635,0.866794,0.853564
2,0.2028,0.243769,0.909852,0.911555
3,0.0988,0.288172,0.910356,0.912024
4,0.0681,0.362849,0.908642,0.909599


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2715,0.227354,0.907331,0.90515
2,0.1975,0.226857,0.914994,0.917264
3,0.1426,0.244511,0.914591,0.914591
4,0.0807,0.314403,0.916406,0.917025


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2443,0.228662,0.908027,0.907505
2,0.1774,0.228913,0.910952,0.911265
3,0.1047,0.287256,0.913171,0.914643
4,0.0782,0.33939,0.912263,0.913208


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2744,0.249915,0.900161,0.896292
2,0.1896,0.23441,0.911456,0.911777
3,0.1045,0.304083,0.906616,0.909269
4,0.0721,0.338593,0.910246,0.910571


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2467,0.280228,0.885841,0.879034
2,0.1514,0.252587,0.908532,0.90759
3,0.1002,0.305161,0.908935,0.910514
4,0.0622,0.367791,0.906515,0.907272


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2599,0.252106,0.896743,0.895914
2,0.1972,0.243482,0.905516,0.905971


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.266,0.250591,0.898256,0.893442
2,0.2025,0.220574,0.91328,0.914137


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2429,0.248448,0.898649,0.894577
2,0.1949,0.228554,0.911658,0.912084


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2721,0.27858,0.886244,0.893404
2,0.1675,0.240225,0.90712,0.907743


Map:   0%|          | 0/39666 [00:00<?, ? examples/s]

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2606,0.25273,0.897035,0.895635
2,0.1901,0.240979,0.903086,0.903871


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2645,0.261105,0.892407,0.888307
2,0.1924,0.245127,0.908843,0.908981
3,0.1262,0.257777,0.907533,0.908125


Map:   0%|          | 0/39665 [00:00<?, ? examples/s]

Map:   0%|          | 0/9917 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2558,0.227808,0.909045,0.90779
2,0.1925,0.215698,0.913079,0.914382


In [None]:
# Collapse the per-fold rows into one row per hyper-parameter setting
# (mean and std of accuracy and F1), ranked best-first.
summary = (
    results_df
    .groupby(["hp_idx", "lr", "bsz", "epochs", "wd"], as_index=False)
    .agg(**{
        f"{stat}_{metric}": (metric, stat)
        for metric in ("acc", "f1")
        for stat in ("mean", "std")
    })
    .sort_values(by=["mean_f1", "mean_acc"], ascending=[False, False])
    .reset_index(drop=True)
)

display(summary)

# ---- pick best & worst by mean_f1 (the frame is sorted, so they sit at the ends)
best_row, worst_row = summary.iloc[0], summary.iloc[-1]
best_hp_idx, worst_hp_idx = (int(row["hp_idx"]) for row in (best_row, worst_row))

print("Best HP:", dict(best_row[["lr", "bsz", "epochs", "wd"]]))
print("Worst HP:", dict(worst_row[["lr", "bsz", "epochs", "wd"]]))

In [None]:
# Per-fold validation accuracy of the best vs. worst hyper-parameter setting.
fig, ax = plt.subplots(figsize=(10, 5))
for _hp_idx, _label in ((best_hp_idx, "best (acc)"), (worst_hp_idx, "worst (acc)")):
    _rows = results_df.loc[results_df.hp_idx == _hp_idx, ["fold", "acc"]]
    ax.plot(_rows["fold"], _rows["acc"], marker="o", label=_label)

ax.set_title("Per-fold accuracy: best vs worst hyperparams")
ax.set_xlabel("fold")
ax.set_ylabel("accuracy")
# Tick on the folds actually present instead of assuming exactly five.
ax.set_xticks(sorted(results_df["fold"].unique()))
ax.legend()
plt.tight_layout()
plt.show()

# ---- aggregate confusion matrices (sum over folds) for best vs worst
def sum_conf_mat(pairs):
    """Add up the 2x2 confusion matrices of several ``(y_true, y_pred)`` pairs.

    Each matrix is computed with the label order fixed to ``[0, 1]``. An empty
    ``pairs`` gives an all-zero integer matrix.
    """
    running_total = np.zeros((2, 2), dtype=int)
    for pair in pairs:
        running_total += confusion_matrix(*pair, labels=[0, 1])
    return running_total

# Fold-summed confusion matrices for the best and the worst setting.
cm_best, cm_worst = (sum_conf_mat(agg_preds[idx]) for idx in (best_hp_idx, worst_hp_idx))

In [None]:
# Side-by-side confusion matrices: best setting on the left, worst on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, cm, title in zip(
    axes,
    (cm_best, cm_worst),
    ("Best HP (sum over folds)", "Worst HP (sum over folds)"),
):
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["neg", "pos"])
    disp.plot(ax=ax, cmap="Blues", values_format="d", colorbar=False)
    ax.set_title(title)
plt.tight_layout()
plt.show()