In [1]:
pip install -U transformers datasets accelerate

Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloa

In [2]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)

2025-08-13 07:37:45.195857: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755070665.559249      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755070665.663728      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Location of the IMDB reviews CSV inside the Kaggle input mount.
# load_imdb_dataframe() falls back to the Hugging Face "imdb" dataset when this path does not exist.
DATA_CSV = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
# Root directory for everything this notebook writes (checkpoints and saved models).
WORKDIR = Path("/kaggle/working/imdb_run")
# Create the directory tree up front; a no-op when it already exists.
WORKDIR.mkdir(parents=True, exist_ok=True)

In [4]:
# Single seed reused for every RNG, every shuffle and every train/val/test split in this notebook.
SEED = 42
# NOTE(review): transformers.set_seed presumably already seeds `random`, NumPy and PyTorch,
# which would make the three explicit calls below redundant but harmless — confirm.
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Checkpoint ids that are each fine-tuned on a small subset and ranked by validation F1;
# the winner is then retrained on the full training split.
MODEL_CANDIDATES = [
    "bert-base-uncased",
    "roberta-base",
    "microsoft/deberta-base",
    "google/electra-base-discriminator",
    "distilbert-base-uncased",
]

In [5]:
# --- Tokenization (shared by every run) ---
MAX_LEN = 256          # reviews are truncated to this many tokens

# --- Quick model comparison on a subset ---
SUB_TRAIN = 10_000     # max training rows used per candidate
SUB_VAL = 2_000        # max validation rows used per candidate
SMALL_BS = 16          # per-device batch size
SMALL_EPOCHS = 2       # epochs per candidate

# --- Full training of the winning model ---
FULL_BS = 16           # per-device batch size
FULL_EPOCHS = 3        # epochs

In [6]:
# Data Prep
def load_imdb_dataframe(csv_path: str, seed: "int | None" = None) -> pd.DataFrame:
    """Load IMDB reviews as a shuffled frame with columns ``review`` and ``label``.

    Args:
        csv_path: Path to a CSV with a ``review`` column and a ``sentiment`` column
            holding ``"negative"`` / ``"positive"``. When the path does not exist,
            the Hugging Face ``imdb`` dataset (train + test) is downloaded instead.
        seed: Random state for the (fallback-only) down-sampling and the final
            shuffle. Defaults to the notebook-wide ``SEED``.

    Returns:
        A DataFrame with a fresh ``0..n-1`` index, the review text in ``review``
        and an integer ``label`` (0 = negative, 1 = positive).

    Raises:
        KeyError: if the ``review`` or ``sentiment`` column is missing.
        ValueError: if ``sentiment`` contains anything other than
            ``"negative"`` / ``"positive"`` (including missing values).
    """
    if seed is None:
        seed = SEED

    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
    else:
        # Fallback if running locally without Kaggle dataset path; keeps code robust.
        # Imported lazily so the CSV path never needs network access.
        from datasets import load_dataset

        raw = load_dataset("imdb")
        df = pd.concat(
            [
                raw["train"].to_pandas().assign(split="train"),
                raw["test"].to_pandas().assign(split="test"),
            ],
            ignore_index=True,
        ).rename(columns={"text": "review"})

        df = df.sample(n=50_000, random_state=seed) if len(df) > 50_000 else df
        # Give the fallback the same schema as the CSV so both paths share the code below.
        df["sentiment"] = df["label"].map({0: "negative", 1: "positive"})

    missing = {"review", "sentiment"} - set(df.columns)
    if missing:
        raise KeyError(f"expected columns 'review' and 'sentiment'; missing: {sorted(missing)}")

    labels = df["sentiment"].map({"negative": 0, "positive": 1})
    unmapped = labels.isna()
    if unmapped.any():
        # Without this check the NaNs would silently turn the label column into floats
        # and only fail (or mis-train) much later in the pipeline.
        examples = sorted(df.loc[unmapped, "sentiment"].astype(str).unique())[:5]
        raise ValueError(
            f"{int(unmapped.sum())} row(s) have a sentiment other than "
            f"'negative'/'positive', e.g. {examples}"
        )

    df["label"] = labels.astype("int64")
    return df[["review", "label"]].sample(frac=1, random_state=seed).reset_index(drop=True)

df = load_imdb_dataframe(DATA_CSV)

# Two stratified splits: hold out 20% for test, then 12.5% of the remaining 80%
# for validation, i.e. 70% train / 10% validation / 20% test overall.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED,
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.125,
    stratify=train_val_df["label"],
    random_state=SEED,
)

def to_hf_splits(train_df, val_df, test_df) -> DatasetDict:
    """Wrap the three pandas splits in a DatasetDict keyed "train" / "validation" / "test".

    The pandas index is not carried over into the resulting datasets.
    """
    frames_by_split = {"train": train_df, "validation": val_df, "test": test_df}
    return DatasetDict(
        {
            split_name: Dataset.from_pandas(frame, preserve_index=False)
            for split_name, frame in frames_by_split.items()
        }
    )

# Dataset bundle used by every later cell: ds["train"], ds["validation"], ds["test"].
ds = to_hf_splits(train_df, val_df, test_df)

In [7]:
#  Tokenization 
def build_tokenizer(model_id: str):
    """Load the tokenizer for ``model_id``, requesting the fast implementation."""
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    return tokenizer

def tokenization_fn(examples, tok, max_len=MAX_LEN):
    """Tokenize the ``review`` texts of a batch, truncating to ``max_len`` tokens.

    No padding is applied here. ``train_once`` hands the Trainer a
    ``DataCollatorWithPadding``, which pads each batch to its own longest
    sequence; padding every row to ``max_len`` up front would make that collator
    a no-op and spend compute on padding tokens for every short review.

    Args:
        examples: Batch mapping that contains a ``"review"`` list of strings.
        tok: Tokenizer callable (as returned by ``build_tokenizer``).
        max_len: Maximum number of tokens kept per review.

    Returns:
        Whatever ``tok`` returns for the batch (unpadded, truncated encodings).
    """
    return tok(examples["review"], padding=False, truncation=True, max_length=max_len)

def prepare_for_trainer(dataset: Dataset, tokenizer) -> Dataset:
    """Tokenize ``dataset`` in batches and return it in torch format.

    The raw ``review`` column is dropped, since the tokenizer output replaces it.
    """

    def _encode(batch):
        return tokenization_fn(batch, tokenizer)

    encoded = dataset.map(_encode, batched=True, remove_columns=["review"])
    encoded.set_format("torch")
    return encoded

In [8]:
#  Metrics 
def binary_metrics(eval_pred):
    """Compute F1, precision and recall from a Trainer evaluation result.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (true class ids).

    Returns:
        Dict with keys ``"f1"``, ``"precision"`` and ``"recall"``.
    """
    labels = eval_pred.label_ids
    # Predicted class = index of the highest score in each row.
    preds = np.argmax(eval_pred.predictions, axis=1)
    # zero_division=0 on all three metrics: a degenerate epoch (e.g. no positive
    # predictions) scores 0 consistently instead of warning for F1 only.
    return {
        "f1": f1_score(labels, preds, zero_division=0),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
    }

In [9]:
#  Training Helper 
def train_once(
    model_id: str,
    train_set: Dataset,
    val_set: Dataset,
    outdir: Path,
    per_device_bs: int,
    epochs: int,
) -> tuple[dict, Trainer]:
    """Fine-tune ``model_id`` as a 2-class classifier and evaluate it on ``val_set``.

    Args:
        model_id: Checkpoint id to load the tokenizer and pretrained weights from.
        train_set: Untokenized training split (needs ``review`` and ``label``).
        val_set: Untokenized validation split, evaluated after every epoch.
        outdir: Directory for checkpoints and for the final saved model.
        per_device_bs: Batch size per device, used for both training and evaluation.
        epochs: Number of training epochs.

    Returns:
        ``(metrics, trainer)``: the dict returned by the final ``trainer.evaluate()``
        call on ``val_set``, and the Trainer itself. The arguments request that
        the checkpoint with the best validation F1 be reloaded at the end of training.
    """
    tokenizer = build_tokenizer(model_id)
    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    token_train = prepare_for_trainer(train_set, tokenizer)
    token_val = prepare_for_trainer(val_set, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    args = TrainingArguments(
        output_dir=str(outdir),
        # `eval_strategy` is the current argument name (older releases used `evaluation_strategy`).
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=per_device_bs,
        per_device_eval_batch_size=per_device_bs,
        num_train_epochs=epochs,
        # Keep the checkpoint with the highest validation F1, not simply the last epoch.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
        save_total_limit=1,
        seed=SEED,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=token_train,
        eval_dataset=token_val,
        # `tokenizer=` is deprecated on Trainer (it raises a FutureWarning and is slated
        # for removal); `processing_class` is its replacement.
        processing_class=tokenizer,
        data_collator=collator,
        compute_metrics=binary_metrics,
    )

    trainer.train()
    metrics = trainer.evaluate()
    trainer.save_model(str(outdir))
    return metrics, trainer

In [10]:
# Rank the candidates cheaply: each one is fine-tuned on the same shuffled,
# size-capped slices of the train and validation splits.
small_train = ds["train"].shuffle(seed=SEED).select(
    range(min(SUB_TRAIN, len(ds["train"])))
)
small_val = ds["validation"].shuffle(seed=SEED).select(
    range(min(SUB_VAL, len(ds["validation"])))
)

# model id -> validation F1 from the quick run
scores = {}
for model_id in MODEL_CANDIDATES:
    print(f"\n>>> Quick run on subset: {model_id}")

    # "/" in checkpoint ids would otherwise create nested directories.
    out_dir = WORKDIR / "compare_subset" / model_id.replace("/", "_")
    out_dir.mkdir(parents=True, exist_ok=True)

    eval_metrics, _ = train_once(
        model_id=model_id,
        train_set=small_train,
        val_set=small_val,
        outdir=out_dir,
        per_device_bs=SMALL_BS,
        epochs=SMALL_EPOCHS,
    )

    # A run that reports no F1 ranks below every run that does.
    f1_val = eval_metrics.get("eval_f1", float("-inf"))
    scores[model_id] = f1_val
    print(f"{model_id} -> eval_f1: {f1_val:.4f}")

# Highest subset F1 wins (first candidate wins ties).
best_model = max(scores, key=scores.get)
print("\nBest model based on subset F1:", best_model)


>>> Quick run on subset: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3158,0.253628,0.895055,0.896866,0.893253
2,0.1377,0.329352,0.899901,0.885854,0.914401




bert-base-uncased -> eval_f1: 0.8999

>>> Quick run on subset: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.2983,0.241242,0.912863,0.941176,0.886203
2,0.1488,0.265899,0.914659,0.911912,0.917422




roberta-base -> eval_f1: 0.9147

>>> Quick run on subset: microsoft/deberta-base


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.2903,0.207332,0.918753,0.932573,0.905337
2,0.1317,0.254964,0.925163,0.922846,0.927492




microsoft/deberta-base -> eval_f1: 0.9252

>>> Quick run on subset: google/electra-base-discriminator


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.2672,0.211452,0.921964,0.946921,0.898288
2,0.1215,0.283953,0.935208,0.932866,0.937563




google/electra-base-discriminator -> eval_f1: 0.9352

>>> Quick run on subset: distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3354,0.273139,0.884417,0.906878,0.863041
2,0.1579,0.297456,0.894236,0.89022,0.898288




distilbert-base-uncased -> eval_f1: 0.8942

Best model based on subset F1: google/electra-base-discriminator


In [11]:
# Full training with the best model
# The subset winner is retrained from its original checkpoint id on the complete
# training split and validated on the complete validation split.
final_dir = WORKDIR / "winner_full" / best_model.replace("/", "_")
final_dir.mkdir(parents=True, exist_ok=True)

# full_trainer is kept for the test-set evaluation; train_once also saves the model to final_dir.
full_val_metrics, full_trainer = train_once(
    model_id=best_model,
    train_set=ds["train"],
    val_set=ds["validation"],
    outdir=final_dir,
    per_device_bs=FULL_BS,
    epochs=FULL_EPOCHS,
)
print("\nValidation metrics (full training):", full_val_metrics)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.2175,0.209777,0.930411,0.895633,0.968
2,0.1302,0.348567,0.940455,0.930333,0.9508
3,0.1698,0.880689,0.941247,0.937326,0.9452





Validation metrics (full training): {'eval_loss': 0.8806890845298767, 'eval_f1': 0.9412467635929098, 'eval_precision': 0.9373264577548592, 'eval_recall': 0.9452, 'eval_runtime': 47.8492, 'eval_samples_per_second': 104.495, 'eval_steps_per_second': 3.281, 'epoch': 3.0}


In [12]:
# Test evaluation
# Score the fully trained model once on the held-out test split.
tok = build_tokenizer(best_model)
# Reuse the same tokenization helper as training instead of repeating its body inline,
# so the test split can never drift from how train/validation were prepared.
token_test = prepare_for_trainer(ds["test"], tok)
test_metrics = full_trainer.evaluate(eval_dataset=token_test)
print("\nTest metrics:", test_metrics)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]




Test metrics: {'eval_loss': 0.845896303653717, 'eval_f1': 0.9422655783396376, 'eval_precision': 0.937970669837495, 'eval_recall': 0.9466, 'eval_runtime': 95.7328, 'eval_samples_per_second': 104.457, 'eval_steps_per_second': 3.27, 'epoch': 3.0}


In [13]:
# Inspect a few predictions
# Draw random test reviews and compare the saved model's prediction with the true label.
N_SAMPLES = 10
sample_idxs = random.sample(range(len(ds["test"])), N_SAMPLES)
samples = [ds["test"][idx] for idx in sample_idxs]

# Reload the model written to final_dir by the full training run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = AutoModelForSequenceClassification.from_pretrained(str(final_dir)).to(device)

# Encode all sampled reviews as one padded batch and move every tensor to the model's device.
enc = tok(
    [s["review"] for s in samples],
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="pt",
)
enc = {k: v.to(device) for k, v in enc.items()}

with torch.no_grad():
    logits = loaded_model(**enc).logits
# Predicted class id per review.
pred_ids = logits.argmax(dim=-1).cpu().numpy()

print("\n Sample predictions on test set")
for s, pred_id in zip(samples, pred_ids):
    # Only the first 200 characters, flattened to a single line.
    text_snippet = s["review"][:200].replace("\n", " ")
    print(f"\nReview: {text_snippet}...")
    print(f"True: {s['label']} | Pred: {int(pred_id)}")



 Sample predictions on test set

Review: This made-for-TV film is a brilliant one. This is probably the best and favourite role by BAFTA winning John Thaw (Kavanagh Q.C. and Inspector Morse). Tom Oakley (Thaw) widowed man has lived in a vill...
True: 1 | Pred: 1

Review: !!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this film had an ending TO spoil... I only started watching it in the middle, after Matt had gotten into Sarah's body, but the...
True: 0 | Pred: 1

Review: First off, let me start with a quote a friend of mine said while watching this movie: "This entire movie had to have been a dare. You know, like, 'DUDE, I BET YOU COULDN'T MAKE THE WORST MOVIE EVER'"....
True: 0 | Pred: 0

Review: This is a candidate for worst films I've ever seen. It wanted to be as shocking as "Silence of the Lambs," but has neither the style nor the wit of the aforementioned. The make-up is excellent, the ac...
True: 0 | Pred: 0

Review: One of the most popular rentals at