Imports & load CSVs

In [2]:
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import os

# Project root. Defaults to the original checkout location, but can be
# overridden with the IMDB_PROJECT_ROOT environment variable so the notebook
# runs on other machines without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get(
        "IMDB_PROJECT_ROOT",
        "/Users/sayalisawant/Projects/imdb-sentiment-analysis",
    )
)
# The rest of the notebook uses paths relative to the project root
# (data/, models/, experiments/), so make it the working directory.
os.chdir(PROJECT_ROOT)

print("CWD:", Path.cwd())
print("Has data/train.csv? ", (PROJECT_ROOT / "data/train.csv").exists())

DATA_DIR = Path("data")
train_df = pd.read_csv(DATA_DIR / "train.csv")
valid_df = pd.read_csv(DATA_DIR / "valid.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

# The tokenization cell reads the "review" column and renames "label", so
# fail here with a clear message if a CSV lacks either of them.
REQUIRED_COLUMNS = {"review", "label"}
frames = {"train": train_df, "valid": valid_df, "test": test_df}
for split_name, frame in frames.items():
    missing = REQUIRED_COLUMNS - set(frame.columns)
    assert not missing, f"{split_name}.csv is missing columns: {sorted(missing)}"

# One Dataset per split, keyed by split name.
ds = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in frames.items()}
)
len(ds["train"]), len(ds["valid"]), len(ds["test"])


  from .autonotebook import tqdm as notebook_tqdm


CWD: /Users/sayalisawant/Projects/imdb-sentiment-analysis
Has data/train.csv?  True


(20000, 5000, 25000)

Tokenize

In [3]:
# Checkpoint name used to load the tokenizer here (and the model later on).
model_ckpt = "distilbert-base-uncased"
tok = AutoTokenizer.from_pretrained(model_ckpt)


def preprocess(batch):
    """Tokenize the ``review`` entries of one batch.

    Truncation is enabled with a maximum length of 256; padding is not
    requested in this call.
    """
    texts = batch["review"]
    return tok(texts, truncation=True, max_length=256)


# Tokenize every split in batches, dropping the raw "review" column, then
# rename the target column from "label" to "labels".
encoded = (
    ds.map(preprocess, batched=True, remove_columns=["review"])
    .rename_column("label", "labels")
)
# Expose only these three columns, formatted as torch tensors.
encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


Map: 100%|██████████| 20000/20000 [00:15<00:00, 1276.05 examples/s]
Map: 100%|██████████| 5000/5000 [00:03<00:00, 1277.16 examples/s]
Map: 100%|██████████| 25000/25000 [00:16<00:00, 1522.91 examples/s]


Metrics

In [4]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` evaluation pair.

    Args:
        eval_pred: Anything that unpacks into ``(logits, labels)``.
            ``logits`` is an array-like of per-class scores, or a tuple
            whose first element is that array.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` as returned by
        ``accuracy_score`` and ``f1_score`` for the arg-max predictions.
    """
    import numpy as np

    logits, labels = eval_pred
    # NOTE(review): when the predictions arrive as a tuple of several arrays,
    # this assumes the class scores are the first element - confirm for any
    # model other than the one used in this notebook.
    if isinstance(logits, tuple):
        logits = logits[0]
    # np.asarray lets plain nested lists work as well as ndarrays.
    preds = np.argmax(np.asarray(logits), axis=-1)
    return {"accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds)}


In [5]:
import transformers, inspect
from transformers import TrainingArguments
# Environment diagnostics: the installed transformers version, the module
# that provides TrainingArguments, and its constructor signature.
print(f"Transformers: {transformers.__version__}")
print(f"TrainingArguments from: {TrainingArguments.__module__}")
print(f"Signature: {inspect.signature(TrainingArguments.__init__)}")


Transformers: 4.56.1
TrainingArguments from: transformers.training_args


In [1]:
import torch, numpy as np
# Show the torch and numpy versions active in this kernel.
print(f"torch: {torch.__version__} numpy: {np.__version__}")

torch: 2.2.2 numpy: 1.26.4


Model & Trainer

In [8]:
from pathlib import Path
from transformers import (AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)

out_dir = Path("models/distilbert-imdb-full")

# Set to True for a quick smoke run on 1,000 training / 200 validation
# examples. The default (False) trains and evaluates on the full splits.
USE_SMALL_SUBSET = False

# 1) Build the model and collator once; this same model instance is handed to
#    the Trainer below instead of loading the checkpoint a second time.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2)
collator = DataCollatorWithPadding(tokenizer=tok)

# 2) Small, seeded subsets, used only when USE_SMALL_SUBSET is True
small_train = encoded["train"].shuffle(seed=42).select(range(1000))
small_valid = encoded["valid"].shuffle(seed=42).select(range(200))
train_split = small_train if USE_SMALL_SUBSET else encoded["train"]
eval_split = small_valid if USE_SMALL_SUBSET else encoded["valid"]

# 3) Evaluate and checkpoint every epoch; the checkpoint with the best
#    validation F1 is reloaded when training finishes.
args = TrainingArguments(
    output_dir=str(out_dir),
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    # `tokenizer=` is deprecated on Trainer in the transformers release this
    # notebook runs on (4.56.1); `processing_class=` is its replacement.
    processing_class=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
trainer.train()  # writes checkpoints to models/distilbert-imdb-full/checkpoint-XXXX


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2668,0.285986,0.9016,0.901284
2,0.1822,0.286091,0.9082,0.908255


TrainOutput(global_step=2500, training_loss=0.2387313259124756, metrics={'train_runtime': 26370.2561, 'train_samples_per_second': 1.517, 'train_steps_per_second': 0.095, 'total_flos': 2649347973120000.0, 'train_loss': 0.2387313259124756, 'epoch': 2.0})

Test set evaluation

In [9]:
# Evaluate the trained model on the held-out test split.
test_metrics = trainer.evaluate(encoded["test"])

# Persist the model and the tokenizer side by side in out_dir.
trainer.save_model(str(out_dir))
tok.save_pretrained(str(out_dir))
print("Saved to:", out_dir)

# Only a cell's last expression is displayed, so the metrics go last;
# as a bare expression in the middle of the cell they were never shown.
test_metrics

Saved to: models/distilbert-imdb-full


In [11]:
from pathlib import Path
import json

# Output locations. out_dir has to point at the same directory the training
# run used as its output_dir.
exper_dir = Path("experiments")
out_dir = Path("models/distilbert-imdb-full")
exper_dir.mkdir(exist_ok=True)
metrics_path = exper_dir / "distilbert_metrics.json"

# Write the final model and the tokenizer into the same directory.
model_dir = str(out_dir)
trainer.save_model(model_dir)
tok.save_pretrained(model_dir)

# (Optional) Keep the test metrics as JSON for README/app display.
with metrics_path.open("w") as f:
    f.write(json.dumps(test_metrics, indent=2))

print("✅ Artifacts saved to:", out_dir)
print("✅ Metrics saved to:", metrics_path)



✅ Artifacts saved to: models/distilbert-imdb-full
✅ Metrics saved to: experiments/distilbert_metrics.json
