<a href="https://colab.research.google.com/github/Shobini12/Checkers/blob/main/hpml_hw2_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets torch wandb matplotlib pandas seaborn
!pip install -q accelerate

In [None]:
import os
import torch
import transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
import wandb
from tqdm.auto import tqdm
import random

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    The same integer is handed to Python's ``random`` module, NumPy's legacy
    global RNG and PyTorch's CPU generator; when CUDA is available, every
    visible GPU generator is seeded as well.

    Args:
        seed: Integer seed shared by all generators (default 42).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [None]:
# Report the library versions and the accelerator this run executes on, so the
# results below can be tied to a concrete software/hardware setup.
banner = "=" * 60

print(banner)
print("ENVIRONMENT INFORMATION")
print(banner)
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("Running on CPU")
else:
    # Only the first GPU (index 0) is reported.
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
print(f"Transformers Version: {transformers.__version__}")
print(banner)

ENVIRONMENT INFORMATION
PyTorch Version: 2.8.0+cu126
CUDA Available: True
CUDA Version: 12.6
GPU Device: Tesla T4
Transformers Version: 4.57.1


In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
default_device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the run's hyperparameters; the same dict is also
# uploaded to Weights & Biases further down.
config = {
    # Checkpoint used for both the tokenizer and the model.
    "model_name": "distilbert-base-uncased",
    # Maximum sequence length passed to the tokenizer (longer inputs are truncated).
    "max_len": 256,
    # Examples per batch for both the train and the test DataLoader.
    "batch_size": 32,
    # Learning rate handed to the optimizer.
    "lr": 1e-4,
    # Descriptive label only: the optimizer itself is constructed directly as AdamW.
    "optimizer": "AdamW",
    # Worker processes per DataLoader.
    "num_workers": 2,
    "epochs": 5,  # Can change to 10 if desired
    # NOTE(review): not read by any cell visible in this notebook.
    "compile_mode": False,
    "device": default_device,
}

In [None]:
# Echo every configuration entry (in insertion order) so the settings used for
# this run are recorded alongside its outputs.
config_lines = ["Configuration:"]
config_lines.extend(f"  {key}: {value}" for key, value in config.items())
print("\n".join(config_lines))

Configuration:
  model_name: distilbert-base-uncased
  max_len: 256
  batch_size: 32
  lr: 0.0001
  optimizer: AdamW
  num_workers: 2
  epochs: 5
  compile_mode: False
  device: cuda


In [None]:
# Authenticate with Weights & Biases before a run is created.
wandb.login()
# Start the run that every later wandb.log(...) call in this notebook reports to.
wandb.init(project="hpml-hw2-llm", name="baseline-run")
# Attach the hyperparameter dict to the run so the settings are stored with its metrics.
wandb.config.update(config)

[34m[1mwandb[0m: Currently logged in as: [33miyer-shobini[0m ([33miyer-shobini-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")

# IMDB also ships an "unsupervised" split (50,000 reviews) that nothing in this
# notebook trains or evaluates on. Drop it here so the later `dataset.map(...)`
# tokenization pass does not spend time on it. DatasetDict is a dict subclass,
# so pop() with a default is safe even if the split is absent.
dataset.pop("unsupervised", None)

print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

Loading IMDB dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train samples: 25000
Test samples: 25000


In [14]:
# Tokenizer belonging to the checkpoint named in the config.
tokenizer = DistilBertTokenizer.from_pretrained(config["model_name"])


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of dataset rows.

    Sequences are truncated to ``config["max_len"]``. No padding argument is
    passed here; batches are padded later by ``collate_fn``.

    Args:
        examples: Mapping with a "text" entry holding the raw reviews.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=config["max_len"], truncation=True)


print("Tokenizing dataset...")
# Process rows in batches and drop the raw "text" column once it has been encoded.
map_options = {"batched": True, "remove_columns": ["text"]}
tokenized_datasets = dataset.map(tokenize_function, **map_options)

# Request torch-formatted items; collate_fn's pad_sequence calls expect tensors.
tokenized_datasets.set_format("torch")

print("done")

Tokenizing dataset...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

done


In [15]:
def collate_fn(batch):
    """Combine individual tokenized examples into one dynamically padded batch.

    Args:
        batch: Sequence of examples, each indexable by 'input_ids',
            'attention_mask' and 'label'. The first two must be tensors,
            since they are handed to ``pad_sequence``.

    Returns:
        Dict with 'input_ids' and 'attention_mask' padded (batch-first) to the
        longest sequence in this batch, and 'labels' as a tensor built from the
        per-example labels.
    """
    pad = torch.nn.utils.rnn.pad_sequence

    # Gather the three fields in a single pass over the batch.
    token_id_seqs, mask_seqs, label_values = [], [], []
    for example in batch:
        token_id_seqs.append(example['input_ids'])
        mask_seqs.append(example['attention_mask'])
        label_values.append(example['label'])

    return {
        # Token ids are filled with the tokenizer's pad id ...
        'input_ids': pad(
            token_id_seqs, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        # ... and the mask with 0 at the padded positions.
        'attention_mask': pad(mask_seqs, batch_first=True, padding_value=0),
        'labels': torch.tensor(label_values),
    }

In [16]:
# Settings common to both loaders: batch size, the padding collate function and
# the number of worker processes all come from the config.
shared_loader_options = dict(
    batch_size=config["batch_size"],
    collate_fn=collate_fn,
    num_workers=config["num_workers"],
)

# Training data is shuffled; the test data is iterated in its stored order.
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, **shared_loader_options
)
test_dataloader = DataLoader(
    tokenized_datasets["test"], shuffle=False, **shared_loader_options
)

print(f"Number of training batches: {len(train_dataloader)}")
print(f"Number of test batches: {len(test_dataloader)}")

Number of training batches: 782
Number of test batches: 782


In [17]:
# Turn the device string chosen in the config ("cuda" or "cpu") into a torch.device.
device = torch.device(config["device"])
# Load the pretrained checkpoint with a 2-class sequence-classification head.
# Per the loader's own warning in this cell's output, the classifier and
# pre_classifier weights are newly initialized, i.e. they are learned during
# the fine-tuning below.
model = DistilBertForSequenceClassification.from_pretrained(
    config["model_name"],
    num_labels=2
)
# Move the model to the target device. Being the last expression of the cell,
# the returned module is also what the notebook displays as the cell output.
model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [18]:
# AdamW over all model parameters at the configured learning rate.
optimizer = AdamW(model.parameters(), lr=config["lr"])

# train_epoch steps the scheduler once per batch, so the schedule length is
# (batches per epoch) x (number of epochs).
steps_per_epoch = len(train_dataloader)
num_training_steps = steps_per_epoch * config["epochs"]

# Linear schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [19]:
#C1 Implementation
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Train ``model`` for one full pass over ``dataloader``.

    For every batch: move tensors to ``device``, run a forward pass with
    labels, back-propagate, then step the optimizer and the scheduler.

    Args:
        model: Callable module accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keywords and returning an object with ``.loss`` and
            ``.logits``.
        dataloader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer updating the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the mean training loss per example and
        the fraction of examples predicted correctly during this pass.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no examples.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    progress_bar = tqdm(dataloader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_size = labels.size(0)
        # Read the scalar loss once and reuse it for the running sum and the
        # progress bar, instead of calling .item() twice per step.
        batch_loss = loss.item()

        # outputs.loss is assumed to be the mean over the batch. Weight it by
        # the batch size so a short final batch does not count as much as a
        # full one in the epoch average.
        total_loss += batch_loss * batch_size
        predictions = torch.argmax(logits, dim=-1)
        correct += (predictions == labels).sum().item()
        total += batch_size

        progress_bar.set_postfix({'loss': batch_loss})

    avg_loss = total_loss / total
    accuracy = correct / total

    return avg_loss, accuracy

In [20]:
def evaluate(model, dataloader, device):
    """Compute classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and run under ``torch.no_grad()``;
    labels are used only to score predictions, never passed to the model.

    Args:
        model: Callable module accepting ``input_ids`` and ``attention_mask``
            keywords and returning an object with ``.logits``.
        dataloader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Fraction of examples whose arg-max prediction equals the label.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits

            # Predicted class = index of the largest logit along the last axis.
            hits = logits.argmax(dim=-1).eq(targets)
            n_correct += hits.sum().item()
            n_seen += targets.shape[0]

    return n_correct / n_seen

In [None]:
rule = "=" * 60
print("\n" + rule)
print("C1: Fine-tuning DistilBERT on IMDB")
print(rule + "\n")

# One dict per finished epoch; turned into a DataFrame by a later cell.
metrics_data = []

for epoch in range(config["epochs"]):
    print(f"\nEpoch {epoch + 1}/{config['epochs']}")
    print("-" * 40)

    # One optimisation pass over the training set, then a full pass over the test set.
    train_loss, train_acc = train_epoch(
        model, train_dataloader, optimizer, scheduler, device
    )
    test_acc = evaluate(model, test_dataloader, device)

    # Epochs are reported 1-based everywhere.
    epoch_number = epoch + 1

    # Keep the numbers locally for the summary table and plots ...
    epoch_record = {
        'Epoch': epoch_number,
        'Train Loss': train_loss,
        'Train Acc': train_acc,
        'Test Acc': test_acc
    }
    metrics_data.append(epoch_record)

    # ... and send the same numbers to W&B under its own key names.
    wandb.log({
        'epoch': epoch_number,
        'train/loss': train_loss,
        'train/accuracy': train_acc,
        'test/accuracy': test_acc
    })

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Train Acc: {train_acc:.4f}")
    print(f"Test Acc: {test_acc:.4f}")



C1: Fine-tuning DistilBERT on IMDB


Epoch 1/5
----------------------------------------


Training:   0%|          | 0/782 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/782 [00:00<?, ?it/s]

Train Loss: 0.3027
Train Acc: 0.8720
Test Acc: 0.9032

Epoch 2/5
----------------------------------------


Training:   0%|          | 0/782 [00:00<?, ?it/s]

In [None]:
# One row per completed epoch, built from the records collected by the training loop.
df_metrics = pd.DataFrame(metrics_data)

table_rule = "=" * 60
print("\n" + table_rule)
print("Table T1: Per-Epoch Metrics")
print(table_rule)
print(df_metrics.to_string(index=False))

# Upload the same table to W&B so it is stored with the run.
metrics_table = wandb.Table(dataframe=df_metrics)
wandb.log({"metrics_table": metrics_table})

In [None]:
# Two side-by-side panels: training loss (left) and train/test accuracy (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

epoch_axis = df_metrics['Epoch']
line_style = dict(linewidth=2, markersize=8)

# Left panel: training loss per epoch
ax1.plot(epoch_axis, df_metrics['Train Loss'],
         marker='o', color='#e74c3c', **line_style)
ax1.set_ylabel('Training Loss', fontsize=12)
ax1.set_title('Training Loss vs Epoch', fontsize=14, fontweight='bold')

# Right panel: one line per accuracy column; the column name doubles as the legend label
accuracy_series = (
    ('Train Acc', 's', '#3498db'),
    ('Test Acc', '^', '#2ecc71'),
)
for column, marker, color in accuracy_series:
    ax2.plot(epoch_axis, df_metrics[column],
             marker=marker, color=color, label=column, **line_style)
ax2.set_ylabel('Accuracy', fontsize=12)
ax2.set_title('Accuracy vs Epoch', fontsize=14, fontweight='bold')
ax2.legend(fontsize=11)

# Formatting shared by both panels: x label, light grid, one tick per epoch
for ax in (ax1, ax2):
    ax.set_xlabel('Epoch', fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(epoch_axis)

plt.tight_layout()
# Save a copy to disk and attach the figure to the W&B run before displaying it.
plt.savefig('c1_training_metrics.png', dpi=150, bbox_inches='tight')
wandb.log({"C1_training_plot": wandb.Image(fig)})
plt.show()

print("\n✓ C1 Complete!")
print(f"Final Test Accuracy: {df_metrics['Test Acc'].iloc[-1]:.4f}")

In [None]:
# Close the W&B run started earlier in the notebook; run this once all logging is done.
wandb.finish()