In [None]:
!pip install -q "transformers>=4.41.0" "datasets>=2.18.0" peft accelerate bitsandbytes optuna wandb evaluate scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, gc, time, math
import torch
import numpy as np
import evaluate
import wandb
import optuna
from dataclasses import dataclass
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    GPT2ForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training
)


In [None]:
# Free whatever an earlier run of this kernel left behind. Python garbage is
# collected first so that tensors which are no longer referenced are destroyed;
# only then can the CUDA caching allocator actually give their memory back.
gc.collect()
torch.cuda.empty_cache()

# Authenticate with Weights & Biases; the call's return value is the cell output.
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msarvagyatayal[0m ([33mBanking77[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:




# Run configuration: every knob a reader may want to tune lives in this cell.
MODEL_NAME = "microsoft/DialoGPT-medium"   # base checkpoint used by every cell below
QUANT_TYPE = "nf4"                          # bitsandbytes 4-bit data type: "nf4" or "fp4"
MAX_LEN     = 128                           # tokens per example after truncation/padding
N_TRIALS    = 5                             # number of Optuna trials; lower it for a quick run
PROJECT     = "banking77-qlora-dialogpt"    # Weights & Biases project name

# Fail here, with a readable message, instead of much later while the model is
# being quantised: bitsandbytes only implements these two 4-bit data types.
if QUANT_TYPE not in ("nf4", "fp4"):
    raise ValueError(f"QUANT_TYPE must be 'nf4' or 'fp4', got {QUANT_TYPE!r}")

RUN_TAG     = f"QLoRA-{QUANT_TYPE.upper()}"  # suffix used in W&B run names

print("Torch CUDA:", torch.version.cuda, "| GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


Torch CUDA: 12.6 | GPU: Tesla T4


In [None]:
# 4-bit quantisation settings passed to `from_pretrained` via `quantization_config`.
# `BitsAndBytesConfig` comes from the notebook's import cell; it is not re-imported here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type=QUANT_TYPE,        # "nf4" or "fp4"
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for the matmuls on de-quantised weights
    bnb_4bit_use_double_quant=True,        # also quantise the quantisation constants
)
bnb_config


BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "float16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the data cell
# reads its CSV files from a path below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the three CSV splits on the mounted Drive.
data_path_prefix = "/content/drive/MyDrive/Banking77_Project/data/"
data_files = {
    split: os.path.join(data_path_prefix, f"{split}.csv")
    for split in ("train", "validation", "test")
}

# Column names are probed in this order; the first one present wins.
possible_label_columns = ["intent", "label", "category", "class"]
possible_text_columns = ["text", "question", "sentence", "utterance"]


def _pick_column(available, preferred, excluded=()):
    """Choose a column name from ``available``.

    Args:
        available: column names of the dataset, in dataset order.
        preferred: candidate names, tried in order.
        excluded: names that must not be chosen by the fallback.

    Returns:
        The first ``preferred`` name found in ``available``; otherwise the
        first ``available`` name that is not in ``excluded``; otherwise None.
    """
    for name in preferred:
        if name in available:
            return name
    for name in available:
        if name not in excluded:
            return name
    return None


print("Loading dataset and inspecting structure...")
dataset = load_dataset("csv", data_files=data_files)

# Show the schema and one row of every split.
for split in data_files:
    print(f"\n{split.upper()} split columns: {dataset[split].column_names}")
    if len(dataset[split]) > 0:
        print(f"First row sample: {dataset[split][0]}")

# Label column: a well-known name if present, else the first non-text column.
label_column = _pick_column(
    dataset["train"].column_names,
    possible_label_columns,
    excluded=possible_text_columns,
)
if label_column is None:
    raise ValueError("Could not identify label column in the dataset")

print(f"\nUsing '{label_column}' as the label column")

# Text column: a well-known name if present, else the first column that is not the label.
text_column = _pick_column(
    dataset["train"].column_names,
    possible_text_columns,
    excluded=(label_column,),
)
if text_column is None:
    raise ValueError("Could not identify text column in the dataset")

print(f"Using '{text_column}' as the text column")

# Build the label vocabulary from ALL splits. Using the training split alone
# would make `label2id[...]` raise KeyError during preprocessing as soon as
# validation/test contains a class that never occurs in train.
label_values = set()
for split in data_files:
    label_values.update(dataset[split][label_column])

label_names = sorted(label_values)
label2id = {label: idx for idx, label in enumerate(label_names)}
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(label_names)

print(f"\n#labels: {num_labels}")
print(f"Label examples: {label_names[:5]}...")  # Show first 5 labels
print(f"Dataset structure: {dataset}")

# Column names used by the preprocessing cells.
TEXT_COLUMN = text_column
LABEL_COLUMN = label_column

Loading dataset and inspecting structure...

TRAIN split columns: ['text', 'label']
First row sample: {'text': 'Will you send me a new card in China?', 'label': 9}

VALIDATION split columns: ['text', 'label']
First row sample: {'text': 'I made a purchase with my card and I was charged a fee for using it', 'label': 15}

TEST split columns: ['text', 'label']
First row sample: {'text': 'How do I locate my card?', 'label': 11}

Using 'label' as the label column
Using 'text' as the text column

#labels: 77
Label examples: [0, 1, 2, 3, 4]...
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9002
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1001
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})


In [None]:
# Tokenizer: reuse EOS as the padding token when the checkpoint does not define one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantised backbone with a classification head of `num_labels` outputs.
base_model = GPT2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # The classification head pools the last non-padding token, which it finds
    # through `config.pad_token_id`; take the id from the tokenizer so the two
    # always agree, also for tokenizers whose pad token is not EOS.
    pad_token_id=tokenizer.pad_token_id,
)

# The key/value cache is only useful for generation and does not combine with
# gradient checkpointing, so switch it off for training.
base_model.config.use_cache = False

# Prepare model for k-bit training
base_model = prepare_model_for_kbit_training(base_model)

# LoRA adapter settings (also reused by the later training cells).
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],  # GPT-2/DialoGPT modules
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

print("Model & LoRA ready.")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialoGPT-medium and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,404,224 || all params: 359,306,240 || trainable%: 1.2258
Model & LoRA ready.


In [None]:
# Show, for every split, its columns and how the first row's label maps to an id.
print("Original dataset structure:")
for split in ("train", "validation", "test"):
    split_data = dataset[split]
    print(f"\n{split.upper()}:")
    print(f"Columns: {split_data.column_names}")
    if len(split_data) == 0:
        continue  # nothing to sample from an empty split
    sample = split_data[0]
    raw_label = sample[LABEL_COLUMN]
    print(f"Sample: {sample}")
    print(f"Text: '{sample[TEXT_COLUMN]}'")
    print(f"Label: '{raw_label}' -> ID: {label2id[raw_label]}")

Original dataset structure:

TRAIN:
Columns: ['text', 'label']
Sample: {'text': 'Will you send me a new card in China?', 'label': 9}
Text: 'Will you send me a new card in China?'
Label: '9' -> ID: 9

VALIDATION:
Columns: ['text', 'label']
Sample: {'text': 'I made a purchase with my card and I was charged a fee for using it', 'label': 15}
Text: 'I made a purchase with my card and I was charged a fee for using it'
Label: '15' -> ID: 15

TEST:
Columns: ['text', 'label']
Sample: {'text': 'How do I locate my card?', 'label': 11}
Text: 'How do I locate my card?'
Label: '11' -> ID: 11


In [None]:
def preprocess(example):
    """Turn one raw dataset row into model features.

    The text in ``example[TEXT_COLUMN]`` is tokenised, truncated and padded to
    exactly ``MAX_LEN`` tokens; the value in ``example[LABEL_COLUMN]`` is
    translated to its integer id through ``label2id`` and stored under "label".

    Returns:
        The tokenizer output with the extra "label" entry.
    """
    features = tokenizer(
        example[TEXT_COLUMN],
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
    )
    features["label"] = label2id[example[LABEL_COLUMN]]
    return features

# Tokenise every split, one row per call.
encoded = dataset.map(preprocess, batched=False)

# Drop every original column except the ones the model consumes.
columns_to_remove = [
    col
    for col in dataset["train"].column_names
    if col not in ("input_ids", "attention_mask", "label")
]
encoded = encoded.remove_columns(columns_to_remove)

# Metric implementations used by `compute_metrics`.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a ``(logits, labels)`` pair.

    The predicted class is the arg-max over the last axis of the logits.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    logits, references = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy_score = accuracy.compute(predictions=predictions, references=references)["accuracy"]
    weighted_f1 = f1.compute(predictions=predictions, references=references, average="weighted")["f1"]
    return {"accuracy": accuracy_score, "f1": weighted_f1}

# Batches are assembled (and padded) by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Token budget of one epoch, used for the throughput estimate: every example
# occupies MAX_LEN positions.
n_train = len(encoded["train"])
approx_train_tokens = n_train * MAX_LEN
print(f"Train examples: {n_train} | ~tokens/epoch: {approx_train_tokens:,}")

# Sanity check: the label column must have survived preprocessing.
print(f"\nDataset columns after preprocessing: {encoded['train'].column_names}")
if "label" not in encoded["train"].column_names:
    print("ERROR: Label column not found!")
else:
    print(f"Sample labels: {encoded['train']['label'][:5]}")

Map:   0%|          | 0/9002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

Train examples: 9002 | ~tokens/epoch: 1,152,256

Dataset columns after preprocessing: ['label', 'input_ids', 'attention_mask']
Sample labels: [9, 71, 18, 21, 45]


In [None]:
# Report the columns of each encoded split and confirm that labels are present.
print("Dataset structure after preprocessing:")
for title, split_key in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{title} columns: {encoded[split_key].column_names}")

# Check if label column exists
if "label" not in encoded["train"].column_names:
    print("ERROR: Label column not found in processed dataset!")
    print("Available columns:", encoded["train"].column_names)
else:
    print(f"Label column found! Sample labels: {encoded['train']['label'][:5]}")

Dataset structure after preprocessing:
Train columns: ['input_ids', 'attention_mask']
Validation columns: ['input_ids', 'attention_mask']
Test columns: ['input_ids', 'attention_mask']
ERROR: Label column not found in processed dataset!
Available columns: ['input_ids', 'attention_mask']


In [None]:
from transformers.trainer_callback import TrainerCallback

class SpeedMemCallback(TrainerCallback):
    """Trainer callback that logs per-epoch speed and peak GPU memory to W&B.

    At the end of every epoch it logs the epoch duration, an estimated
    tokens-per-second figure (``total_tokens_per_epoch`` divided by the epoch
    duration) and the peak CUDA memory allocated during that epoch. At the end
    of training it logs the total wall-clock training time.

    Nothing is logged while no W&B run is active, so the callback never breaks
    a training run that does not report to W&B.

    Args:
        total_tokens_per_epoch: number of tokens processed in one epoch; used
            only for the throughput estimate.
    """

    def __init__(self, total_tokens_per_epoch):
        self.total_tokens_per_epoch = total_tokens_per_epoch
        # Both timestamps are set by the corresponding *_begin hooks.
        self.epoch_start = None
        self.wall_start = None

    @staticmethod
    def _log(payload):
        """Send ``payload`` to W&B if, and only if, a run is currently active."""
        if wandb.run is not None:
            wandb.log(payload)

    def on_train_begin(self, args, state, control, **kwargs):
        """Start the wall clock and reset the CUDA peak-memory counter."""
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
        self.wall_start = time.time()

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Start the epoch clock; reset peak memory so it is measured per epoch."""
        self.epoch_start = time.time()
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

    def on_epoch_end(self, args, state, control, **kwargs):
        """Log duration, estimated throughput and peak GPU memory of the epoch."""
        if self.epoch_start is None:
            # No matching on_epoch_begin was seen; there is nothing to measure.
            return

        epoch_time = time.time() - self.epoch_start
        # Guard against a zero duration to avoid division by zero.
        tokens_sec = self.total_tokens_per_epoch / max(epoch_time, 1e-9)
        mem_gib = 0.0
        if torch.cuda.is_available():
            mem_gib = torch.cuda.max_memory_allocated() / (1024**3)

        self._log({
            "epoch_time_sec": epoch_time,
            "tokens_per_sec_est": tokens_sec,
            "gpu_max_mem_allocated_GiB": mem_gib,
            "epoch": state.epoch
        })

    def on_train_end(self, args, state, control, **kwargs):
        """Log the total wall-clock training time."""
        if self.wall_start is None:
            return
        total_time = time.time() - self.wall_start
        self._log({"total_train_time_sec": total_time})

# Shared instance handed to the final Trainer.
speed_mem_cb = SpeedMemCallback(total_tokens_per_epoch=approx_train_tokens)


In [None]:
# Smoke test: push one hand-written example through the model and report the loss.
print("Testing model with manual input...")

# One sentence, paired with the first label id of the mapping.
test_text = "This is a test sentence for banking."
test_label = list(label2id.values())[0]

# Encode it the same way as the training data and move it to the model's device.
test_inputs = tokenizer(
    test_text,
    max_length=MAX_LEN,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
).to(model.device)

test_labels = torch.tensor([test_label]).to(model.device)

print(f"Input shapes: {[tensor.shape for tensor in test_inputs.values()]}")
print(f"Label: {test_labels.shape}")

# Forward pass in training mode, with labels so that a loss is produced.
model.train()
try:
    outputs = model(**test_inputs, labels=test_labels)
    print("✓ Forward pass successful!")
    print(f"Output keys: {list(outputs.keys())}")

    if hasattr(outputs, 'loss'):
        print(f"Loss: {outputs.loss.item()}")
    elif 'loss' in outputs:
        print(f"Loss: {outputs['loss'].item()}")
    else:
        print("No loss found in outputs")

except Exception as err:
    print(f"✗ Forward pass failed: {err}")

    # Retry without labels to see whether the labels caused the failure.
    try:
        outputs = model(**test_inputs)
        print(f"Output without labels: {list(outputs.keys())}")
    except Exception as err_no_labels:
        print(f"Also failed without labels: {err_no_labels}")

Testing model with manual input...
Input shapes: [torch.Size([1, 128]), torch.Size([1, 128])]
Label: torch.Size([1])
✓ Forward pass successful!
Output keys: ['loss', 'logits']
Loss: 4.155369758605957


In [None]:
def objective(trial):
    """Optuna objective: fine-tune a fresh QLoRA model and score it on validation.

    Every trial builds its own quantised model and LoRA adapter. Reusing one
    shared model would let each trial continue from the weights left behind by
    the previous trial, so the scores would not be comparable. Each trial also
    gets its own W&B run, and GPU memory is released before the next one.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Weighted F1 on the validation split, or 0.0 when training/evaluation
        raised (the error text is stored in the trial's "error" user attribute).
    """
    # Suggest hyperparameters
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    num_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    warmup_steps = trial.suggest_int("warmup_steps", 100, 500, step=50)

    run_name = f"trial_{trial.number}_{RUN_TAG}"

    training_args = TrainingArguments(
        output_dir=f"./results-trial-{trial.number}",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        logging_steps=50,
        eval_steps=100,
        eval_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        report_to="wandb",
        run_name=run_name,
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        gradient_accumulation_steps=1,
        fp16=True,
        optim="paged_adamw_8bit",
    )

    # One W&B run per trial; the Trainer's W&B integration logs into the
    # currently active run, so without this all trials share a single run.
    wandb.init(project=PROJECT, name=run_name, config=dict(trial.params), reinit=True)

    trial_model = None
    trainer = None
    try:
        # Fresh quantised backbone + untrained LoRA adapter for this trial.
        trial_model = GPT2ForSequenceClassification.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            pad_token_id=tokenizer.eos_token_id,
        )
        trial_model = prepare_model_for_kbit_training(trial_model)
        trial_model = get_peft_model(trial_model, lora_config)

        trainer = Trainer(
            model=trial_model,
            args=training_args,
            train_dataset=encoded["train"],
            eval_dataset=encoded["validation"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        # Train and evaluate
        trainer.train()
        eval_result = trainer.evaluate(encoded["validation"])

        # Keep both metrics on the trial so they can be reported later.
        trial.set_user_attr("accuracy", eval_result["eval_accuracy"])
        trial.set_user_attr("f1", eval_result["eval_f1"])

        return eval_result["eval_f1"]

    except Exception as e:
        # Best effort: a failed trial must not abort the whole study.
        print(f"Trial {trial.number} failed: {e}")
        trial.set_user_attr("error", repr(e))
        return 0.0

    finally:
        wandb.finish()
        # Drop the references first, then let the allocator release the memory.
        trainer = None
        trial_model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# Run the hyperparameter search; the objective returns validation weighted F1.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=N_TRIALS)

print("Best params:", study.best_trial.params)
# `best_trial.value` is what `objective` returns, i.e. F1 (not accuracy).
print("Best val F1:", round(study.best_trial.value, 4))

# Accuracy is stored by `objective` as a user attribute of successful trials.
best_val_accuracy = study.best_trial.user_attrs.get("accuracy")
if best_val_accuracy is not None:
    print("Best val accuracy:", round(best_val_accuracy, 4))


[I 2025-08-25 10:44:17,841] A new study created in memory with name: no-name-bca8878a-a8b1-4db6-a1e5-4f89602d12e9
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1
100,4.4066,4.37134,0.014985,0.008283
200,4.2882,4.244632,0.037962,0.022484
300,4.0692,3.859094,0.141858,0.101327
400,3.2863,2.842949,0.340659,0.271471
500,2.5403,1.952445,0.527473,0.498846
600,1.9591,1.451822,0.619381,0.58908
700,1.5086,1.218968,0.683317,0.664305
800,1.2941,1.011753,0.727273,0.715123
900,1.0437,0.901737,0.758242,0.744955
1000,0.9055,0.812386,0.757243,0.743973


In [None]:
# Hyperparameters of the best Optuna trial drive the final training run.
best_params = study.best_trial.params

# Fresh quantised backbone wrapped with a new LoRA adapter.
final_model = get_peft_model(
    prepare_model_for_kbit_training(
        GPT2ForSequenceClassification.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            pad_token_id=tokenizer.eos_token_id,
        )
    ),
    lora_config,
)

# Dedicated W&B run for the final training; its config records the best
# hyperparameters together with the quantisation setup.
run = wandb.init(
    project=PROJECT,
    name=f"final_best_{RUN_TAG}",
    reinit=True,
    config=dict(
        best_params,
        model=MODEL_NAME,
        method="QLoRA",
        quantization_4bit=True,
        bnb_4bit_quant_type=QUANT_TYPE,
        bnb_double_quant=True,
        bnb_compute_dtype="float16",
        phase="final_training",
        max_length=MAX_LEN,
    ),
    settings=wandb.Settings(start_method="thread"),
)

# Final training arguments. They mirror the search setup in `objective` so the
# tuned hyperparameters are applied under the conditions they were tuned for.
final_args = TrainingArguments(
    output_dir="./final_results",
    eval_strategy="epoch",            # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    weight_decay=best_params["weight_decay"],
    warmup_steps=best_params["warmup_steps"],
    load_best_model_at_end=True,
    metric_for_best_model="f1",       # select the checkpoint by the metric Optuna maximised
    greater_is_better=True,
    logging_dir="./logs_final",
    logging_steps=50,
    gradient_checkpointing=True,
    fp16=True,
    optim="paged_adamw_8bit",         # same optimizer as during the search
    report_to=["wandb"],
)

final_trainer = Trainer(
    model=final_model,
    args=final_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[speed_mem_cb],         # logs per-epoch speed and peak GPU memory
)

# Start the peak-memory measurement from a clean slate.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

final_trainer.train()


In [None]:
# Test evaluation + logging
test_results = final_trainer.evaluate(encoded["test"])
print("Test results:", test_results)

# The study's best value is the validation F1 returned by `objective`; the
# matching accuracy is stored on the trial as a user attribute.
best_trial = study.best_trial
wandb.log({
    "test_accuracy": test_results.get("eval_accuracy", None),
    "test_f1": test_results.get("eval_f1", None),
    "test_loss": test_results.get("eval_loss", None),
    "best_val_f1": best_trial.value,
    "best_val_accuracy": best_trial.user_attrs.get("accuracy"),
})
wandb.finish()


In [None]:
# Save adapter + tokenizer
# The directory name carries the actual quantisation type, so the suffix is
# right for every supported value (it is unchanged for "nf4").
save_dir = f"./final_qlora_dialogpt_{QUANT_TYPE}"
final_trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("Saved to:", save_dir)


In [None]:
# Optional comparison against an earlier run. The `prev` numbers are
# placeholders that are meant to be edited by hand.
prev = dict(
    accuracy=0.93,
    f1=0.93,
    tokens_per_sec_est=90000,
    gpu_max_mem_allocated_GiB=7.2,
)

# Current run: quality metrics come from the test evaluation. Speed and memory
# are not read back from W&B here, so they stay NaN until filled in manually
# from the W&B run summary.
curr = dict(
    accuracy=test_results.get("eval_accuracy", float("nan")),
    f1=test_results.get("eval_f1", float("nan")),
    tokens_per_sec_est=float("nan"),
    gpu_max_mem_allocated_GiB=float("nan"),
)

def pct(a, b):
    """Return the change of ``a`` relative to ``b`` in percent.

    The result is NaN when either value is None or NaN, or when ``b`` is zero.
    """
    either_missing = a is None or b is None
    if either_missing or math.isnan(a) or math.isnan(b) or b == 0:
        return float("nan")
    difference = a - b
    return difference / b * 100.0

# Print the relative change of every tracked metric (current vs. previous).
for heading, metric in (
    ("Δ accuracy (%):", "accuracy"),
    ("Δ f1 (%):", "f1"),
    ("Δ tokens/sec (%):", "tokens_per_sec_est"),
    ("Δ max GPU mem (%):", "gpu_max_mem_allocated_GiB"),
):
    print(heading, pct(curr[metric], prev[metric]))
