In [None]:
!pip install -q datasets transformers accelerate peft bitsandbytes evaluate
!huggingface-cli login --token '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'

Step 2: Imports

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate


Step 3: Load dataset

In [None]:
# Load the Banking77 dataset by name.
dataset = load_dataset("banking77")

# Show the split layout first, then the first training record as a sanity check.
print(dataset)
first_train_example = dataset["train"][0]
print("Example sample:", first_train_example)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})
Example sample: {'text': 'I am still waiting on my card?', 'label': 11}


Step 4: Tokenizer & preprocessing

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B"   # requires HF access

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    padding_side="right",
)

# Fall back to the end-of-sequence token for padding when the tokenizer
# defines no pad token of its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def preprocess(batch):
    """Tokenize the ``text`` entries of a batch.

    Each sequence is truncated and padded to exactly 128 tokens, so every
    encoded row has the same length.

    Args:
        batch: Mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        The tokenizer output for those strings.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)


encoded_dataset = dataset.map(preprocess, batched=True)

# The number of classes comes from the label feature of the training split.
label_feature = dataset["train"].features["label"]
num_labels = label_feature.num_classes
print("Number of labels:", num_labels)


Map:   0%|          | 0/10003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

Number of labels: 77


Step 5: BitsAndBytes quantization config

In [None]:
# Quantization settings for loading the base model in 4-bit:
# NF4 weights with double quantization, and bfloat16 as the compute dtype
# (bf16 is natively supported on A100).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)


Step 6: Load base model (4-bit)

In [None]:
# Options for the 4-bit base model with a classification head sized to the
# dataset's label count; placement across devices is left to `device_map`.
base_model_kwargs = dict(
    num_labels=num_labels,
    quantization_config=bnb_config,
    device_map="auto",
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **base_model_kwargs)

# The model config needs the pad token id that the tokenizer pads with.
model.config.pad_token_id = tokenizer.pad_token_id


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step 7: Pre-fine-tuning evaluation

In [None]:
accuracy = evaluate.load("accuracy")


def eval_model(model, split="test", n_samples=10, batch_size=16):
    """Measure classification accuracy on the first ``n_samples`` rows of a split.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        split: Key of ``encoded_dataset`` to evaluate on.
        n_samples: Number of leading rows to score; capped at the split size.
        batch_size: Rows per forward pass, so a large ``n_samples`` does not
            have to fit on the device as a single batch.

    Returns:
        The dict produced by the ``accuracy`` metric, e.g. ``{"accuracy": 0.9}``.
    """
    data = encoded_dataset[split]
    n_samples = min(n_samples, len(data))
    subset = data.select(range(n_samples))

    # Send inputs to wherever the model's first parameters live instead of
    # assuming a device literally named "cuda".
    device = next(model.parameters()).device

    # Dropout (including LoRA dropout) must be disabled while scoring. Remember
    # the previous mode so a model that is mid-training is handed back unchanged.
    was_training = model.training
    model.eval()

    predictions, references = [], []
    try:
        with torch.no_grad():
            for start in range(0, n_samples, batch_size):
                chunk = subset[start:start + batch_size]
                logits = model(
                    input_ids=torch.tensor(chunk["input_ids"]).to(device),
                    attention_mask=torch.tensor(chunk["attention_mask"]).to(device),
                ).logits
                predictions.extend(torch.argmax(logits, dim=-1).tolist())
                references.extend(chunk["label"])
    finally:
        model.train(was_training)

    return accuracy.compute(predictions=predictions, references=references)


print("Base model performance (before fine-tuning):")
print(eval_model(model, split="test", n_samples=10))


Base model performance (before fine-tuning):
{'accuracy': 0.0}


Step 8: Configure QLoRA adapters

In [None]:
# LoRA adapter settings for a sequence-classification task.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
)
lora_config = LoraConfig(**lora_settings)

# Wrap the quantized base model with the adapters. `model` is rebound to the
# wrapped model, so re-running this cell would wrap an already wrapped model.
# NOTE(review): `prepare_model_for_kbit_training` is not called before wrapping;
# confirm against the PEFT quantization guide whether that is intentional.
model = get_peft_model(model, lora_config)


Step 9: Training arguments

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs go; no external experiment tracker.
    output_dir="./qlora-banking77-llama3",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # Schedule: 3 epochs, 16 rows per device step, gradients accumulated
    # over 2 steps.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    # Optimiser settings.
    learning_rate=2e-4,
    warmup_steps=100,
    weight_decay=0.01,
    # Precision: bf16 on (stable on A100), fp16 off to avoid loss-scaling issues.
    bf16=True,
    fp16=False,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and restore the checkpoint with the highest accuracy when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)


Step 10: Define Trainer

In [None]:
def compute_metrics(eval_pred):
    """Turn evaluation logits and labels into an accuracy score.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The dict produced by the ``accuracy`` metric.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = scores.argmax(-1)
    return accuracy.compute(predictions=predicted, references=gold_labels)

# Hold out a stratified slice of the training split for the per-epoch
# evaluation. `load_best_model_at_end` picks a checkpoint from these scores, so
# evaluating on the test split here would select the model on test data and
# bias the final test accuracy.
train_val = encoded_dataset["train"].train_test_split(
    test_size=0.1,
    seed=42,
    stratify_by_column="label",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val["train"],
    eval_dataset=train_val["test"],
    # `tokenizer=` is deprecated in `Trainer.__init__`; `processing_class=` is
    # its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


Step 11: Fine-tune with QLoRA

In [None]:
# Run the fine-tuning loop and keep its result object; leaving it as the last
# expression displays it as the cell output.
train_result = trainer.train()
train_result


Epoch,Training Loss,Validation Loss,Accuracy
1,397.9169,161.389816,0.830519
2,146.08,100.668022,0.844805
3,46.2917,42.921227,0.919481


TrainOutput(global_step=939, training_loss=565.2787881186975, metrics={'train_runtime': 1341.4592, 'train_samples_per_second': 22.37, 'train_steps_per_second': 0.7, 'total_flos': 1.6150081767682867e+17, 'train_loss': 565.2787881186975, 'epoch': 3.0})

Step 12: Post-training evaluation

In [None]:
# Score the fine-tuned model on the first 100 test rows.
print("Fine-tuned model performance:")
fine_tuned_metrics = eval_model(model, split="test", n_samples=100)
print(fine_tuned_metrics)


Fine-tuned model performance:
{'accuracy': 0.92}


Step 13: Save final model

In [None]:
# Write the trained model and its tokenizer into the same directory so they
# can be reloaded together.
final_dir = "./qlora-banking77-llama3-final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)


('./qlora-banking77-llama3-final/tokenizer_config.json',
 './qlora-banking77-llama3-final/special_tokens_map.json',
 './qlora-banking77-llama3-final/tokenizer.json')

Step 14: Inference demo

In [None]:
# Reload the fine-tuned model from the directory written in the save step.
ft_model_dir = "./qlora-banking77-llama3-final"

# Use the label count derived from the dataset and the same 4-bit configuration
# as training, so the adapter runs against the weights it was trained on and
# the reload needs far less GPU memory than an unquantized load.
ft_model = AutoModelForSequenceClassification.from_pretrained(
    ft_model_dir,
    num_labels=num_labels,
    quantization_config=bnb_config,
    device_map={"": "cuda"},
)
ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_dir)

# Mirror the training setup: the tokenizer needs a pad token, and the model
# config must carry its id.
if ft_tokenizer.pad_token is None:
    ft_tokenizer.pad_token = ft_tokenizer.eos_token
ft_model.config.pad_token_id = ft_tokenizer.pad_token_id

# Inference mode is set once here instead of on every prediction.
ft_model.eval()

# Maps a predicted class index back to its label name.
ft_label_feature = dataset["train"].features["label"]


def predict(query):
    """Classify a single query string.

    Args:
        query: The text to classify.

    Returns:
        The name of the predicted label.
    """
    encoded = ft_tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move the inputs to the device the model's parameters are on.
    device = next(ft_model.parameters()).device
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = ft_model(**encoded).logits

    predicted_index = torch.argmax(logits, dim=-1).item()
    return ft_label_feature.int2str(predicted_index)


print("Predictions:")
print("Q: How do I reset my card PIN?  →", predict("How do I reset my card PIN?"))
print("Q: I lost my debit card, what should I do? →", predict("I lost my debit card, what should I do?"))


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predictions:
Q: How do I reset my card PIN?  → pin_blocked
Q: I lost my debit card, what should I do? → lost_or_stolen_card
