## Import model, data, tokenizer

In [None]:
# TODO: transoformer may need to be downgraded to 4.38.0, noted this issue

import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")  # Metal GPU on macOS
else:
    device = torch.device("cpu")  # fallback

from huggingface_hub import login

login(token="YOUR_ACCESS_TOKEN")

In [12]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig         # ≥ transformers-4.39, bitsandbytes-0.43
)

ckpt = "distilgpt2"

tok = AutoTokenizer.from_pretrained(ckpt)

model = AutoModelForCausalLM.from_pretrained(
    ckpt,
    device_map="auto",
    torch_dtype="auto",                
)

convert the dataset to text form that can be processed by tokenizer

In [13]:
from datasets import load_dataset

raw_ds = load_dataset("squad_v2")

def to_prompt(example):
    has_answer = len(example["answers"]["text"]) > 0
    answer = example["answers"]["text"][0] if has_answer else "unanswerable"
    prompt = (
        "<s>[INST] You are a QA assistant.\n"
        f"Context: {example['context']}\n"
        f"Question: {example['question']} [/INST] "
        f"{answer} </s>"
    )
    return {"text": prompt}

ds = raw_ds.map(to_prompt, remove_columns=raw_ds["train"].column_names)

## Define finetuning technique

In [14]:
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)

model = prepare_model_for_kbit_training(model)   # freeze & cast norms  [oai_citation:1‡Hugging Face](https://huggingface.co/docs/peft/developer_guides/quantization?utm_source=chatgpt.com)
lora_cfg = LoraConfig(
    r=8,  # lower rank number: 8, 16, 32, ....
    lora_alpha=32,  # output = W(x) + (lora_alpha / r) * B(A(x))  how much the influence does the adapter has
    target_modules=["c_attn", "c_proj"],   # modlues of GPT that are being applied lora for updating
    lora_dropout=0.05,  # Prevents overfitting on small data during fine-tuning.
    bias="none",  # No bias term in lora
    task_type="CAUSAL_LM",  # For GPT style model, others like: "SEQ_2_SEQ_LM" for bert, T5
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()        # sanity-check: only ~0.2 % params train

trainable params: 405,504 || all params: 82,318,080 || trainable%: 0.4926




## Tokenizing the data

In [15]:
from transformers import DataCollatorForLanguageModeling

tok.pad_token = tok.eos_token

def tokenize(example):
    # Tokenize prompt (which includes both question and answer)
    tokens = tok(example["text"], truncation=True, padding="max_length", max_length=128)
    tokens["labels"] = tokens["input_ids"].copy()  # Required for causal LM loss
    return tokens

# .map() is to replace for loop
tokenised = ds.map(tokenize, batched=True, remove_columns=["text"])  # First apply tokenize() transformation to a batch of sample; then remove the original `text`

data_collator = DataCollatorForLanguageModeling(tok, mlm=False)  # Optional but good to keep, if you want dynamic padding 

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Before tokenization transformation

In [16]:
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 11873
    })
})

After tokenization tranformation

In [17]:
tokenised

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 11873
    })
})

## Defining Metrics Funciton

In [18]:
from transformers import EvalPrediction
from typing import Dict
import numpy as np

def compute_metrics(eval_pred: EvalPrediction) -> Dict[str, float]:
    preds, labels = eval_pred.predictions, eval_pred.label_ids

    # Decode tokens → strings
    pred_texts = tok.batch_decode(preds, skip_special_tokens=True)
    label_texts = tok.batch_decode(labels, skip_special_tokens=True)

    # Example: exact string match (can be replaced with ROUGE/BLEU/etc.)
    matches = [
        int(pred.strip().lower() == label.strip().lower())
        for pred, label in zip(pred_texts, label_texts)
    ]
    accuracy = np.mean(matches)

    return {"exact_match_accuracy": accuracy}

## Defining training arguments and train the model

In [19]:
from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    output_dir="distill-gpt2",
    num_train_epochs=1,
    per_device_train_batch_size=2,     # keep small on M-series
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=False,                        # ✅ must be False
    bf16=False,                        # ✅ must be False
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=25,
    report_to="tensorboard",
    push_to_hub=True,
    hub_private_repo=True,
    **{"label_names": ["labels"]}
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenised["train"].select(range(2000)),
    eval_dataset=tokenised["validation"].select(range(200)),
    compute_metrics = compute_metrics,
    data_collator=data_collator,
)
trainer.train()
trainer.save_model()                    # adapter weights + config
tok.save_pretrained(args.output_dir)

Epoch,Training Loss,Validation Loss


TypeError: argument 'ids': 'list' object cannot be interpreted as an integer