<a href="https://colab.research.google.com/github/Shamitha24/DBDavis/blob/main/training_adapters_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Login
from huggingface_hub import login
login()
print("Logged in – Llama-3.2-3B ready!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Logged in – Llama-3.2-3B ready!


In [4]:
# Install
!pip install -q torch transformers datasets evaluate peft accelerate bitsandbytes sentencepiece rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [5]:
# Imports + tiny fix for memory
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import get_peft_model, IA3Config, TaskType
from datasets import load_dataset
import evaluate, gc, os

# Force garbage collection + empty cache between runs
def free_memory():
    torch.cuda.empty_cache()
    gc.collect()

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [6]:
# Model + Dataset (smaller batches + shorter seq)
MODEL = "meta-llama/Llama-3.2-3B"

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

raw = load_dataset("databricks/databricks-dolly-15k")["train"]

def format_example(ex):
    prompt = f"### Instruction:\n{ex['instruction']}\n\n"
    if ex['context'].strip():
        prompt += f"### Context:\n{ex['context']}\n\n"
    prompt += "### Response:\n"
    return {"prompt": prompt, "full": prompt + ex['response']}

raw = raw.map(format_example, remove_columns=raw.column_names)

def tokenize(ex):
    tok = tokenizer(ex["full"], truncation=True, max_length=384)  # ← reduced from 512
    prompt_len = len(tokenizer(ex["prompt"])["input_ids"])
    labels = [-100] * prompt_len + tok["input_ids"][prompt_len:]
    labels = (labels + [-100]*384)[:384]
    tok["labels"] = labels
    return tok

data = raw.map(tokenize, remove_columns=["prompt", "full"])
splits = data.train_test_split(test_size=0.08, seed=42)  # ← smaller val set
train_ds = splits["train"]
eval_ds = splits["test"].select(range(128))              # ← only 128 eval examples

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

In [7]:
# Metrics (unchanged)
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    pred_ids = preds.argmax(-1)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = [tokenizer.decode([t for t in l if t != -100], skip_special_tokens=True) for l in labels]
    return {
        "rougeL": rouge.compute(predictions=decoded_preds, references=decoded_labels)["rougeL"],
        "bleu": bleu.compute(predictions=decoded_preds, references=[[x] for x in decoded_labels])["bleu"]
    }

In [9]:
# TRAINING LOOP (FINAL FIXED VERSION – NO ERRORS)
configs = [
    {"name": "m16",  "target": ["k_proj", "v_proj"],                     "ff": [],                        "m": 16},
    {"name": "m64",  "target": ["k_proj", "v_proj", "down_proj"],       "ff": ["down_proj"],            "m": 64},
    {"name": "m128", "target": ["k_proj", "v_proj", "down_proj", "gate_proj"], "ff": ["down_proj", "gate_proj"], "m": 128},
]

results = []

for cfg in configs:
    print(f"\n{'='*90}")
    print(f"IA³ ADAPTERS — Approx m = {cfg['m']}  ({cfg['name']})")
    print(f"{'='*90}")
    free_memory()

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True
    )

    ia3_config = IA3Config(
        task_type=TaskType.CAUSAL_LM,
        target_modules=cfg["target"],
        feedforward_modules=cfg["ff"],
    )

    model = get_peft_model(model, ia3_config)
    model.print_trainable_parameters()

    args = TrainingArguments(
        output_dir=f"./ia3_{cfg['name']}",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        learning_rate=5e-4,
        num_train_epochs=1,
        logging_steps=50,
        eval_strategy="epoch",           # ← FIXED: was evaluation_strategy
        save_strategy="no",
        fp16=True,
        report_to="none",
        seed=42,
        dataloader_num_workers=0,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        compute_metrics=compute_metrics,
    )

    trainer.train()
    metrics = trainer.evaluate()

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6
    results.append({
        "m": cfg["m"],
        "params_M": round(trainable, 2),
        "rougeL": round(metrics["eval_rougeL"], 4),
        "bleu": round(metrics["eval_bleu"], 4),
    })

    print(f"Done → ROUGE-L: {metrics['eval_rougeL']:.4f} | BLEU: {metrics['eval_bleu']:.4f}")
    del model, trainer
    free_memory()


IA³ ADAPTERS — Approx m = 16  (m16)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 57,344 || all params: 3,212,807,168 || trainable%: 0.0018


Epoch,Training Loss,Validation Loss,Rougel,Bleu
1,1.916,2.007926,0.483276,0.214283


Done → ROUGE-L: 0.4833 | BLEU: 0.2143

IA³ ADAPTERS — Approx m = 64  (m64)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 286,720 || all params: 3,213,036,544 || trainable%: 0.0089


Epoch,Training Loss,Validation Loss,Rougel,Bleu
1,1.8442,1.941702,0.506057,0.219803


Done → ROUGE-L: 0.5061 | BLEU: 0.2198

IA³ ADAPTERS — Approx m = 128  (m128)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 372,736 || all params: 3,213,122,560 || trainable%: 0.0116


Epoch,Training Loss,Validation Loss,Rougel,Bleu
1,1.834,1.932663,0.513094,0.224841


Done → ROUGE-L: 0.5131 | BLEU: 0.2248


In [15]:
# Results
print("\n" + "="*90)
print("FINAL TABLE")
print("="*90)
print("| Method       | m   | Trainable Params | ROUGE-L | BLEU  |")
print("|--------------|-----|------------------|------------------|---------|-------|")

for r in results:
    # This line works even if the key is called 'params_M', 'params', 'param_count' etc.
    params_key = next(k for k in r.keys() if 'param' in k.lower() and 'm' in k.lower())
    params_m = r[params_key]
    print(f"| IA³ Adapters | {r['m']:>3} |        {params_m:>5.2f}M | {r['rougeL']:.4f}   | {r['bleu']:.4f} |")

print("="*90)


FINAL TABLE
| Method       | m   | Trainable Params | ROUGE-L | BLEU  |
|--------------|-----|------------------|------------------|---------|-------|
| IA³ Adapters |  16 |         0.06M | 0.4833   | 0.2143 |
| IA³ Adapters |  64 |         0.29M | 0.5061   | 0.2198 |
| IA³ Adapters | 128 |         0.37M | 0.5131   | 0.2248 |
