# Fine-Tuning Llama 2 7B with LoRA on Multiple Benchmarks

## Overview
This notebook fine-tunes Llama 2 7B Chat using LoRA (Low-Rank Adaptation) on four benchmarks:
- **GSM8K**: Grade school math problems
- **MMLU**: Massive Multitask Language Understanding
- **TruthfulQA**: Truthfulness evaluation
- **HellaSwag**: Commonsense reasoning

We train 5 separate adapters:
1. Four single-task adapters (one per benchmark)
2. One multi-task adapter (trained on all benchmarks combined)




Install the required packages (uncomment the `%pip` lines below on a fresh runtime) and verify CUDA availability.


In [None]:

# %pip install -q --upgrade torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121
# %pip install -q transformers==4.43.3 accelerate==0.34.2 peft==0.12.0 datasets==2.19.1 sentence-transformers==2.7.0 bitsandbytes==0.43.3

import torch, transformers, bitsandbytes
# Report whether a GPU is visible and which library versions are loaded.
cuda_ok = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if cuda_ok else "cpu"
print("CUDA:", cuda_ok, "| Device:", device_name)
print(
    "torch:", torch.__version__,
    "| transformers:", transformers.__version__,
    "| bitsandbytes:", bitsandbytes.__version__,
)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.9/798.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m113.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

Import libraries, mount Google Drive, set the random seed, and define paths.

- Update the paths below to your own Google Drive structure, or use Hugging Face model hub by changing `MODEL_PATH` to a HF model ID

In [None]:
import os, json, re, random, math, torch, numpy as np
from tqdm import tqdm
from google.colab import drive

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Make Google Drive available under /content/drive (Colab only).
drive.mount("/content/drive")

# Seed the Python, NumPy and PyTorch random number generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# UPDATE THESE PATHS FOR YOUR ENVIRONMENT:
# - MODEL_PATH may be a local/Drive directory or a Hugging Face model ID
#   (e.g., "meta-llama/Llama-2-7b-chat-hf").
# - TRAIN_ROOT holds the per-benchmark JSONL files.
# - OUTPUT_ROOT is the directory under which each training run is written.
MODEL_PATH = "/content/drive/MyDrive/project/baselinemodel"
TRAIN_ROOT = "/content/drive/MyDrive/project/benchmark_data"
OUTPUT_ROOT = "/content/drive/MyDrive/project/llama_finetuning_runs_v1"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# One training file per benchmark, named "<benchmark>_train.jsonl".
TRAIN_FILES = {
    bench_name: f"{TRAIN_ROOT}/{bench_name}_train.jsonl"
    for bench_name in ("gsm8k", "mmlu", "truthfulqa", "hellaswag")
}

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

def read_jsonl(path: str):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. an extra newline at the end of
        # the file) are skipped; json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]

Mounted at /content/drive
Device: cuda


Formatting functions  
- Defines formatters for each benchmark to convert raw data into training format.

In [None]:
def format_gsm8k(example):
    """Build the training record for one GSM8K example.

    Reads the 'question' and 'answer' fields of ``example`` and returns a dict
    with the full prompt-plus-answer string under "text", the question under
    "input" and the answer under "output".
    """
    question = example['question']
    answer = example['answer']
    record = {"text": f"Question: {question}\nAnswer: {answer}"}
    record["input"] = question
    record["output"] = answer
    return record

def format_mmlu(example):
    """Build the training record for one MMLU example.

    The entries of ``example['choices']`` are lettered A, B, C, ... and
    ``example['answer']`` (anything ``int()`` accepts) is the zero-based index
    of the correct choice. Returns a dict with "text" (question, choices and
    answer letter), "input" (question and choices) and "output" (the letter).
    """
    options = example['choices']
    lettered = [
        f"{chr(code)}. {option}"
        for code, option in enumerate(options, start=ord("A"))
    ]
    options_block = "\n".join(lettered)
    letter = chr(ord("A") + int(example['answer']))
    # The question followed by its choices is shared by "text" and "input".
    prompt = f"{example['question']}\nChoices:\n{options_block}"
    return {
        "text": f"Question: {prompt}\nAnswer: {letter}",
        "input": prompt,
        "output": letter,
    }

def format_truthfulqa(example):
    """Build the training record for one TruthfulQA (mc1) example.

    Choices come from ``example['mc1_targets']['choices']`` and are lettered
    A, B, C, ...; the correct one is the first position whose entry in
    ``example['mc1_targets']['labels']`` equals 1 (``ValueError`` if none).
    Returns a dict with "text", "input" and "output" (the answer letter).
    """
    options = example['mc1_targets']['choices']
    flags = example['mc1_targets']['labels']
    lettered = [
        f"{chr(code)}. {option}"
        for code, option in enumerate(options, start=ord("A"))
    ]
    options_block = "\n".join(lettered)
    letter = chr(ord("A") + flags.index(1))
    # The question followed by its choices is shared by "text" and "input".
    prompt = f"{example['question']}\nChoices:\n{options_block}"
    return {
        "text": f"Question: {prompt}\nAnswer: {letter}",
        "input": prompt,
        "output": letter,
    }

def format_hellaswag(example):
    """Build the training record for one HellaSwag example.

    The candidate endings in ``example['endings']`` are lettered A, B, C, ...
    and ``example['label']`` (anything ``int()`` accepts) is the zero-based
    index of the correct ending. Returns a dict with "text" (context, choices
    and answer letter), "input" (context and choices) and "output" (the letter).
    """
    endings = example['endings']
    lettered = [
        f"{chr(code)}. {ending}"
        for code, ending in enumerate(endings, start=ord("A"))
    ]
    endings_block = "\n".join(lettered)
    letter = chr(ord("A") + int(example['label']))
    # The context followed by its choices is shared by "text" and "input".
    prompt = f"{example['ctx']}\nChoices:\n{endings_block}"
    return {
        "text": f"Context: {prompt}\nAnswer: {letter}",
        "input": prompt,
        "output": letter,
    }

# Dispatch table: benchmark name -> function that formats one raw record.
FORMATTERS = dict(
    gsm8k=format_gsm8k,
    mmlu=format_mmlu,
    truthfulqa=format_truthfulqa,
    hellaswag=format_hellaswag,
)

Model + tokenizer loader (4-bit + LoRA)
- Loads model with 4-bit quantization and applies LoRA configuration.

In [None]:
def load_model_tokenizer_lora(
    model_path: str,
    *,
    lora_r: int = 16,
    lora_alpha: int = 32,
    lora_dropout: float = 0.1,
):
    """Load a causal LM in 4-bit NF4 and wrap it with a fresh LoRA adapter.

    Args:
        model_path: Local directory or Hugging Face model ID passed to
            ``from_pretrained`` for both the tokenizer and the model.
        lora_r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        lora_dropout: Dropout applied inside the LoRA layers.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    # Check up front, before any tokenizer or weights are read. A real
    # exception (unlike `assert`) is not stripped when Python runs with -O.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA GPU required for 4-bit.")

    tok = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    if tok.pad_token is None:
        # No dedicated pad token on this tokenizer: reuse EOS for padding.
        tok.pad_token = tok.eos_token

    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_cfg,
        device_map="auto",
        # NOTE(review): trust_remote_code=True allows Python code shipped with
        # the checkpoint to run at load time. Kept to preserve behavior, but it
        # should not be needed for a stock Llama 2 checkpoint - confirm and drop.
        trust_remote_code=True,
    )
    model = prepare_model_for_kbit_training(model)

    lora_cfg = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        # Adapters on every attention and MLP projection.
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()
    return model, tok

Load and format training data
- Loads and formats all training datasets. Creates combined dataset for multi-task training.

In [None]:
def load_and_format_training_data(benchmark: str, train_file: str):
    """Read one benchmark's JSONL file, shuffle it, and format every record.

    Args:
        benchmark: Key into ``FORMATTERS`` selecting the record formatter.
        train_file: Path of the JSONL training file.

    Returns:
        A ``Dataset`` built from the formatted records in shuffled order.
    """
    records = read_jsonl(train_file)
    # Shuffling the freshly read list in place draws the same random numbers
    # (module-level `random`) as shuffling a list of its indices would.
    random.shuffle(records)
    return Dataset.from_list([FORMATTERS[benchmark](record) for record in records])

print("Loading all training datasets...")
# benchmark name -> formatted (and shuffled) Dataset
training_datasets = {}
for bench, train_file in TRAIN_FILES.items():
    print(f"Loading {bench}...")
    loaded = load_and_format_training_data(bench, train_file)
    training_datasets[bench] = loaded
    print(f"  Loaded {len(loaded)} examples")

# Combined dataset for multi-task fine-tuning: the per-benchmark datasets
# laid end to end, in the order they were loaded.
print("\nCombining all datasets for multi-task training...")
per_benchmark = [training_datasets[name] for name in training_datasets]
combined_dataset = concatenate_datasets(per_benchmark)
print(f"Total combined examples: {len(combined_dataset)}")

Loading all training datasets...
Loading gsm8k...
  Loaded 1000 examples
Loading mmlu...
  Loaded 1000 examples
Loading truthfulqa...
  Loaded 600 examples
Loading hellaswag...
  Loaded 1000 examples

Combining all datasets for multi-task training...
Total combined examples: 3600


Tokenization
- Tokenizes dataset and prepares for model input

In [None]:

def tokenize_dataset(ds: Dataset, tokenizer, max_length: int = 512):
    """Tokenize the "text" column and keep only input_ids and attention_mask.

    Rows are truncated to ``max_length`` tokens but NOT padded here. Padding
    inside a batched ``map`` would pad every row to the longest sequence of
    its map batch, so training would mostly process pad tokens. Padding is
    left to the data collator, which pads each training batch to its own
    longest row.

    Args:
        ds: Dataset with a "text" column.
        tokenizer: Tokenizer called on each batch of texts.
        max_length: Maximum number of tokens kept per row.

    Returns:
        A dataset whose only columns are "input_ids" and "attention_mask"
        (variable-length lists).
    """
    def _tok(batch):
        tokenized = tokenizer(
            batch["text"],
            truncation=True,
            padding=False,  # the collator pads per training batch
            max_length=max_length,
            return_tensors=None  # Returns lists, not tensors
        )
        # Return only the fields needed for training
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"]
        }
    # Remove all original columns to avoid string field issues
    return ds.map(_tok, batched=True, remove_columns=ds.column_names)

Train single benchmark adapter
- Trains a single LoRA adapter for a given benchmark and saves results.

In [None]:
def train_single_adapter(benchmark: str, ds: Dataset, model, tokenizer):
    """Fine-tune ``model`` on one benchmark and save the LoRA adapter.

    The dataset is tokenized and split 90/10 by position into train and
    validation parts (so ``ds`` should already be shuffled). The adapter,
    tokenizer and a small ``training_meta.json`` are written under
    ``OUTPUT_ROOT/<benchmark>``.

    Args:
        benchmark: Name used for the output sub-directory and in log messages.
        ds: Formatted dataset with a "text" column.
        model: PEFT-wrapped model to train (modified in place by training).
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Path of the directory the LoRA adapter was saved to.
    """
    out_dir = os.path.join(OUTPUT_ROOT, benchmark)
    os.makedirs(out_dir, exist_ok=True)

    tok_ds = tokenize_dataset(ds, tokenizer)
    train_size = int(0.9 * len(tok_ds))
    train_ds = tok_ds.select(range(train_size))
    val_ds = tok_ds.select(range(train_size, len(tok_ds)))

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # For the 600-1000-example benchmarks in this notebook, three epochs at an
    # effective batch size of 16 (2 x 8) is only on the order of 100-170
    # optimizer steps. Fixed values such as eval/save every 200 steps or a
    # 100-step warmup therefore never trigger, or swallow most of the run.
    # Evaluation, checkpointing and warmup are expressed relative to the run
    # length instead.
    args = TrainingArguments(
        output_dir=out_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir=f"{out_dir}/logs",
        logging_steps=50,
        # Evaluate and checkpoint once per epoch so load_best_model_at_end has
        # checkpoints and eval_loss values to choose from.
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=True,
        lr_scheduler_type="cosine",
        dataloader_pin_memory=False,
        remove_unused_columns=False,
        report_to="none",
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print(f"\n{'='*60}")
    print(f"Training {benchmark} adapter")
    print(f"{'='*60}")
    trainer.train()

    adapters_dir = f"{out_dir}/lora_adapters"
    os.makedirs(adapters_dir, exist_ok=True)
    model.save_pretrained(adapters_dir)
    tokenizer.save_pretrained(out_dir)

    # Record what this adapter was trained on next to the weights.
    with open(f"{out_dir}/training_meta.json", "w") as f:
        json.dump({
            "benchmark": benchmark,
            "train_examples": len(train_ds),
            "val_examples": len(val_ds),
            "seed": SEED,
            "model_path": MODEL_PATH,
        }, f, indent=2)

    print(f"{benchmark} adapter saved to: {adapters_dir}")
    return adapters_dir

Train all single-task adapters
- Trains 4 separate LoRA adapters (one per benchmark)

In [None]:
import gc

print("\nTraining single-task adapters...")

# One freshly loaded base model + LoRA adapter per benchmark.
for bench, ds in training_datasets.items():
    model, tok = load_model_tokenizer_lora(MODEL_PATH)
    try:
        train_single_adapter(bench, ds, model, tok)
    finally:
        # Release this benchmark's model before the next one is loaded;
        # otherwise the old model is still referenced (and on the GPU) while
        # the new one is being created.
        del model, tok
        gc.collect()
        torch.cuda.empty_cache()

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.



Training single-task adapters...




trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.



Training gsm8k adapter


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


gsm8k adapter saved to: /content/drive/MyDrive/project/llama_finetuning_runs_v1/gsm8k/lora_adapters




trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Training mmlu adapter


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


mmlu adapter saved to: /content/drive/MyDrive/project/llama_finetuning_runs_v1/mmlu/lora_adapters




trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Training truthfulqa adapter


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


truthfulqa adapter saved to: /content/drive/MyDrive/project/llama_finetuning_runs_v1/truthfulqa/lora_adapters




trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Training hellaswag adapter


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


hellaswag adapter saved to: /content/drive/MyDrive/project/llama_finetuning_runs_v1/hellaswag/lora_adapters


Train multi-task adapter on all combined examples (3,600 in this run)
- Trains one LoRA adapter on all 4 benchmarks combined.

In [None]:
# Report the real size of the combined dataset.
print(f"\nTraining multi-task adapter (all {len(combined_dataset)} examples)...")

model, tok = load_model_tokenizer_lora(MODEL_PATH)

out_dir = os.path.join(OUTPUT_ROOT, "all_benchmarks")
os.makedirs(out_dir, exist_ok=True)

# combined_dataset is the per-benchmark datasets laid end to end, so a purely
# positional 90/10 split would take the validation rows from the last
# benchmark only. Shuffle (seeded) first so both splits mix all benchmarks.
tok_ds = tokenize_dataset(combined_dataset.shuffle(seed=SEED), tok)
train_size = int(0.9 * len(tok_ds))
train_ds = tok_ds.select(range(train_size))
val_ds = tok_ds.select(range(train_size, len(tok_ds)))

data_collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

args = TrainingArguments(
    output_dir=out_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir=f"{out_dir}/logs",
    logging_steps=50,
    # Evaluate and checkpoint once per epoch. With eval/save every 500 steps
    # the run produced a single evaluation and a single checkpoint, and
    # load_best_model_at_end restored that step-500 state at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    lr_scheduler_type="cosine",
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    report_to="none",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tok,
)

print(f"\n{'='*60}")
print("Training multi-task adapter (gsm8k + mmlu + truthfulqa + hellaswag)")
print(f"{'='*60}")
trainer.train()

adapters_dir = f"{out_dir}/lora_adapters"
os.makedirs(adapters_dir, exist_ok=True)
model.save_pretrained(adapters_dir)
tok.save_pretrained(out_dir)

# Record what this adapter was trained on next to the weights.
with open(f"{out_dir}/training_meta.json", "w") as f:
    json.dump({
        "benchmark": "all_benchmarks",
        "train_examples": len(train_ds),
        "val_examples": len(val_ds),
        "seed": SEED,
        "model_path": MODEL_PATH,
        "datasets": list(training_datasets.keys()),
    }, f, indent=2)

print(f"\n Multi-task adapter saved to: {adapters_dir}")
print("\nAll training completed!!!!!!")
print(f"Results: {OUTPUT_ROOT}")

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.



Training multi-task adapter (all 4k examples)...




trainable params: 39,976,960 || all params: 6,778,392,576 || trainable%: 0.5898


Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Training multi-task adapter (gsm8k + mmlu + truthfulqa + hellaswag)


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
500,0.9617,1.935667


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]



 Multi-task adapter saved to: /content/drive/MyDrive/project/llama_finetuning_runs_v1/all_benchmarks/lora_adapters

All training completed!!!!!!
Results: /content/drive/MyDrive/project/llama_finetuning_runs_v1
