In [None]:
# Setup & Drive Mount

from google.colab import drive
drive.mount("/content/drive")

!pip install -q bitsandbytes peft datasets
!pip install -q -U transformers accelerate
!pip install -q tqdm matplotlib

import torch, random, os
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    BitsAndBytesConfig, get_linear_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, TaskType

seed = 42
random.seed(seed); torch.manual_seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", device)

# Paths
BASE_SFT_CKPT = "/content/drive/MyDrive/llm_project/sft_qlora_pytorch_gpt2_medium"
OUT_DIR = "/content/drive/MyDrive/llm_project/inst_qlora_pytorch_gpt2_medium_dolly"
os.makedirs(OUT_DIR, exist_ok=True)

# Training hyperparams
max_length = 512
batch_size = 4
grad_accum = 1
num_epochs = 1
lr = 2e-4
warmup_ratio = 0.05
clip_grad = 1.0


Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hRunning on: cuda


In [None]:
# Load Dolly dataset and preprocess

raw = load_dataset("databricks/databricks-dolly-15k")
split = raw["train"].shuffle(seed=42).train_test_split(test_size=0.1) # shuffle and split

train_full = split["train"]
val_full = split["test"]

train_raw = train_full.select(range(12000))
val_raw   = val_full.select(range(400))

def format_example(ex):
    inst = ex.get("instruction","").strip()
    ctx  = ex.get("context","")
    rsp  = ex.get("response","").strip()

    if ctx and isinstance(ctx,str) and ctx.strip():
        prompt = f"### Instruction:\n{inst}\n\n### Context:\n{ctx.strip()}\n\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{inst}\n\n### Response:\n"

    return prompt, rsp


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

In [None]:
# Tokenize with loss masking

tok_src = BASE_SFT_CKPT if os.path.exists(BASE_SFT_CKPT) else "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(tok_src)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def preprocess(batch):
    out = {"input_ids": [], "attention_mask": [], "labels": []}
    for i in range(len(batch["instruction"])):
        ex = {
            "instruction": batch["instruction"][i],
            "context": batch.get("context", [""])[i],
            "response": batch["response"][i],
        }

        prompt, rsp = format_example(ex)
        full_text = prompt + rsp + tokenizer.eos_token
        full = tokenizer(full_text, max_length=max_length, truncation=True,
                         padding="max_length")
        prompt_tok = tokenizer(prompt, max_length=max_length, truncation=True)
        prompt_len = min(len(prompt_tok["input_ids"]), max_length)
        labels = full["input_ids"].copy()
        labels[:prompt_len] = [-100] * prompt_len # CrossEntropyLoss ignores any token with label = -100 when computing loss

        out["input_ids"].append(full["input_ids"])
        out["attention_mask"].append(full["attention_mask"])
        out["labels"].append(labels)

    return out

train_proc = train_raw.map(preprocess, batched=True, batch_size=64, remove_columns=train_raw.column_names)
val_proc   = val_raw.map(preprocess,   batched=True, batch_size=64, remove_columns=val_raw.column_names)


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [None]:
# Dataloaders

def collate_fn(batch):
    input_ids = torch.tensor([b["input_ids"] for b in batch])
    attn = torch.tensor([b["attention_mask"] for b in batch])
    labels = torch.tensor([b["labels"] for b in batch])
    return {"input_ids": input_ids, "attention_mask": attn, "labels": labels}

train_loader = DataLoader(train_proc, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_proc,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [None]:
# 4️⃣  Load base SFT model (QLoRA)

bnb = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    BASE_SFT_CKPT, quantization_config=bnb, device_map="auto"
)

lora = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["c_attn","c_proj"],
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora)
model.print_trainable_parameters()


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 4,325,376 || all params: 359,148,544 || trainable%: 1.2043




In [None]:
# Optimizer & Scheduler

train_steps_per_epoch = len(train_loader)//grad_accum + 1
total_steps = train_steps_per_epoch * num_epochs
warmup_steps = max(1, int(warmup_ratio * total_steps))

optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
print(f"Total steps={total_steps}, warmup={warmup_steps}")


Total steps=3001, warmup=150


In [None]:
# Training loop (AMP + grad accumulation)

# scaler keeps gradients numerically stable in mixed precision.
# Scales up loss before backprop to prevent gradient underflow. (used only when calling .backward())
# Used in training only
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
model.train()
global_step, losses = 0, []

for epoch in range(num_epochs):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    optimizer.zero_grad()

    for step, batch in enumerate(loop):
        ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        lbl = batch["labels"].to(device)

        # Runs the model in mixed precision
        # Runs operations in mixed precision (bfloat16/float16) to save memory and speed up compute
        # Used in training / evaluation
        with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=torch.cuda.is_available()):
            out = model(input_ids=ids, attention_mask=attn, labels=lbl)
            loss = out.loss / grad_accum

        scaler.scale(loss).backward() # scale the loss before back propagation

        # simulate a larger batch size by
        # accumulating gradients across several mini-steps before updating weights
        if (step+1) % grad_accum == 0:
            scaler.unscale_(optimizer) # unscale gradients back to original scale
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) # Prevents exploding gradients

            # replaces the usual optimizer.step() in mixed-precision training.
            scaler.step(optimizer) # only update if grads finite
            scaler.update() # adjust scaling factor
            optimizer.zero_grad() # reset grads
            scheduler.step() # adjust learning rate

            global_step += 1
            losses.append(loss.item() * grad_accum) # converts per-microbatch loss back to per-step loss.
            loop.set_postfix(loss=f"{loss.item() * grad_accum:.4f}", step=global_step)


  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


Epoch 1/1:   0%|          | 0/3000 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=torch.cuda.is_available()):
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


In [None]:
# Validation (loss + perplexity)

import math

model.eval()
val_loss = 0

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validating"):
        ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        lbl = batch["labels"].to(device)

        with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=torch.cuda.is_available()):
            out = model(input_ids=ids, attention_mask=attn, labels=lbl)

        val_loss += out.loss.item()

val_loss /= len(val_loader)
ppl = math.exp(val_loss)
print(f"Validation loss={val_loss:.4f}, perplexity={ppl:.2f}")

model.train()


Validating:   0%|          | 0/100 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=torch.cuda.is_available()):


Validation loss=0.4303, perplexity=1.54


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 1024)
        (wpe): Embedding(1024, 1024)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPT2Block(
            (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=3072, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (

In [None]:
# Save model + tokenizer to Drive

model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)
print(f"Model saved to {OUT_DIR}")


Model saved to /content/drive/MyDrive/llm_project/inst_qlora_pytorch_gpt2_medium_dolly


In [None]:
# Export core dependencies

import importlib

core_packages = [
    "torch",
    "transformers",
    "datasets",
    "peft",
    "bitsandbytes",
    "accelerate",
    "tqdm",
    "matplotlib"
]

# Collect installed versions
with open("requirements.txt", "w") as f:
    for pkg in core_packages:
        try:
            version = importlib.metadata.version(pkg)
            f.write(f"{pkg}=={version}\n")
        except importlib.metadata.PackageNotFoundError:
            f.write(f"# {pkg} not installed\n")

print("requirements.txt created with core dependencies:")
!cat requirements.txt


requirements.txt created with core dependencies:
torch==2.8.0+cu126
transformers==4.57.1
datasets==4.0.0
peft==0.17.1
bitsandbytes==0.48.1
accelerate==1.11.0
tqdm==4.67.1
matplotlib==3.10.0


In [None]:
# Interactive generation loop

def chat():
    print("\nEnter a prompt below (type 'quit' to exit)\n")
    while True:
        prompt = input("Prompt: ")
        if prompt.strip().lower() in {"quit","exit"}:
            print("Exiting chat.")
            break
        formatted = f"### Instruction:\n{prompt.strip()}\n\n### Response:\n"
        tokens = tokenizer(formatted, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model.generate(
                **tokens,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        print("\n", tokenizer.decode(out[0], skip_special_tokens=True))
        print("-"*80)

chat()



Enter a prompt below (type 'quit' to exit)

Prompt: explain why the earth orbits the sun

 ### Instruction:
explain why the earth orbits the sun

### Response:
The earth orbits the sun because the sun is moving around the Earth. The earth is moving around the sun because of the rotation of the earth.

The earth is rotating around the sun because of the sun's rotation. This rotation creates an "accelerator" that is rotating the earth.

This accelerator is rotating the earth, which causes the earth to spin around the sun. This spinning creates an "accelerator" that is spinning the earth.
--------------------------------------------------------------------------------


KeyboardInterrupt: Interrupted by user