
# Fine‑Tuning Generic Pipeline (LoRA / QLoRA) — Colab

This notebook:
1) Installs dependencies  
2) Uploads your zipped project to `/content/project`  
3) Reads YAML configs  
4) Loads model (Kaggle Models or Hugging Face)  
5) Validates dataset  
6) Fine‑tunes with **LoRA/QLoRA**  
7) Exports adapters + metadata  

> **Tip:** If you see organization Drive policy errors, use a personal Gmail or run this on **Kaggle Notebooks**.


In [1]:

%%bash
pip -q install kagglehub transformers peft bitsandbytes accelerate datasets pyyaml trl


   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.7/60.7 MB 15.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 540.5/540.5 kB 20.1 MB/s eta 0:00:00


In [3]:
# =========================
# PROJECT FETCHER (Git mode only; no OAuth, no Drive)
# =========================
GIT_URL    = "https://github.com/Srienath2205/finetuningpoc.git"   # <-- your public repo
GIT_BRANCH = "main"      # change if you use another branch name
GIT_SUBDIR = ""          # if your project is nested in a subfolder, set e.g. "PROJECT_ROOT/"

import os, shutil

TARGET = "/content/project"
!rm -rf "$TARGET"
if GIT_BRANCH:
    !git clone --depth 1 --branch "$GIT_BRANCH" "$GIT_URL" "$TARGET"
else:
    !git clone --depth 1 "$GIT_URL" "$TARGET"

# If repo content lives in a subfolder, move it up into /content/project
if GIT_SUBDIR:
    src = os.path.join(TARGET, GIT_SUBDIR)
    assert os.path.isdir(src), f"Subdir '{GIT_SUBDIR}' not found in cloned repo"
    for name in os.listdir(src):
        shutil.move(os.path.join(src, name), os.path.join(TARGET, name))
    shutil.rmtree(src, ignore_errors=True)

# Basic structure checks
must_dirs = ["configs", "data", "scripts"]
missing = [d for d in must_dirs if not os.path.isdir(os.path.join(TARGET, d))]
if missing:
    raise RuntimeError(f"Missing required folders in {TARGET}: {missing}")

print("[OK] Project ready at:", TARGET)
print("Contents:", os.listdir(TARGET))


Cloning into '/content/project'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 22 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (22/22), 12.88 KiB | 6.44 MiB/s, done.
[OK] Project ready at: /content/project
Contents: ['docs', 'README.md', 'notebooks', 'configs', '.git', 'data', 'requirements.txt', 'scripts']


In [4]:

import os, yaml, json, sys

PROJ = "/content/project"
sys.path.append(os.path.join(PROJ, 'scripts'))

cfg_model   = yaml.safe_load(open(os.path.join(PROJ, 'configs/model_config.yaml')))
cfg_train   = yaml.safe_load(open(os.path.join(PROJ, 'configs/training_config.yaml')))
cfg_data    = yaml.safe_load(open(os.path.join(PROJ, 'configs/dataset_config.yaml')))
cfg_usecase = yaml.safe_load(open(os.path.join(PROJ, 'configs/usecase_config.yaml')))

cfg = {
  'model': cfg_model,
  'train': cfg_train,
  'data': cfg_data,
  'usecase': cfg_usecase
}

print(json.dumps(cfg, indent=2))


{
  "model": {
    "model_source": "kaggle",
    "model_name": "google/gemma-3/pyTorch/gemma-3-1b-it",
    "quantization": "4bit",
    "device_map": "auto",
    "torch_dtype": "bfloat16"
  },
  "train": null,
  "data": {
    "train_path": "/content/data/train.jsonl",
    "eval_path": "/content/data/eval.jsonl",
    "schema_path": "/content/data/schema.json",
    "max_train_records": null,
    "max_eval_records": null,
    "add_system_prompt": false,
    "system_prompt": ""
  },
  "usecase": {
    "usecase_name": "generic_usecase",
    "domain": "generic",
    "output_format": "free_text",
    "metrics": {
      "schema_fidelity": false,
      "exact_match": false,
      "rouge": false,
      "bleu": false
    }
  }
}


In [5]:

import os
try:
    from google.colab import userdata
    os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME') or os.environ.get('KAGGLE_USERNAME','')
    os.environ['KAGGLE_KEY']      = userdata.get('KAGGLE_KEY') or os.environ.get('KAGGLE_KEY','')
    print("[OK] Kaggle credentials loaded (if provided)")
except Exception as e:
    print("Colab userdata API not available — if using Kaggle Notebooks, you can ignore this.")


[OK] Kaggle credentials loaded (if provided)


In [7]:
import sys, os
proj_scripts = "/content/project/scripts"
if proj_scripts not in sys.path:
    sys.path.insert(0, proj_scripts)

# quick sanity check
print("[i] sys.path head:", sys.path[:3])
print("[i] scripts exists:", os.path.isdir(proj_scripts))
print("[i] prepare_dataset present:", os.path.isfile(os.path.join(proj_scripts, "prepare_dataset.py")))

[i] sys.path head: ['/content', '/env/python', '/usr/lib/python312.zip']
[i] scripts exists: True
[i] prepare_dataset present: True


In [8]:
from prepare_dataset import validate_or_raise
from pathlib import Path
import json, os

data_dir = "/content/project/data"
train_path = os.path.join(
    data_dir,
    Path(cfg["data"]["train_path"]).name
) if not cfg["data"]["train_path"].startswith("/content") else cfg["data"]["train_path"]

eval_path = os.path.join(
    data_dir,
    Path(cfg["data"]["eval_path"]).name
) if not cfg["data"]["eval_path"].startswith("/content") else cfg["data"]["eval_path"]

print("[i] Train path:", train_path)
print("[i] Eval  path:", eval_path)

validate_or_raise(train_path)
validate_or_raise(eval_path)
print("[OK] Dataset validated")

# Save back into cfg
cfg["data"]["_resolved_train"] = train_path
cfg["data"]["_resolved_eval"]  = eval_path

ImportError: cannot import name 'validate_or_raise' from 'prepare_dataset' (/content/project/scripts/prepare_dataset.py)

In [None]:

from load_model_generic import load_model_and_tokenizer

tok, model, model_path = load_model_and_tokenizer(
    model_source=cfg['model']['model_source'],
    model_name=cfg['model']['model_name'],
    quantization=cfg['model'].get('quantization','4bit'),
    device_map='auto',
    torch_dtype=cfg['model'].get('torch_dtype','bfloat16')
)
print('[OK] Model loaded from:', model_path)


In [None]:
from datasets import load_dataset

# Load the JSONL files resolved earlier in cfg['data']['_resolved_*']
ds = load_dataset(
    "json",
    data_files={
        "train": cfg["data"]["_resolved_train"],
        "eval":  cfg["data"]["_resolved_eval"],
    }
)

def format_example(rec):
    """
    Turn a single record with messages -> a plain supervised example.
    Expected input format per line:
      {
        "messages": [
          {"role": "user", "content": "..."},
          {"role": "assistant", "content": "..."}
        ]
      }
    """
    msgs = rec["messages"]
    user = next((m["content"] for m in msgs if m["role"] == "user"), "")
    assistant = next((m["content"] for m in msgs if m["role"] == "assistant"), "")
    return f"<user>\n{user}\n</user>\n<assistant>\n{assistant}\n</assistant>"

def formatting_func(batch):
    return [format_example(r) for r in batch]

print("[OK] Formatter ready")

In [None]:

from trl import SFTTrainer
from transformers import TrainingArguments
import torch

training_args = TrainingArguments(
    output_dir='/content/output',
    num_train_epochs=cfg['train'].get('epochs', 1),
    per_device_train_batch_size=cfg['train'].get('batch_size', 2),
    per_device_eval_batch_size=cfg['train'].get('batch_size', 2),
    learning_rate=cfg['train'].get('lr', 2e-4),
    logging_steps=20,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    fp16=torch.cuda.is_available(),
    bf16=torch.cuda.is_available(),
    report_to=[]
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tok,
    train_dataset=ds['train'],
    eval_dataset=ds['eval'],
    formatting_func=formatting_func,
    max_seq_length=cfg['train'].get('max_seq_length', 1024),
    args=training_args,
)

trainer.train()
print('[OK] Training complete')


In [None]:

from export_adapters import export_adapters

export_adapters(
    model=model,
    usecase_name=cfg['usecase']['usecase_name'],
    extra_meta={
        'model_source': cfg['model']['model_source'],
        'model_name': cfg['model']['model_name'],
        'method': cfg['train']['method'],
    },
    base_dir='/content/adapters'
)


In [None]:

import json, random

sample = next(iter(ds['eval']))
prompt = sample['messages'][0]['content']

inputs = tok(prompt, return_tensors='pt').to(model.device)
out = model.generate(**inputs, max_new_tokens=200)
print(tok.decode(out[0], skip_special_tokens=True))
