
# Fine‑Tuning Generic Pipeline (LoRA / QLoRA) — Colab

This notebook:
1) Installs dependencies  
2) Clones your project from a Git repository into `/content/project`  
3) Reads YAML configs  
4) Loads model (Kaggle Models or Hugging Face)  
5) Validates dataset  
6) Fine‑tunes with **LoRA/QLoRA**  
7) Exports adapters + metadata  

> **Tip:** If you see organization Drive policy errors, use a personal Gmail or run this on **Kaggle Notebooks**.


In [None]:

%%bash
pip -q install kagglehub transformers peft bitsandbytes accelerate datasets pyyaml trl


In [None]:
# =========================
# PROJECT FETCHER (Git mode only; no OAuth, no Drive)
# =========================
# Clones GIT_URL (optionally a specific branch) into TARGET, optionally
# promotes the contents of GIT_SUBDIR to the project root, and then checks
# that the expected top-level folders exist.
GIT_URL    = "https://github.com/<your-username>/<your-repo>.git"  # <-- set this
GIT_BRANCH = "main"      # change if you use another branch
GIT_SUBDIR = ""          # optional: if your project is NOT at repo root, set e.g. "generic-pipeline/"

import os, shutil, subprocess, sys

TARGET = "/content/project"

# Fail fast with an actionable message instead of a confusing git error.
if "<" in GIT_URL or ">" in GIT_URL:
    raise ValueError(
        "GIT_URL still contains the template placeholder — set it to your repository URL"
    )

# Start from a clean slate so re-running the cell is idempotent.
if os.path.isdir(TARGET) and not os.path.islink(TARGET):
    shutil.rmtree(TARGET, ignore_errors=True)
elif os.path.lexists(TARGET):
    os.remove(TARGET)

# Argument list (no shell) so URL / branch values are never shell-interpreted.
clone_cmd = ["git", "clone", "--depth", "1"]
if GIT_BRANCH:
    clone_cmd += ["--branch", GIT_BRANCH]
clone_cmd += [GIT_URL, TARGET]

clone_result = subprocess.run(clone_cmd, capture_output=True, text=True)
if clone_result.stdout.strip():
    print(clone_result.stdout.strip())
if clone_result.stderr.strip():
    # git reports progress on stderr even when it succeeds
    print(clone_result.stderr.strip())
if clone_result.returncode != 0:
    raise RuntimeError(
        f"git clone failed (exit code {clone_result.returncode}) for {GIT_URL}"
    )

# If repo content is inside a subfolder, move it up into /content/project
if GIT_SUBDIR:
    src = os.path.join(TARGET, GIT_SUBDIR)
    assert os.path.isdir(src), f"Subdir '{GIT_SUBDIR}' not found in cloned repo"

    # Stage the subdir outside TARGET first: an entry inside it may share a
    # name with something at the repo root (or with the subdir itself).
    staging = TARGET + ".subdir"
    shutil.rmtree(staging, ignore_errors=True)
    shutil.move(os.path.normpath(src), staging)

    for name in os.listdir(staging):
        dest = os.path.join(TARGET, name)
        # shutil.move would nest into an existing directory (e.g. data/data),
        # so clear any same-named entry and let the subdir's copy win.
        if os.path.isdir(dest) and not os.path.islink(dest):
            shutil.rmtree(dest)
        elif os.path.lexists(dest):
            os.remove(dest)
        shutil.move(os.path.join(staging, name), dest)

    shutil.rmtree(staging, ignore_errors=True)

# basic structure checks
must_dirs = ["configs", "data", "scripts"]
missing = [d for d in must_dirs if not os.path.isdir(os.path.join(TARGET, d))]
if missing:
    raise RuntimeError(f"Missing required folders in {TARGET}: {missing}")

print("[OK] Project ready at:", TARGET)
print("Contents:", os.listdir(TARGET))

In [None]:

import os, yaml, json, sys

PROJ = "/content/project"

# Make the project's helper modules under scripts/ importable.
# Guarded so re-running this cell does not keep appending duplicates.
_scripts_dir = os.path.join(PROJ, 'scripts')
if _scripts_dir not in sys.path:
    sys.path.append(_scripts_dir)


def _load_yaml_config(filename):
    """Parse one YAML file from ``<PROJ>/configs`` and return it as a dict.

    Args:
        filename: File name inside the ``configs`` folder.

    Returns:
        The parsed top-level mapping.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is empty or its top level is not a mapping.
    """
    path = os.path.join(PROJ, 'configs', filename)
    # Context manager so the handle is closed even if parsing fails.
    with open(path, 'r', encoding='utf-8') as fh:
        loaded = yaml.safe_load(fh)
    if not isinstance(loaded, dict):
        raise ValueError(
            f"{path} must contain a YAML mapping at the top level, got {type(loaded).__name__}"
        )
    return loaded


cfg_model   = _load_yaml_config('model_config.yaml')
cfg_train   = _load_yaml_config('training_config.yaml')
cfg_data    = _load_yaml_config('dataset_config.yaml')
cfg_usecase = _load_yaml_config('usecase_config.yaml')

# Single merged view of all configs.
cfg = {
  'model': cfg_model,
  'train': cfg_train,
  'data': cfg_data,
  'usecase': cfg_usecase
}

# default=str keeps the dump working for YAML values that JSON cannot encode
# natively (e.g. dates).
print(json.dumps(cfg, indent=2, default=str))


In [None]:

import os

# Copy Kaggle credentials from Colab's secret store into environment
# variables. Outside Colab the `google.colab` package is not importable, and
# whatever is already in the environment is left untouched.
try:
    from google.colab import userdata
except ImportError:
    userdata = None
    print("Colab userdata API not available — if using Kaggle Notebooks, you can ignore this.")

if userdata is not None:
    for _secret_name in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
        # Look up each secret independently so one missing or inaccessible
        # secret neither hides the other nor gets reported as "Colab not
        # available".
        try:
            _secret_value = userdata.get(_secret_name)
        except Exception as exc:
            print(f"[i] Colab secret {_secret_name} not readable ({type(exc).__name__}); "
                  "keeping any existing environment value")
            _secret_value = None
        # Only export real values: an empty-string variable would shadow the
        # "not configured" state.
        if _secret_value:
            os.environ[_secret_name] = _secret_value
    print("[OK] Kaggle credentials loaded (if provided)")


In [None]:

from prepare_dataset import validate_or_raise
from pathlib import Path
import json, os

data_dir = "/content/project/data"


def _resolve_split_path(configured):
    """Map a configured dataset path to the file that is actually read.

    Paths already starting with '/content' are used verbatim; anything else
    is reduced to its file name and looked up inside ``data_dir``.
    """
    if configured.startswith('/content'):
        return configured
    return os.path.join(data_dir, Path(configured).name)


train_path = _resolve_split_path(cfg['data']['train_path'])
eval_path = _resolve_split_path(cfg['data']['eval_path'])

print('[i] Train path:', train_path)
print('[i] Eval  path:', eval_path)

# Validate train first, then eval; the project helper raises on a bad file.
for _split_file in (train_path, eval_path):
    validate_or_raise(_split_file)
print('[OK] Dataset validated')

# Record the resolved locations on the shared config for later cells.
cfg['data'].update(_resolved_train=train_path, _resolved_eval=eval_path)


In [None]:

from load_model_generic import load_model_and_tokenizer

# Collect the loader arguments from the model config; quantization and dtype
# fall back to '4bit' / 'bfloat16' when the config does not set them.
model_cfg = cfg['model']
loader_kwargs = {
    'model_source': model_cfg['model_source'],
    'model_name': model_cfg['model_name'],
    'quantization': model_cfg.get('quantization', '4bit'),
    'device_map': 'auto',
    'torch_dtype': model_cfg.get('torch_dtype', 'bfloat16'),
}

tok, model, model_path = load_model_and_tokenizer(**loader_kwargs)
print('[OK] Model loaded from:', model_path)


In [None]:
from datasets import load_dataset

# Build the split -> file mapping from the paths stored in
# cfg['data']['_resolved_*'], then load both splits with the JSON loader.
split_files = {
    "train": cfg["data"]["_resolved_train"],
    "eval": cfg["data"]["_resolved_eval"],
}
ds = load_dataset("json", data_files=split_files)

def format_example(rec):
    """
    Turn a single record with messages -> a plain supervised example.

    Only the first "user" message and the first "assistant" message are used;
    a missing role is rendered as an empty string.

    Expected input format per line:
      {
        "messages": [
          {"role": "user", "content": "..."},
          {"role": "assistant", "content": "..."}
        ]
      }

    Returns:
        The example as one string in the
        "<user>...</user><assistant>...</assistant>" template.
    """
    msgs = rec["messages"]
    user = next((m["content"] for m in msgs if m["role"] == "user"), "")
    assistant = next((m["content"] for m in msgs if m["role"] == "assistant"), "")
    return f"<user>\n{user}\n</user>\n<assistant>\n{assistant}\n</assistant>"

def formatting_func(batch):
    """
    Format a batch of records into a list of training strings.

    Accepts any of the shapes a caller may pass:
      * a column-wise batch (mapping): {"messages": [[...], [...], ...]}
      * a single record (mapping):     {"messages": [{...}, {...}]}
      * an iterable of records:        [{"messages": [...]}, ...]

    Always returns a list of strings, one per record.
    """
    from collections.abc import Mapping

    if isinstance(batch, Mapping):
        messages_col = batch["messages"]
        # A single record holds message dicts directly; a column-wise batch
        # holds one list of message dicts per row.
        if messages_col and isinstance(messages_col[0], Mapping):
            return [format_example(batch)]
        # Iterating the mapping itself would yield column names, not rows,
        # so walk the "messages" column instead.
        return [format_example({"messages": msgs}) for msgs in messages_col]
    return [format_example(r) for r in batch]

print("[OK] Formatter ready")

In [None]:

from trl import SFTTrainer
from transformers import TrainingArguments
import torch

train_cfg = cfg['train']

# Mixed precision: fp16 and bf16 are mutually exclusive, so enable exactly
# one on GPU. bf16 needs native hardware support (compute capability >= 8);
# older GPUs fall back to fp16, and CPU runs use neither.
_has_cuda = torch.cuda.is_available()
_use_bf16 = _has_cuda and torch.cuda.get_device_capability()[0] >= 8
_use_fp16 = _has_cuda and not _use_bf16

training_args = TrainingArguments(
    output_dir='/content/output',
    # Explicit casts: YAML loaders can hand back strings for values such as
    # `2e-4` (no decimal point), which would otherwise reach the optimizer.
    num_train_epochs=float(train_cfg.get('epochs', 1)),
    per_device_train_batch_size=int(train_cfg.get('batch_size', 2)),
    per_device_eval_batch_size=int(train_cfg.get('batch_size', 2)),
    learning_rate=float(train_cfg.get('lr', 2e-4)),
    logging_steps=20,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    fp16=_use_fp16,
    bf16=_use_bf16,
    report_to=[]
)

# NOTE(review): no peft_config is passed here, so this relies on `model`
# already carrying its LoRA adapters from the loader — confirm against
# load_model_generic.
trainer = SFTTrainer(
    model=model,
    tokenizer=tok,
    train_dataset=ds['train'],
    eval_dataset=ds['eval'],
    formatting_func=formatting_func,
    max_seq_length=int(train_cfg.get('max_seq_length', 1024)),
    args=training_args,
)

trainer.train()
print('[OK] Training complete')


In [None]:

from export_adapters import export_adapters

# Gather the values recorded with the exported adapters, then hand
# everything to the project's export helper.
usecase_name = cfg['usecase']['usecase_name']
adapter_meta = {
    'model_source': cfg['model']['model_source'],
    'model_name': cfg['model']['model_name'],
    'method': cfg['train']['method'],
}

export_adapters(
    model=model,
    usecase_name=usecase_name,
    extra_meta=adapter_meta,
    base_dir='/content/adapters',
)


In [None]:

import json, random

# Smoke test: generate a reply for the first evaluation record.
sample = next(iter(ds['eval']))

# Use the first *user* turn (the record may start with another role); fall
# back to the first message when no user turn exists.
user_text = next(
    (m['content'] for m in sample['messages'] if m['role'] == 'user'),
    sample['messages'][0]['content'],
)

# Wrap the prompt in the same template the training examples use, and leave
# the assistant tag open so the model continues from there.
prompt = f"<user>\n{user_text}\n</user>\n<assistant>\n"

# Switch off dropout and other train-time behaviour before generating.
model.eval()

inputs = tok(prompt, return_tensors='pt').to(model.device)
out = model.generate(**inputs, max_new_tokens=200)
print(tok.decode(out[0], skip_special_tokens=True))
