# Generic Fine‑Tuning Pipeline (LoRA / QLoRA) — Colab

This notebook:
1) Installs dependencies  
2) Fetches your project repo to `/content/project`  
3) Reads YAML configs  
4) Loads model (Kaggle Models or Hugging Face)  
5) Validates dataset  
6) Fine‑tunes with **LoRA/QLoRA**  
7) Exports adapters + metadata  

> **Tip:** If you see organization Drive policy errors, use a personal Gmail or run this on **Kaggle Notebooks**.


In [10]:
%%bash
pip install -q transformers datasets peft accelerate bitsandbytes pyyaml kagglehub jsonschema


In [11]:

# =========================
# PROJECT FETCHER (Git mode only; no OAuth, no Drive)
# =========================
# Clones GIT_URL into TARGET, optionally promotes GIT_SUBDIR to the project root,
# and verifies that the folders required by this cell's structure check exist.
GIT_URL    = "https://github.com/Srienath2205/genericfinetuningpipeline.git"  # <-- set this
GIT_BRANCH = "main"      # change if you use another branch
GIT_SUBDIR = ""          # optional: if your project is NOT at repo root, set e.g. "generic-pipeline/"

import os, shutil, subprocess, tempfile

TARGET = "/content/project"


def _remove_path(path):
    """Delete `path` whether it is a directory tree, a file or a symlink; no-op if absent."""
    if os.path.isdir(path) and not os.path.islink(path):
        shutil.rmtree(path)
    elif os.path.lexists(path):
        os.remove(path)


# Clean target then clone
_remove_path(TARGET)

# Argument-list invocation (no shell) so branch/URL values are never re-parsed by a
# shell, and check=True so a failed clone raises here instead of surfacing later as
# a misleading "Missing required folders" error.
clone_cmd = ["git", "clone", "--depth", "1"]
if GIT_BRANCH:
    clone_cmd += ["--branch", GIT_BRANCH]
clone_cmd += ["--", GIT_URL, TARGET]
subprocess.run(clone_cmd, check=True)

# If repo content is inside a subfolder, move it up into /content/project
subdir = os.path.normpath(GIT_SUBDIR) if GIT_SUBDIR else os.curdir
if subdir != os.curdir:  # "." / "./" already means "repo root": nothing to promote
    if os.path.isabs(subdir) or subdir.split(os.sep)[0] == os.pardir:
        raise RuntimeError(f"GIT_SUBDIR must be a relative path inside the repo, got '{GIT_SUBDIR}'")
    src = os.path.join(TARGET, subdir)
    if not os.path.isdir(src):
        raise RuntimeError(f"Subdir '{GIT_SUBDIR}' not found in cloned repo")

    # Stage the subfolder OUTSIDE the clone first. Moving entries directly would
    # nest a directory inside a same-named directory at the repo root (e.g.
    # docs/ -> docs/docs/) and breaks when the subfolder contains an entry named
    # like the subfolder itself.
    staging_root = tempfile.mkdtemp(prefix=".subdir_", dir=os.path.dirname(TARGET))
    staged = os.path.join(staging_root, "payload")
    shutil.move(src, staged)

    # Drop parent directories of a nested subdir (e.g. "a" for "a/b") that are now
    # empty; parents that still hold other files are left untouched.
    parent = os.path.dirname(src)
    while parent != TARGET:
        try:
            os.rmdir(parent)
        except OSError:
            break
        parent = os.path.dirname(parent)

    # Promote every staged entry; on a name clash the subfolder's version wins.
    for name in os.listdir(staged):
        dest = os.path.join(TARGET, name)
        _remove_path(dest)
        shutil.move(os.path.join(staged, name), dest)
    shutil.rmtree(staging_root, ignore_errors=True)

# basic structure checks
must_dirs = ["configs", "data", "scripts"]
missing = [d for d in must_dirs if not os.path.isdir(os.path.join(TARGET, d))]
if missing:
    raise RuntimeError(f"Missing required folders in {TARGET}: {missing}")

print("[OK] Project ready at:", TARGET)
print("Contents:", os.listdir(TARGET))


[OK] Project ready at: /content/project
Contents: ['configs', 'data', 'requirements.txt', 'notebooks', '.git', 'docs', 'README.md', 'scripts']


In [12]:
# --- Self-contained dataset load + format cell ---

import os, sys, json
import yaml
from datasets import load_dataset

# --- Locate the project root and make its `scripts` package importable ---
# A BASE_PATH already defined in the notebook namespace always wins; otherwise
# prefer the cloned project directory and fall back to /content.
try:
    BASE_PATH
except NameError:
    if os.path.exists("/content/project"):
        BASE_PATH = "/content/project"
    else:
        BASE_PATH = "/content"

# Put the project root at the front of the import search path, but only once.
if not any(entry == BASE_PATH for entry in sys.path):
    sys.path.insert(0, BASE_PATH)

# --- Project utilities (importable only once BASE_PATH is on sys.path) ---
from scripts.prepare_dataset import validate_or_raise

# --- Helper to load YAML (re-)if needed ---
def load_yaml(path):
    """Parse the YAML file at `path` and return its content.

    Args:
        path: Location of the YAML config file.

    Returns:
        Whatever `yaml.safe_load` produces for the file. An empty document
        (which `safe_load` reports as None) is returned as an empty dict so
        callers can index or `.get()` the result without a None check.

    Raises:
        FileNotFoundError: If `path` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Config not found: {path}")
    # Explicit encoding: do not depend on the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        loaded = yaml.safe_load(f)
    return {} if loaded is None else loaded

# --- Bring configs into memory, keeping any copy already in the namespace ---
data_cfg = (
    globals()["data_cfg"]
    if "data_cfg" in globals()
    else load_yaml(f"{BASE_PATH}/configs/dataset_config.yaml")
)

# The use-case config is optional: a missing file yields an empty mapping.
if "usecase_cfg" not in globals():
    try:
        usecase_cfg = load_yaml(f"{BASE_PATH}/configs/usecase_config.yaml")
    except FileNotFoundError:
        usecase_cfg = {}

print("Using BASE_PATH:", BASE_PATH)

# --- Path resolver works for both relative and absolute paths ---
def resolve_path(p: str) -> str:
    """Anchor a relative path at BASE_PATH; hand absolute paths back untouched."""
    if os.path.isabs(p):
        return p
    return "{}/{}".format(BASE_PATH, p)

# Turn the configured dataset locations into concrete file paths
train_path, eval_path = (
    resolve_path(data_cfg[cfg_key]) for cfg_key in ("train_path", "eval_path")
)

# Run the project's validator on each file before loading: train first, then eval
validate_or_raise(train_path)
validate_or_raise(eval_path)

# Read both files with the "json" loader into the "train" and "eval" splits
dataset = load_dataset("json", data_files=dict(train=train_path, eval=eval_path))

# Format messages into a single text per example
def format_chat(example):
    """Flatten the chat turns in example["messages"] into one prompt string.

    User turns are rendered under an "### Instruction:" header and assistant
    turns under "### Response:"; turns with any other role are skipped.
    Returns a dict with the single key "text".
    """
    chunks = []
    for turn in example["messages"]:
        speaker = turn["role"]
        if speaker == "user":
            chunks.append(f"### Instruction:\n{turn['content']}\n")
        elif speaker == "assistant":
            chunks.append(f"### Response:\n{turn['content']}\n")
    return {"text": "".join(chunks)}

# Add the flattened "text" column to every split, then report the train schema.
dataset = dataset.map(function=format_chat)
print("Dataset prepared. Columns:", dataset["train"].column_names)


Using BASE_PATH: /content/project
[OK] /content/project/data/train.jsonl validated with 3 records
[OK] /content/project/data/eval.jsonl validated with 1 records


Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Dataset prepared. Columns: ['messages', 'text']


In [14]:
# Load model/tokenizer (with minimal Kaggle pre-check)

import os

# This cell depends on names created by earlier cells. Report all missing ones in
# a single actionable error instead of failing on the first bare NameError.
_required_names = ("model_cfg", "train_cfg", "load_model_and_tokenizer")
_absent_names = [name for name in _required_names if name not in globals()]
if _absent_names:
    raise NameError(
        f"Missing notebook state: {_absent_names}. Run the cell(s) that load the "
        "model/training configs and import load_model_and_tokenizer before this one."
    )

# --- Kaggle auth pre-check (only if you're using Kaggle Models) ---
if model_cfg.get("model_source") == "kaggle":
    # Credentials may come from environment variables or from a kaggle.json file
    # (in KAGGLE_CONFIG_DIR, default ~/.kaggle). Other mechanisms are not detected here.
    _has_env_creds = bool(os.environ.get("KAGGLE_USERNAME") and os.environ.get("KAGGLE_KEY"))
    _kaggle_json = os.path.join(
        os.environ.get("KAGGLE_CONFIG_DIR", os.path.expanduser("~/.kaggle")),
        "kaggle.json",
    )
    if not (_has_env_creds or os.path.isfile(_kaggle_json)):
        raise RuntimeError(
            "Kaggle authentication missing. Set KAGGLE_USERNAME and KAGGLE_KEY in this runtime "
            "before loading Kaggle Models. Example:\n\n"
            "import os\n"
            "os.environ['KAGGLE_USERNAME'] = '<your-kaggle-username>'\n"
            "os.environ['KAGGLE_KEY'] = '<your-kaggle-api-key>'\n\n"
            "Also make sure you've accepted the model's license/terms on its Kaggle Models page."
        )
    # Optional: print the installed kagglehub version (useful when debugging auth issues)
    try:
        import kagglehub
        print("kagglehub version:", getattr(kagglehub, "__version__", "unknown"))
    except Exception as exc:
        # Best-effort diagnostic only; the actual load below reports real failures.
        print("[WARN] Could not import kagglehub:", exc)

# --- Actual load ---
tokenizer, model, model_path = load_model_and_tokenizer(
    model_source=model_cfg["model_source"],
    model_name=model_cfg["model_name"],
    quantization=model_cfg.get("quantization", "4bit"),
    device_map=model_cfg.get("device_map", "auto"),
    torch_dtype=model_cfg.get("torch_dtype", "bfloat16"),
    for_training=(train_cfg["method"] in ["lora", "qlora"]),
)

# Padding setup: reuse EOS only when the tokenizer has no pad token of its own.
# Overwriting an existing pad token with EOS makes padding indistinguishable from
# a genuine end-of-sequence token once labels are masked on the pad id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if hasattr(model, "config"):
    model.config.pad_token_id = tokenizer.pad_token_id

# Optional: gradient checkpointing to reduce memory
if hasattr(model, "gradient_checkpointing_enable"):
    try:
        model.gradient_checkpointing_enable()
    except Exception as exc:
        # Still best-effort, but say so: training will simply use more memory.
        print("[WARN] Gradient checkpointing not enabled:", exc)

print("Model loaded:", model_path)



KaggleApiHTTPError: 403 Client Error.

You don't have permission to access resource at URL: https://www.kaggle.com/models/google/gemma-3/pyTorch/gemma-3-1b-it/1. The server reported the following issues: Permission denied on resource (or it may not exists).
Please make sure you are authenticated if you are trying to access a private resource or a resource requiring consent.

In [None]:

# Upper bound on tokens per example; longer texts are truncated.
MAX_LEN = train_cfg.get("max_seq_length", 512)

def tokenize(example):
    """Tokenize the "text" field of a batch of examples, truncating to MAX_LEN.

    No padding is applied here. Padding every example to MAX_LEN would make all
    sequences the same length, which defeats length-based batching and spends
    compute on pad tokens; padding is left to the data collator, per batch.

    Args:
        example: A batch mapping whose "text" entry is a list of strings
            (this function is mapped with batched=True).

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=MAX_LEN,
    )

tokenized = dataset.map(
    tokenize,
    batched=True,  # tokenize many rows per call instead of one at a time
    remove_columns=dataset["train"].column_names,
)
print("Tokenization complete. Example keys:", tokenized["train"].column_names)


In [None]:

# Imported here so the cell does not rely on names leaked from another cell.
from peft import LoraConfig, PeftModel, TaskType, get_peft_model

# Attach LoRA adapters when the training method asks for them; otherwise the
# model is left as loaded for full fine-tuning.
if train_cfg["method"] in ["lora", "qlora"]:
    if isinstance(model, PeftModel):
        # Re-running this cell must not wrap an already-wrapped model a second time.
        print("LoRA/QLoRA adapters already attached; skipping.")
    else:
        lora_cfg = LoraConfig(
            r=train_cfg["lora"]["r"],
            lora_alpha=train_cfg["lora"]["alpha"],
            target_modules=train_cfg["lora"]["target_modules"],
            lora_dropout=train_cfg["lora"]["dropout"],
            task_type=TaskType.CAUSAL_LM,
        )
        model = get_peft_model(model, lora_cfg)
        print("LoRA/QLoRA adapters attached.")
else:
    print("Full finetuning mode.")


In [None]:

import inspect

import torch
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` across transformers releases; use whichever the installed
# version accepts (the install cell does not pin a version).
_ta_params = inspect.signature(TrainingArguments).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"

# Match mixed precision to the loaded weights instead of hard-coding fp16:
# bf16 weights on bf16-capable hardware train in bf16, any other CUDA setup
# keeps fp16, and neither is enabled without a GPU.
# NOTE(review): assumes `model.dtype` reflects the dtype the model was loaded in — confirm for quantized loads.
_has_cuda = torch.cuda.is_available()
_use_bf16 = bool(
    _has_cuda
    and getattr(model, "dtype", None) == torch.bfloat16
    and torch.cuda.is_bf16_supported()
)
_use_fp16 = _has_cuda and not _use_bf16

training_args = TrainingArguments(
    output_dir="/content/output",
    per_device_train_batch_size=train_cfg["batch_size"],
    gradient_accumulation_steps=train_cfg["gradient_accumulation_steps"],
    num_train_epochs=train_cfg["epochs"],
    learning_rate=float(train_cfg["learning_rate"]),  # YAML may parse values like 2e-4 as a string
    logging_steps=train_cfg["logging_steps"],
    eval_steps=train_cfg["eval_steps"],
    save_steps=train_cfg["save_steps"],
    save_total_limit=2,
    fp16=_use_fp16,
    bf16=_use_bf16,
    report_to="none",
    group_by_length=True,
    **{_eval_strategy_key: "steps"},
)

# Causal-LM collation (mlm=False): pads each batch and derives labels from input_ids.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Newer Trainer versions take the tokenizer as `processing_class`; older ones as `tokenizer`.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    data_collator=collator,
    **{_tokenizer_key: tokenizer},
)

trainer.train()
print("Training complete.")


In [None]:

# The use-case config is optional elsewhere in this notebook (a missing file
# yields an empty dict), so do not let a missing "usecase_name" raise KeyError
# after training has already finished; fall back to a placeholder name.
_usecase_name = usecase_cfg.get("usecase_name") or "default"
if "usecase_name" not in usecase_cfg:
    print(f"[WARN] usecase_name not configured; exporting under '{_usecase_name}'.")

# Export the trained model's adapters, recording the base model name as metadata.
export_adapters(
    model,
    usecase_name=_usecase_name,
    extra_meta={"model": model_cfg["model_name"]},
)
