# Phase 3: QLoRA Fine-Tuning — Qwen2-VL-2B on OpenPack

🔗 **Live Kaggle Notebook:** [PASTE YOUR KAGGLE URL HERE AFTER RUNNING]

Fine-tunes Qwen2-VL-2B-Instruct with 4-bit QLoRA on the OpenPack packaging-operations dataset.

**Target compute:** Kaggle 2×T4 (32 GB) or GCP Vertex AI A100 (40 GB)

In [1]:
import shutil
import subprocess

import torch

# Driver-level view of the GPUs. `nvidia-smi` is not on PATH on CPU-only
# hosts; invoking it unconditionally raises FileNotFoundError and aborts the
# cell before the PyTorch check below gets a chance to run.
if shutil.which("nvidia-smi") is None:
    print("nvidia-smi not found on PATH")
else:
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    print(result.stdout)
    if result.returncode != 0:
        # Surface the tool's own error text rather than an empty table.
        print(f"nvidia-smi exited with code {result.returncode}: {result.stderr.strip()}")

# PyTorch-level view: one line per visible CUDA device (decimal GB).
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        total = props.total_memory / 1e9
        print(f"GPU {i}: {props.name} | {total:.1f} GB")
else:
    print("No GPU detected")

ModuleNotFoundError: No module named 'torch'

In [2]:
!pip install -q transformers==4.41.2 accelerate==0.30.1 peft==0.11.1
!pip install -q bitsandbytes==0.43.1 trl==0.8.6
!pip install -q decord webdataset einops qwen-vl-utils
!pip install -q datasets huggingface-hub openpack-toolkit
print("âœ“ All packages installed")


[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


âœ“ All packages installed



[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# ── REQUIRED VRAM Budget Calculation ──────────────────────────────────────────

# Back-of-the-envelope VRAM budget for the QLoRA run. All sizes are decimal GB.
model_base_4bit  = 2.0    # GB — Qwen2-VL-2B at 4-bit (2B params × 0.5 bytes)
lora_adapters    = 0.3    # GB — LoRA rank=16, targeting q/k/v/o projections
frames_per_clip  = 8      # Frames sampled per 5-second clip
frame_tokens     = 256    # Visual tokens per frame (14×14 patches + merge)
batch_size       = 2
token_hidden_dim = 1536   # Qwen2-VL-2B hidden size (from config.json)

# Raw activation memory: tokens × hidden size × 2 bytes (fp16).
# NOTE(review): this counts a single (tokens × hidden) tensor only; per-layer
# activations and text tokens are not part of the formula, so treat the result
# as a floor rather than a prediction.
activation_gb = (frames_per_clip * frame_tokens * batch_size * token_hidden_dim * 2) / 1e9

# With gradient checkpointing: 40% stored (rest recomputed on backward pass)
activation_with_gc = activation_gb * 0.4

# Optimizer (AdamW): 2 momentum states per LoRA param
optimizer_gb = lora_adapters * 2

total_vram_gb = model_base_4bit + lora_adapters + activation_with_gc + optimizer_gb

print("─" * 44)
print(f"  Model (4-bit):           {model_base_4bit:.2f} GB")
print(f"  LoRA adapters:           {lora_adapters:.2f} GB")
print(f"  Activations (raw):       {activation_gb:.2f} GB")
print(f"  Activations (+GC 0.4×):  {activation_with_gc:.2f} GB")
print(f"  Optimizer states:        {optimizer_gb:.2f} GB")
print("  " + "─" * 41)
print(f"  TOTAL ESTIMATED VRAM:    {total_vram_gb:.2f} GB")
print("─" * 44)

# One verdict line per target device; capacity is the advertised total in GB.
for label, capacity_gb in (("T4  (16 GB)", 16), ("2×T4(32 GB)", 32), ("A100(40 GB)", 40)):
    verdict = "✓ FITS" if total_vram_gb < capacity_gb else "✗ OOM"
    print(f"{label}: {verdict}")

# Hard gate: the plan must fit a single T4 before any model is loaded.
assert total_vram_gb < 16.0, f"Estimate {total_vram_gb:.2f} GB exceeds single T4!"
print("\n✓ VRAM math passes T4 assertion")

â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
  Model (4-bit):           2.00 GB
  LoRA adapters:           0.30 GB
  Activations (raw):       0.01 GB
  Activations (+GC 0.4Ã—):  0.01 GB
  Optimizer states:        0.60 GB
  â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
  TOTAL ESTIMATED VRAM:    2.91 GB
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
T4  (16 GB): âœ“ FITS
2Ã—T4(32 GB): âœ“ FITS
A100(40 GB): âœ“ FITS

âœ“ VRAM math passes T4 assertion


In [4]:
from dataclasses import dataclass, field
from pathlib import Path

@dataclass
class Config:
    """Single source of truth for paths and hyperparameters of the QLoRA run.

    Field order is part of the positional constructor signature and must not
    be rearranged.
    """

    # --- Model and filesystem locations ---
    model_name: str = "Qwen/Qwen2-VL-2B-Instruct"
    data_root: str = "/kaggle/input/openpack-dataset"  # adjust for GCP
    output_dir: str = "/kaggle/working/checkpoints"

    # --- LoRA adapter ---
    lora_rank: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.1
    # A factory is required because dataclasses reject mutable defaults.
    lora_targets: list = field(
        default_factory=lambda: ["q_proj", "v_proj", "k_proj", "o_proj"]
    )

    # --- Optimisation schedule ---
    epochs: int = 3
    batch_size: int = 2
    grad_accum: int = 8  # effective batch = 16
    lr: float = 2e-4
    warmup: float = 0.05
    weight_decay: float = 0.01

    # --- Memory savers ---
    use_4bit: bool = True
    grad_ckpt: bool = True

    # --- Checkpoint / eval / logging cadence, in optimiser steps ---
    save_steps: int = 50
    save_limit: int = 3
    eval_steps: int = 100
    log_steps: int = 10

    # --- Clip sampling and sequence budget ---
    frames: int = 8
    max_seq_len: int = 2048

# Build the run configuration with its defaults and make sure the checkpoint
# directory (including any missing parents) exists before later cells write to it.
cfg = Config()
Path(cfg.output_dir).mkdir(
    exist_ok=True,
    parents=True,
)
print(f"Config ready. Output dir: {cfg.output_dir}")

Config ready. Output dir: /kaggle/working/checkpoints


In [5]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit quantization: NF4 weights with double quantization, fp16 compute.
# Skipped entirely (None) when cfg.use_4bit is off.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
) if cfg.use_4bit else None

# Load base model
print(f"Loading {cfg.model_name}...")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    cfg.model_name,
    quantization_config=bnb_cfg,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(cfg.model_name)
print(f"Loaded. Total params: {sum(p.numel() for p in model.parameters())/1e9:.2f}B")

# Prepare for k-bit training (REQUIRED before LoRA).
# The helper turns gradient checkpointing on by default; pass the config flag
# through so that cfg.grad_ckpt=False is actually honoured.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=cfg.grad_ckpt,
)

# Apply LoRA
lora_cfg = LoraConfig(
    r=cfg.lora_rank,
    lora_alpha=cfg.lora_alpha,
    target_modules=cfg.lora_targets,
    lora_dropout=cfg.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# Enable gradient checkpointing — all 3 flags required together
# (the third one is `gradient_checkpointing` in TrainingArguments).
if cfg.grad_ckpt:
    model.gradient_checkpointing_enable()    # Flag 1
    model.enable_input_require_grads()       # Flag 2 (needed with PEFT)
    print("✓ Gradient checkpointing enabled")

# Show actual VRAM after load. device_map="auto" may place layers on several
# GPUs, and the no-argument torch.cuda.memory_* calls report only the current
# device, so query every device explicitly.
if torch.cuda.is_available():
    print()
    for i in range(torch.cuda.device_count()):
        alloc = torch.cuda.memory_allocated(i) / 1e9
        resrv = torch.cuda.memory_reserved(i)  / 1e9
        print(f"Actual VRAM — GPU {i} — Allocated: {alloc:.2f} GB | Reserved: {resrv:.2f} GB")

ImportError: cannot import name 'Qwen2VLForConditionalGeneration' from 'transformers' (C:\Users\Satyam kumar\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\__init__.py)

In [None]:
import sys
sys.path.insert(0, "/kaggle/working")   # so data_pipeline.py can be imported

from pathlib import Path
from data_pipeline import build_hf_dataset, TRAIN_SUBJECTS, VAL_SUBJECTS

# Locations: raw dataset root from the config, plus a working-directory cache
# that is handed to build_hf_dataset for both splits.
data_root   = Path(cfg.data_root)
frame_cache = Path("/kaggle/working/frame_cache")

print("Building training dataset...")
train_ds = build_hf_dataset(data_root, TRAIN_SUBJECTS, frame_cache)
print(f"  Train: {len(train_ds)} examples")

print("Building validation dataset...")
val_ds = build_hf_dataset(data_root, VAL_SUBJECTS, frame_cache)
print(f"  Val:   {len(val_ds)} examples")

# Fail with a readable message instead of a bare IndexError on train_ds[0]
# when the pipeline produced no examples (e.g. wrong data_root).
assert len(train_ds) > 0, f"No training examples were built from {data_root}"

# Sanity check: show the first example's id, labels and conversation roles.
s = train_ds[0]
print(f"\nSample: {s['clip_id']}")
print(f"Operation: {s['operation']} → Next: {s['next_operation']}")
print(f"Turns: {[m['role'] for m in s['messages']]}")

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

class Collator:
    """Qwen2-VL multimodal collator: converts dataset rows to model input batches.

    Each row must provide a ``messages`` list understood by the processor's
    chat template and may provide an ``images`` list. The returned batch is the
    processor output plus a ``labels`` tensor cloned from ``input_ids`` in which
    padding and vision placeholder tokens are replaced by ``IGNORE_INDEX`` so
    they never act as training targets.

    Args:
        proc: processor exposing ``apply_chat_template``, ``__call__`` and a
            ``tokenizer`` attribute.
        max_len: truncation length forwarded to the processor as ``max_length``.
    """

    # Label value written over positions that must not be trained on.
    IGNORE_INDEX = -100
    # Placeholder tokens of the Qwen2-VL vocabulary that mark/stand in for
    # image patches; they carry no text to predict.
    VISION_TOKENS = ("<|vision_start|>", "<|vision_end|>", "<|image_pad|>")

    def __init__(self, proc, max_len=2048):
        self.proc    = proc
        self.max_len = max_len

    def _masked_token_ids(self):
        """Return the set of token ids to blank out in the labels."""
        tok = self.proc.tokenizer
        candidates = [tok.pad_token_id]
        candidates.extend(tok.convert_tokens_to_ids(list(self.VISION_TOKENS)))
        # A tokenizer that does not know a token may answer with None or with
        # its unknown-token id; neither must be masked.
        unk_id = getattr(tok, "unk_token_id", None)
        return {tid for tid in candidates if tid is not None and tid != unk_id}

    def __call__(self, examples):
        """Render, tokenize and pad ``examples``; return the batch with labels."""
        texts = [
            self.proc.apply_chat_template(
                ex["messages"], tokenize=False, add_generation_prompt=False
            )
            for ex in examples
        ]
        imgs = [ex.get("images", []) for ex in examples]

        # NOTE(review): truncation can cut image placeholder tokens when the
        # visual token count approaches max_len — confirm max_len leaves room
        # for all frames plus the text.
        batch = self.proc(
            text=texts,
            images=imgs if any(imgs) else None,   # text-only batch -> no images arg
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        labels = batch["input_ids"].clone()
        for token_id in self._masked_token_ids():
            labels[labels == token_id] = self.IGNORE_INDEX
        batch["labels"] = labels
        return batch

# Batch builder for the trainer; the truncation length comes from the config.
collator = Collator(processor, max_len=cfg.max_seq_len)

train_args = TrainingArguments(
    # -- Output, reporting, reproducibility --
    output_dir=cfg.output_dir,
    report_to="none",
    seed=42,
    logging_steps=cfg.log_steps,
    # -- Batch geometry --
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=cfg.grad_accum,  # effective batch = 16
    num_train_epochs=cfg.epochs,
    # -- Optimiser and learning-rate schedule --
    optim="adamw_torch",
    learning_rate=cfg.lr,
    weight_decay=cfg.weight_decay,
    lr_scheduler_type="cosine",
    warmup_ratio=cfg.warmup,
    # -- Precision and memory --
    fp16=True,
    gradient_checkpointing=cfg.grad_ckpt,  # Flag 3
    # -- Checkpoint and evaluation cadence --
    save_strategy="steps",
    save_steps=cfg.save_steps,
    save_total_limit=cfg.save_limit,
    eval_strategy="steps",
    eval_steps=cfg.eval_steps,
    # Keep every dataset column: the collator reads fields by name.
    remove_unused_columns=False,
)

print(f"Effective batch size: {cfg.batch_size * cfg.grad_accum}")

In [None]:
from pathlib import Path

# Check for existing checkpoint to resume from.
# Checkpoint folders are named "checkpoint-<step>". A plain string sort ranks
# "checkpoint-50" after "checkpoint-100", so order by the numeric step instead
# and ignore anything that is not a directory with a numeric suffix.
ckpt_dir    = Path(cfg.output_dir)
resume_ckpt = None
checkpoints = sorted(
    (
        p for p in ckpt_dir.glob("checkpoint-*")
        if p.is_dir() and p.name.rsplit("-", 1)[-1].isdigit()
    ),
    key=lambda p: int(p.name.rsplit("-", 1)[-1]),
)
if checkpoints:
    resume_ckpt = str(checkpoints[-1])   # highest step = most recent
    print(f"Resuming from: {resume_ckpt}")
else:
    print("Starting fresh training")

# The custom collator performs chat templating and tokenisation itself, so
# SFTTrainer must not try to tokenise a text column of its own:
# `skip_prepare_dataset` hands the datasets through untouched and
# `dataset_text_field` is only a placeholder that is never read.
trainer = SFTTrainer(
    model              = model,
    args               = train_args,
    train_dataset      = train_ds,
    eval_dataset       = val_ds,
    data_collator      = collator,
    tokenizer          = processor.tokenizer,
    dataset_text_field = "text",
    dataset_kwargs     = {"skip_prepare_dataset": True},
)

print("Starting QLoRA fine-tuning...")
# resume_ckpt is None for a fresh run, otherwise a checkpoint directory path.
result = trainer.train(resume_from_checkpoint=resume_ckpt)

# Save final checkpoint (adapter weights + processor files side by side)
final = f"{cfg.output_dir}/lora_final"
model.save_pretrained(final)
processor.save_pretrained(final)
print(f"\n✓ Done! Checkpoint saved → {final}")
print("\nMetrics:", result.metrics)

In [None]:
# Compare measured peak VRAM with the earlier estimate.
if torch.cuda.is_available():
    # Peak allocation per device, in decimal GB.
    gpu_peaks = [
        torch.cuda.max_memory_allocated(i) / 1e9
        for i in range(torch.cuda.device_count())
    ]
    for i, gpu_peak in enumerate(gpu_peaks):
        print(f"GPU {i} peak: {gpu_peak:.2f} GB")

    # The estimate covers the whole model. The model is loaded with
    # device_map="auto", which can spread it over several GPUs, so the
    # comparable figure is the sum over devices — not whichever GPU happened
    # to be last in the loop.
    # NOTE(review): if each GPU held a full replica instead, use max(gpu_peaks).
    peak = sum(gpu_peaks)

    print(f"\nVRAM estimate (Cell 4): {total_vram_gb:.2f} GB")
    ratio = peak / total_vram_gb
    print(f"Ratio actual/estimate:  {ratio:.2f}×")
    status = "✓ Self-consistent" if ratio < 1.5 else "⚠ Underestimated"
    print(status)

In [None]:
from PIL import Image
import json

# Put the model in evaluation mode for the smoke test below.
model.eval()

# Create dummy test image: eight identical solid-colour frames stand in for a clip.
test_imgs = [
    Image.new("RGB", (336, 336), color=(100, 80, 60))
    for _ in range(8)
]

# Single user turn: every frame first, then the instruction text.
messages = [{"role": "user", "content": []}]
messages[0]["content"].extend({"type": "image", "image": im} for im in test_imgs)
messages[0]["content"].append({
    "type": "text",
    "text": (
        'Analyze this warehouse packaging video. Reply with JSON: '
        '{"dominant_operation":"<op>","temporal_segment":{"start_frame":0,"end_frame":0},'
        '"anticipated_next_operation":"<op>","confidence":0.9}'
    ),
})

from qwen_vl_utils import process_vision_info

# Render the prompt (with the assistant generation prefix) and collect the
# vision inputs referenced by the messages.
text = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
img_inp, vid_inp = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=img_inp,
    videos=vid_inp,
    return_tensors="pt",
)

# Move every tensor to the device that holds the model's first parameter.
device = next(model.parameters()).device
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

with torch.no_grad():
    out = model.generate(max_new_tokens=200, do_sample=False, **inputs)

# Slice off the prompt so only the newly generated tokens are decoded.
resp = processor.batch_decode(
    out[:, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
)[0]
print("Model response:\n", resp)
print("Model response:\n", resp)