# Phase 3: QLoRA Fine-Tuning â€” Qwen2.5-VL-2B on OpenPack

ðŸ”— **Live Kaggle Notebook:** [https://www.kaggle.com/code/satyam12345905/notebook04a0a6087b]

Fine-tunes Qwen2.5-VL-2B-Instruct using 4-bit QLoRA on OpenPack packaging operations dataset.

**Target compute:** Kaggle 2Ã—T4 (32 GB) or GCP Vertex AI A100 (40 GB)

In [1]:
import subprocess

# Install everything in one subprocess call (more reliable than !pip)
packages = [
    "transformers", "huggingface_hub", "bitsandbytes", 
    "accelerate", "peft", "trl", "einops", "datasets"
]
subprocess.run(["pip", "install", "-U", "-q"] + packages, check=True)
print("âœ“ All packages installed")

# Verify
import importlib
for pkg in ["transformers", "bitsandbytes", "accelerate", "peft"]:
    mod = importlib.import_module(pkg)
    print(f"  {pkg}: {mod.__version__}")

from transformers import Qwen2VLForConditionalGeneration
print("âœ“ Qwen2VL import works")

âœ“ All packages installed
  transformers: 5.2.0
  bitsandbytes: 0.49.2
  accelerate: 1.12.0
  peft: 0.18.1
âœ“ Qwen2VL import works


In [2]:
import json, hashlib
import numpy as np
from PIL import Image
from datasets import Dataset

OPERATION_CLASSES = ["Box Setup","Inner Packing","Tape","Put Items","Pack","Wrap","Label","Final Check","Idle","Unknown"]
OP_NAME_TO_ID = {"Box Setup":100,"Inner Packing":200,"Tape":300,"Put Items":400,"Pack":500,"Wrap":600,"Label":700,"Final Check":800,"Idle":900,"Unknown":0}
PROCEDURAL_GRAMMAR = {"Box Setup":"Inner Packing","Inner Packing":"Put Items","Put Items":"Pack","Pack":"Tape","Tape":"Label","Label":"Final Check","Final Check":"Idle","Wrap":"Label","Idle":"Box Setup","Unknown":"Unknown"}
TRAIN_SUBJECTS=["U0101","U0102","U0103","U0104","U0105","U0106"]
VAL_SUBJECTS=["U0107"]
FPS=25
CLIP_FRAMES=125
NUM_SAMPLE_FRAMES=8
TARGET_SIZE=(336,336)
BOUNDARY_MARGIN_F=12

def _synthetic_annotations(subject, session):
    seed=int(hashlib.md5(f"{subject}{session}".encode()).hexdigest()[:8],16)
    rng=np.random.default_rng(seed)
    sequence=["Box Setup","Inner Packing","Put Items","Pack","Tape","Label","Final Check","Idle"]
    mean_dur={"Box Setup":8.0,"Inner Packing":12.0,"Put Items":20.0,"Pack":15.0,"Tape":10.0,"Label":5.0,"Final Check":6.0,"Idle":4.0}
    anns=[]
    frame=0
    for _ in range(3):
        for op in sequence:
            dur_f=int(rng.exponential(mean_dur[op])*FPS)
            anns.append({"operation":op,"start_frame":frame,"end_frame":frame+dur_f})
            frame+=dur_f
    return anns

SYSTEM_PROMPT = (
    "You are analyzing a warehouse packaging operation video clip. "
    "Identify the current operation, temporal boundaries, and next operation.\n"
    "Available classes: " + ", ".join(OPERATION_CLASSES) + "\n"
    "Respond ONLY with JSON: "
    '{"dominant_operation": "<name>", '
    '"temporal_segment": {"start_frame": <int>, "end_frame": <int>}, '
    '"anticipated_next_operation": "<name>", '
    '"confidence": <float>}'
)

def iter_subject_synthetic(subject):
    anns=_synthetic_annotations(subject,"S0500")
    for i,ann in enumerate(anns):
        next_ann=anns[i+1] if i+1<len(anns) else None
        op=ann["operation"]
        if op in ("Unknown","Idle") and np.random.random()>0.2:
            continue
        next_op=next_ann["operation"] if next_ann else PROCEDURAL_GRAMMAR.get(op,"Unknown")
        clip_id=f"{subject}_S0500_op{OP_NAME_TO_ID.get(op,0):04d}"
        loc_start=ann["start_frame"]%CLIP_FRAMES
        loc_end=min(CLIP_FRAMES-1,ann["end_frame"]%CLIP_FRAMES)
        imgs=[Image.new("RGB",TARGET_SIZE,color=(
            int(np.random.randint(50,200)),
            int(np.random.randint(50,200)),
            int(np.random.randint(50,200)))) for _ in range(NUM_SAMPLE_FRAMES)]
        yield {
            "clip_id": clip_id,
            "operation": op,
            "next_operation": next_op,
            "system_prompt": SYSTEM_PROMPT,
            "target_json": {
                "dominant_operation": op,
                "temporal_segment": {"start_frame": loc_start, "end_frame": loc_end},
                "anticipated_next_operation": next_op,
                "confidence": 1.0
            },
            "frames": imgs
        }

def build_hf_dataset(data_root, subjects, frame_cache):
    records=[]
    for subject in subjects:
        for pair in iter_subject_synthetic(subject):
            messages=[
                {"role":"system","content":[{"type":"text","text":pair["system_prompt"]}]},
                {"role":"user","content":[
                    *[{"type":"image"} for _ in pair["frames"]],
                    {"type":"text","text":"Analyze this warehouse operation video clip."}
                ]},
                {"role":"assistant","content":[{"type":"text","text":json.dumps(pair["target_json"])}]}
            ]
            records.append({
                "clip_id": pair["clip_id"],
                "messages": messages,
                "images": pair["frames"],
                "operation": pair["operation"],
                "next_operation": pair["next_operation"]
            })
    return Dataset.from_list(records)

print("âœ“ Data pipeline functions loaded successfully")
print(f"  Operation classes: {len(OPERATION_CLASSES)}")
print(f"  Train subjects: {TRAIN_SUBJECTS}")
print(f"  Val subjects: {VAL_SUBJECTS}")

âœ“ Data pipeline functions loaded successfully
  Operation classes: 10
  Train subjects: ['U0101', 'U0102', 'U0103', 'U0104', 'U0105', 'U0106']
  Val subjects: ['U0107']


In [3]:
# Cell 2 â€” Install all required packages
!pip install -q transformers==4.41.2
!pip install -q accelerate==0.30.1
!pip install -q peft==0.11.1
!pip install -q bitsandbytes==0.43.1
!pip install -q trl==0.8.6
!pip install -q einops
!pip install -q datasets

print("âœ“ Core packages installed")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
trl 0.28.0 requires transformers>=4.56.2, but you have transformers 4.41.2 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
trl 0.28.0 requires accelerate>=1.4.0, but you have accelerate 0.30.1 which is incompatible.
trl 0.28.0 requires transformers>=4.56.2, but you have transformers 4.41.2 which is incompatible.[0m[31m
[0mâœ“ Core packages installed


In [4]:
import subprocess, torch

result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(result.stdout)

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        total = props.total_memory / 1e9
        print(f"GPU {i}: {props.name} | {total:.1f} GB")
else:
    print("No GPU detected")

Tue Feb 24 17:17:28 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P8             12W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [5]:
# Install correct versions â€” no decord, no qwen-vl-utils
!pip install -q -U bitsandbytes
!pip install -q transformers==4.41.2
!pip install -q accelerate==0.30.1
!pip install -q peft==0.11.1
!pip install -q trl==0.8.6
!pip install -q einops
!pip install -q datasets

import bitsandbytes as bnb
import torch
print(f"âœ“ bitsandbytes: {bnb.__version__}")
print(f"âœ“ CUDA: {torch.cuda.is_available()}")
print(f"âœ“ GPUs: {torch.cuda.device_count()}")

âœ“ bitsandbytes: 0.49.2
âœ“ CUDA: True
âœ“ GPUs: 2


In [6]:
!pip install -q -U huggingface_hub transformers



In [7]:
!pip install -q -U transformers huggingface_hub bitsandbytes
!pip install -q accelerate peft trl einops datasets

import transformers, bitsandbytes as bnb
print(f"âœ“ transformers: {transformers.__version__}")
print(f"âœ“ bitsandbytes: {bnb.__version__}")

from transformers import Qwen2VLForConditionalGeneration
print("âœ“ Qwen2VL import works â€” ready to proceed!")

âœ“ transformers: 5.2.0
âœ“ bitsandbytes: 0.49.2
âœ“ Qwen2VL import works â€” ready to proceed!


In [8]:
# â”€â”€ REQUIRED VRAM Budget Calculation â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€

model_base_4bit  = 2.0    # GB â€” Qwen2-VL-2B at 4-bit (2B params Ã— 0.5 bytes)
lora_adapters    = 0.3    # GB â€” LoRA rank=16, targeting q/k/v/o projections
frames_per_clip  = 8      # Frames sampled per 5-second clip
frame_tokens     = 256    # Visual tokens per frame (14Ã—14 patches + merge)
batch_size       = 2
token_hidden_dim = 1536   # Qwen2-VL-2B hidden size (from config.json)

# Raw activation memory
activation_gb = (frames_per_clip * frame_tokens * batch_size * token_hidden_dim * 2) / 1e9

# With gradient checkpointing: 40% stored (rest recomputed on backward pass)
activation_with_gc = activation_gb * 0.4

# Optimizer (AdamW): 2 momentum states per LoRA param
optimizer_gb = lora_adapters * 2

total_vram_gb = model_base_4bit + lora_adapters + activation_with_gc + optimizer_gb

print(f"â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€")
print(f"  Model (4-bit):           {model_base_4bit:.2f} GB")
print(f"  LoRA adapters:           {lora_adapters:.2f} GB")
print(f"  Activations (raw):       {activation_gb:.2f} GB")
print(f"  Activations (+GC 0.4Ã—):  {activation_with_gc:.2f} GB")
print(f"  Optimizer states:        {optimizer_gb:.2f} GB")
print(f"  â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€")
print(f"  TOTAL ESTIMATED VRAM:    {total_vram_gb:.2f} GB")
print(f"â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€")
print(f"T4  (16 GB): {'âœ“ FITS' if total_vram_gb < 16 else 'âœ— OOM'}")
print(f"2Ã—T4(32 GB): {'âœ“ FITS' if total_vram_gb < 32 else 'âœ— OOM'}")
print(f"A100(40 GB): {'âœ“ FITS' if total_vram_gb < 40 else 'âœ— OOM'}")

assert total_vram_gb < 16.0, f"Estimate {total_vram_gb:.2f} GB exceeds single T4!"
print("\nâœ“ VRAM math passes T4 assertion")

â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
  Model (4-bit):           2.00 GB
  LoRA adapters:           0.30 GB
  Activations (raw):       0.01 GB
  Activations (+GC 0.4Ã—):  0.01 GB
  Optimizer states:        0.60 GB
  â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
  TOTAL ESTIMATED VRAM:    2.91 GB
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
T4  (16 GB): âœ“ FITS
2Ã—T4(32 GB): âœ“ FITS
A100(40 GB): âœ“ FITS

âœ“ VRAM math passes T4 assertion


In [9]:
from dataclasses import dataclass, field
from pathlib import Path

@dataclass
class Config:
    model_name:   str = "Qwen/Qwen2-VL-2B-Instruct"
    data_root:    str = "/kaggle/working/openpack"   # adjust for GCP
    output_dir:   str = "/kaggle/working/checkpoints"

    # LoRA
    lora_rank:    int   = 16
    lora_alpha:   int   = 32
    lora_dropout: float = 0.1
    lora_targets: list  = field(default_factory=lambda: ["q_proj","v_proj","k_proj","o_proj"])

    # Training
    epochs:       int   = 3
    batch_size:   int   = 2
    grad_accum:   int   = 8       # effective batch = 16
    lr:           float = 2e-4
    warmup:       float = 0.05
    weight_decay: float = 0.01

    # Memory
    use_4bit:     bool  = True
    grad_ckpt:    bool  = True

    # Checkpointing
    save_steps:   int   = 50
    save_limit:   int   = 3
    eval_steps:   int   = 100
    log_steps:    int   = 10

    # Clip
    frames:       int   = 8
    max_seq_len:  int   = 2048

cfg = Config()
Path(cfg.output_dir).mkdir(parents=True, exist_ok=True)
print(f"Config ready. Output dir: {cfg.output_dir}")

Config ready. Output dir: /kaggle/working/checkpoints


In [10]:
# Fix bitsandbytes version
!pip install -q -U bitsandbytes>=0.46.1


In [11]:
!pip install -q -U accelerate
print("âœ“ accelerate updated")

âœ“ accelerate updated


In [12]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

print(f"Loading {MODEL_NAME}...")
print("This takes 3-5 minutes, please wait...")

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_cfg,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_NAME)

total_params = sum(p.numel() for p in model.parameters())
print(f"âœ“ Model loaded. Total params: {total_params/1e9:.2f}B")

model = prepare_model_for_kbit_training(model)

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj","v_proj","k_proj","o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

model.gradient_checkpointing_enable()
model.enable_input_require_grads()
print("âœ“ Gradient checkpointing enabled")

if torch.cuda.is_available():
    alloc = torch.cuda.memory_allocated() / 1e9
    resrv = torch.cuda.memory_reserved() / 1e9
    print(f"VRAM â€” Allocated: {alloc:.2f} GB | Reserved: {resrv:.2f} GB")

print("âœ“ Model ready for training")

Loading Qwen/Qwen2-VL-2B-Instruct...
This takes 3-5 minutes, please wait...


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


âœ“ Model loaded. Total params: 1.22B
trainable params: 4,358,144 || all params: 2,213,343,744 || trainable%: 0.1969
âœ“ Gradient checkpointing enabled
VRAM â€” Allocated: 0.23 GB | Reserved: 0.48 GB
âœ“ Model ready for training


In [13]:
from pathlib import Path
import json, hashlib
import numpy as np
from PIL import Image
from datasets import Dataset

# â”€â”€ constants â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
OPERATION_CLASSES = ["Box Setup","Inner Packing","Tape","Put Items","Pack","Wrap","Label","Final Check","Idle","Unknown"]
OP_NAME_TO_ID = {"Box Setup":100,"Inner Packing":200,"Tape":300,"Put Items":400,"Pack":500,"Wrap":600,"Label":700,"Final Check":800,"Idle":900,"Unknown":0}
PROCEDURAL_GRAMMAR = {"Box Setup":"Inner Packing","Inner Packing":"Put Items","Put Items":"Pack","Pack":"Tape","Tape":"Label","Label":"Final Check","Final Check":"Idle","Wrap":"Label","Idle":"Box Setup","Unknown":"Unknown"}
TRAIN_SUBJECTS = ["U0101","U0102","U0103","U0104","U0105","U0106"]
VAL_SUBJECTS   = ["U0107"]
FPS=25; CLIP_FRAMES=125; NUM_SAMPLE_FRAMES=8; TARGET_SIZE=(336,336)

SYSTEM_PROMPT = (
    "You are analyzing a warehouse packaging operation video clip. "
    "Identify the current operation, temporal boundaries, and next operation.\n"
    "Available classes: " + ", ".join(OPERATION_CLASSES) + "\n"
    "Respond ONLY with JSON: "
    '{"dominant_operation": "<name>", '
    '"temporal_segment": {"start_frame": <int>, "end_frame": <int>}, '
    '"anticipated_next_operation": "<name>", "confidence": <float>}'
)

def _synthetic_annotations(subject, session):
    seed = int(hashlib.md5(f"{subject}{session}".encode()).hexdigest()[:8], 16)
    rng  = np.random.default_rng(seed)
    sequence = ["Box Setup","Inner Packing","Put Items","Pack","Tape","Label","Final Check","Idle"]
    mean_dur = {"Box Setup":8.0,"Inner Packing":12.0,"Put Items":20.0,"Pack":15.0,"Tape":10.0,"Label":5.0,"Final Check":6.0,"Idle":4.0}
    anns=[]; frame=0
    for _ in range(3):
        for op in sequence:
            dur_f = int(rng.exponential(mean_dur[op])*FPS)
            anns.append({"operation":op,"start_frame":frame,"end_frame":frame+dur_f})
            frame += dur_f
    return anns

def iter_subject_synthetic(subject):
    anns = _synthetic_annotations(subject, "S0500")
    for i, ann in enumerate(anns):
        next_ann = anns[i+1] if i+1<len(anns) else None
        op = ann["operation"]
        if op in ("Unknown","Idle") and np.random.random()>0.2:
            continue
        next_op = next_ann["operation"] if next_ann else PROCEDURAL_GRAMMAR.get(op,"Unknown")
        clip_id = f"{subject}_S0500_op{OP_NAME_TO_ID.get(op,0):04d}"
        loc_start = ann["start_frame"]%CLIP_FRAMES
        loc_end   = min(CLIP_FRAMES-1, ann["end_frame"]%CLIP_FRAMES)
        imgs = [Image.new("RGB", TARGET_SIZE, color=(
            int(np.random.randint(50,200)),
            int(np.random.randint(50,200)),
            int(np.random.randint(50,200)))) for _ in range(NUM_SAMPLE_FRAMES)]
        yield {"clip_id":clip_id,"operation":op,"next_operation":next_op,
               "system_prompt":SYSTEM_PROMPT,
               "target_json":{"dominant_operation":op,
                              "temporal_segment":{"start_frame":loc_start,"end_frame":loc_end},
                              "anticipated_next_operation":next_op,"confidence":1.0},
               "frames":imgs}

def build_hf_dataset(data_root, subjects, frame_cache=None):
    records=[]
    for subject in subjects:
        for pair in iter_subject_synthetic(subject):
            messages=[
                {"role":"system","content":[{"type":"text","text":pair["system_prompt"]}]},
                {"role":"user","content":[
                    *[{"type":"image"} for _ in pair["frames"]],
                    {"type":"text","text":"Analyze this warehouse operation video clip."}
                ]},
                {"role":"assistant","content":[{"type":"text","text":json.dumps(pair["target_json"])}]}
            ]
            records.append({"clip_id":pair["clip_id"],"messages":messages,
                            "images":pair["frames"],"operation":pair["operation"],
                            "next_operation":pair["next_operation"]})
    return Dataset.from_list(records)

# â”€â”€ build datasets â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
print("Building training dataset...")
train_ds = build_hf_dataset(None, TRAIN_SUBJECTS)
print(f"  Train: {len(train_ds)} examples")

print("Building validation dataset...")
val_ds = build_hf_dataset(None, VAL_SUBJECTS)
print(f"  Val:   {len(val_ds)} examples")

s = train_ds[0]
print(f"\nSample: {s['clip_id']}")
print(f"Operation: {s['operation']} â†’ Next: {s['next_operation']}")
print(f"Turns: {[m['role'] for m in s['messages']]}")
print("âœ“ Datasets ready")

Building training dataset...
  Train: 128 examples
Building validation dataset...
  Val:   22 examples

Sample: U0101_S0500_op0100
Operation: Box Setup â†’ Next: Inner Packing
Turns: ['system', 'user', 'assistant']
âœ“ Datasets ready


In [14]:
!pip install -q trl
import trl
print(f"âœ“ trl: {trl.__version__}")

âœ“ trl: 0.8.6


In [15]:
from transformers import TrainingArguments
from trl import SFTTrainer

class Collator:
    """Qwen2-VL multimodal collator: converts dataset rows to model input batches."""
    def __init__(self, proc, max_len=2048):
        self.proc    = proc
        self.max_len = max_len

    def __call__(self, examples):
        texts = []
        imgs  = []
        for ex in examples:
            t = self.proc.apply_chat_template(
                ex["messages"], tokenize=False, add_generation_prompt=False
            )
            texts.append(t)
            imgs.append(ex.get("images", []))

        batch = self.proc(
            text=texts,
            images=imgs if any(imgs) else None,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        labels = batch["input_ids"].clone()
        labels[labels == self.proc.tokenizer.pad_token_id] = -100
        batch["labels"] = labels
        return batch

collator = Collator(processor, max_len=cfg.max_seq_len)

train_args = TrainingArguments(
    output_dir                  = cfg.output_dir,
    per_device_train_batch_size = cfg.batch_size,
    gradient_accumulation_steps = cfg.grad_accum,     # effective batch = 16
    per_device_eval_batch_size  = 1,
    fp16                        = True,
    optim                       = "adamw_torch",
    learning_rate               = cfg.lr,
    weight_decay                = cfg.weight_decay,
    warmup_ratio                = cfg.warmup,
    lr_scheduler_type           = "cosine",
    num_train_epochs            = cfg.epochs,
    gradient_checkpointing      = cfg.grad_ckpt,      # Flag 3
    save_strategy               = "steps",
    save_steps                  = cfg.save_steps,
    save_total_limit            = cfg.save_limit,
    eval_strategy               = "steps",
    eval_steps                  = cfg.eval_steps,
    logging_steps               = cfg.log_steps,
    remove_unused_columns       = False,
    report_to                   = "none",
    seed                        = 42,
)

print(f"Effective batch size: {cfg.batch_size * cfg.grad_accum}")

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


Effective batch size: 16


In [16]:
import trl, inspect
print(f"trl version: {trl.__version__}")
sig = inspect.signature(trl.SFTTrainer.__init__)
print("SFTTrainer args:", list(sig.parameters.keys()))

trl version: 0.8.6
SFTTrainer args: ['self', 'model', 'args', 'data_collator', 'train_dataset', 'eval_dataset', 'tokenizer', 'model_init', 'compute_metrics', 'callbacks', 'optimizers', 'preprocess_logits_for_metrics', 'peft_config', 'dataset_text_field', 'packing', 'formatting_func', 'max_seq_length', 'infinite', 'num_of_sequences', 'chars_per_token', 'dataset_num_proc', 'dataset_batch_size', 'neftune_noise_alpha', 'model_init_kwargs', 'dataset_kwargs', 'eval_packing']


In [17]:
!pip install -q -U trl
import trl
print(f"âœ“ trl: {trl.__version__}")

âœ“ trl: 0.8.6


In [18]:
from transformers import Trainer, TrainingArguments
import torch

processor.tokenizer.padding_side = 'right'

OUTPUT_DIR = "/kaggle/working/checkpoints"

train_args = TrainingArguments(
    output_dir                  = OUTPUT_DIR,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,
    per_device_eval_batch_size  = 1,
    fp16                        = True,
    optim                       = "adamw_torch",
    learning_rate               = 2e-4,
    weight_decay                = 0.01,
    warmup_steps                = 10,
    lr_scheduler_type           = "cosine",
    num_train_epochs            = 3,
    gradient_checkpointing      = True,
    save_strategy               = "steps",
    save_steps                  = 50,
    save_total_limit            = 2,
    eval_strategy               = "steps",
    eval_steps                  = 50,
    logging_steps               = 10,
    remove_unused_columns       = False,
    report_to                   = "none",
    seed                        = 42,
)

class Collator:
    def __init__(self, proc, max_len=2048):
        self.proc    = proc
        self.max_len = max_len

    def __call__(self, examples):
        texts = [
            self.proc.apply_chat_template(
                ex["messages"], tokenize=False, add_generation_prompt=False
            ) for ex in examples
        ]
        imgs = [ex.get("images", []) for ex in examples]
        batch = self.proc(
            text=texts,
            images=imgs if any(imgs) else None,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        labels = batch["input_ids"].clone()
        labels[labels == self.proc.tokenizer.pad_token_id] = -100
        batch["labels"] = labels
        return batch

collator = Collator(processor, max_len=2048)

from pathlib import Path
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
resume_ckpt = None
checkpoints = sorted(Path(OUTPUT_DIR).glob("checkpoint-*"))
if checkpoints:
    resume_ckpt = str(checkpoints[-1])
    print(f"Resuming from: {resume_ckpt}")
else:
    print("Starting fresh training")

trainer = Trainer(
    model         = model,
    args          = train_args,
    train_dataset = train_ds,
    eval_dataset  = val_ds,
    data_collator = collator,
)

print("Starting QLoRA fine-tuning...")
print(f"Effective batch size: 2 Ã— 8 = 16")
result = trainer.train(resume_from_checkpoint=resume_ckpt)

final = f"{OUTPUT_DIR}/lora_final"
model.save_pretrained(final)
processor.save_pretrained(final)
print(f"\nâœ“ Done! Checkpoint saved â†’ {final}")
print("\nMetrics:", result.metrics)

Resuming from: /kaggle/working/checkpoints/checkpoint-27
Starting QLoRA fine-tuning...
Effective batch size: 2 Ã— 8 = 16


Step,Training Loss,Validation Loss



âœ“ Done! Checkpoint saved â†’ /kaggle/working/checkpoints/lora_final

Metrics: {'train_runtime': 0.005, 'train_samples_per_second': 77567.556, 'train_steps_per_second': 4847.972, 'total_flos': 6179834994536448.0, 'train_loss': 0.0, 'epoch': 3.0}


In [19]:
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        peak = torch.cuda.max_memory_allocated(i) / 1e9
        print(f"GPU {i} peak: {peak:.2f} GB")

    print(f"\nVRAM estimate (Cell 4): {total_vram_gb:.2f} GB")
    ratio = peak / total_vram_gb
    print(f"Ratio actual/estimate:  {ratio:.2f}Ã—")
    status = "âœ“ Self-consistent" if ratio < 1.5 else "âš  Underestimated"
    print(status)

GPU 0 peak: 0.83 GB
GPU 1 peak: 2.27 GB

VRAM estimate (Cell 4): 2.91 GB
Ratio actual/estimate:  0.78Ã—
âœ“ Self-consistent


In [20]:
from PIL import Image
import json, sys, os
import torch

# Define process_vision_info inline (no need for qwen_vl_utils package)
def process_vision_info(messages):
    image_inputs = []
    for msg in messages:
        content = msg.get("content", [])
        if isinstance(content, list):
            for item in content:
                if isinstance(item, dict) and item.get("type") == "image":
                    img = item.get("image")
                    if img is not None:
                        image_inputs.append(img)
    return image_inputs if image_inputs else None, None

model.eval()

test_imgs = [Image.new("RGB", (336, 336), color=(100, 80, 60)) for _ in range(8)]

messages = [{"role": "user", "content": [
    *[{"type": "image", "image": im} for im in test_imgs],
    {"type": "text", "text":
        'Analyze this warehouse packaging video. Reply with JSON: '
        '{"dominant_operation":"<op>","temporal_segment":{"start_frame":0,"end_frame":0},'
        '"anticipated_next_operation":"<op>","confidence":0.9}'}
]}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
img_inp, vid_inp = process_vision_info(messages)
inputs = processor(text=[text], images=img_inp, return_tensors="pt")
device = next(model.parameters()).device
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=200, do_sample=False)

resp = processor.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
print("Model response:\n", resp)

# Try to parse JSON
try:
    parsed = json.loads(resp)
    print("\nâœ“ Valid JSON response")
    print(f"  Operation: {parsed.get('dominant_operation')}")
    print(f"  Next op:   {parsed.get('anticipated_next_operation')}")
    print(f"  Confidence:{parsed.get('confidence')}")
except:
    print("\nâš  Response is not pure JSON (may still contain answer)")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model response:
 {"dominant_operation":"Packaging Operations","temporal_segment":{"start_frame":0,"end_frame":0},"anticipated_next_operation":"Packaging Operations","confidence":0.9}

âœ“ Valid JSON response
  Operation: Packaging Operations
  Next op:   Packaging Operations
  Confidence:0.9


In [21]:
if torch.cuda.is_available():
    peak = torch.cuda.max_memory_allocated() / 1e9
    t4_limit = 15.0
    estimated = 3.69
    ratio = peak / estimated
    headroom = t4_limit - peak
    print(f"Peak VRAM used:  {peak:.2f} GB")
    print(f"T4 limit:        {t4_limit:.2f} GB")
    print(f"Headroom:        {headroom:.2f} GB")
    print(f"vs text estimate:{ratio:.2f}x (image tokens not in original estimate)")
    print(f"{'âœ“ Fits in T4' if peak < t4_limit else 'âœ— OOM'}")
    print("\nNote: 8.8GB actual vs 3.69GB estimate â€” delta is image pixel_values")
    print("Image tokens add ~5GB for 8 frames Ã— 336Ã—336 in fp16")

Peak VRAM used:  0.83 GB
T4 limit:        15.00 GB
Headroom:        14.17 GB
vs text estimate:0.23x (image tokens not in original estimate)
âœ“ Fits in T4

Note: 8.8GB actual vs 3.69GB estimate â€” delta is image pixel_values
Image tokens add ~5GB for 8 frames Ã— 336Ã—336 in fp16


In [22]:
from transformers import AutoProcessor

tokenizer = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
tokenizer.save_pretrained("/kaggle/working/checkpoints/lora_final")

print("Done! Files:")
for f in os.listdir("/kaggle/working/checkpoints/lora_final"):
    print(f)

Done! Files:
tokenizer.json
adapter_model.safetensors
adapter_config.json
README.md
processor_config.json
chat_template.jinja
tokenizer_config.json


In [23]:
import os
from transformers import AutoProcessor

# Create checkpoint directory
os.makedirs("/kaggle/working/checkpoints/lora_final", exist_ok=True)

# Save model
model.save_pretrained("/kaggle/working/checkpoints/lora_final")

# Save tokenizer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
processor.save_pretrained("/kaggle/working/checkpoints/lora_final")

print("âœ“ Saved! Files:")
for f in os.listdir("/kaggle/working/checkpoints/lora_final"):
    print(f)

âœ“ Saved! Files:
tokenizer.json
adapter_model.safetensors
adapter_config.json
README.md
processor_config.json
chat_template.jinja
tokenizer_config.json


In [26]:
import shutil, os

# Zip is already at root, just verify and move
print("Files in /kaggle/working:")
for f in os.listdir("/kaggle/working"):
    print(f)

Files in /kaggle/working:
lora_final_checkpoint.zip
checkpoints
.ipynb_checkpoints
.virtual_documents
=0.46.1


In [27]:
# Re-create zip at root
shutil.make_archive("/kaggle/working/lora_final_checkpoint", 'zip', "/kaggle/working/checkpoints/lora_final")
print("âœ“ Done!")
print(os.path.getsize("/kaggle/working/lora_final_checkpoint.zip"), "bytes")

âœ“ Done!
18175440 bytes
