In [1]:
!pip install torch transformers peft datasets accelerate bitsandbytes qwen_vl_utils
print("✅ DONE!")

✅ DONE!


In [2]:
import torch
import os
from datasets import load_dataset
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
from qwen_vl_utils import process_vision_info

# --- 1. CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
OUTPUT_DIR = "./qwen-vl-2b-finetune"
DATA_JSON = "qa.json"
IMAGE_FOLDER = "images" # Matches your screenshot folder name

# --- 2. LOAD PROCESSOR & MODEL ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

print(f"Loading {MODEL_ID}...")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)
processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=False)

# --- 3. APPLY PEFT (LORA) ---
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.gradient_checkpointing_enable()
model.print_trainable_parameters()

# --- 4. DATA FORMATTING ---
def format_data(example):
    q_text = example.get("question", "")
    a_text = example.get("answer", "")
    img_name = example.get("image", None)

    # Logic to find images in your 'images/' folder
    full_image_path = None
    if img_name:
        full_image_path = os.path.join(IMAGE_FOLDER, img_name)

    user_content = [{"type": "text", "text": q_text}]
    if full_image_path and os.path.exists(full_image_path):
        user_content.insert(0, {"type": "image", "image": full_image_path})

    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": [{"type": "text", "text": a_text}]}
    ]

    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )

    return {
        "input_ids": inputs["input_ids"].squeeze(0),
        "attention_mask": inputs["attention_mask"].squeeze(0),
        "labels": inputs["input_ids"].squeeze(0),
        "pixel_values": inputs.get("pixel_values", None)
    }

# --- 5. LOAD & TRAIN ---
print("Loading dataset...")
dataset = load_dataset("json", data_files=DATA_JSON, split="train")
dataset = dataset.map(format_data, batched=False)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    save_strategy="epoch",
    report_to="none",
    remove_unused_columns=False,
    optim="paged_adamw_8bit"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=lambda data: {
        'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in data]),
        'attention_mask': torch.stack([torch.tensor(f['attention_mask']) for f in data]),
        'labels': torch.stack([torch.tensor(f['labels']) for f in data]),
        'pixel_values': torch.cat([f['pixel_values'] if isinstance(f['pixel_values'], torch.Tensor) else torch.tensor(f['pixel_values']) for f in data]) if data[0]['pixel_values'] is not None else None
    }
)

print("Starting training...")
trainer.train()

# --- 6. SAVE ---
print("Saving model...")
model.save_pretrained(f"{OUTPUT_DIR}/final_adapter")
processor.save_pretrained(f"{OUTPUT_DIR}/final_adapter")
print("✅ DONE! Model saved.")

Loading Qwen/Qwen2-VL-2B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]



trainable params: 18,464,768 || all params: 2,227,450,368 || trainable%: 0.8290
Loading dataset...
Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,2.871709
2,7.077536
3,4.579581
4,12.579129
5,4.74506
6,7.263823
7,2.761194
8,10.698213
9,1.994255
10,3.377796


Saving model...
✅ DONE! Model saved.


In [3]:
import torch
import os
import gc
from peft import PeftModel
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

# --- 1. CONFIGURATION ---
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
ADAPTER_PATH = "./qwen-vl-2b-finetune/final_adapter"
IMAGE_FOLDER = "/content/images"

# --- 2. LOAD MODEL ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
processor = AutoProcessor.from_pretrained(ADAPTER_PATH, use_fast=False)

# --- 3. RUN LOOP FOR ALL IMAGES ---
all_images = [f for f in os.listdir(IMAGE_FOLDER) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

if not all_images:
    print("❌ No images found!")
else:
    print(f"🚀 Starting inference for {len(all_images)} images...")

    for img_name in all_images:
        img_path = os.path.join(IMAGE_FOLDER, img_name)
        print(f"\nProcessing: {img_name}...")

        messages = [
            {
                "role": "user",
                "content": [
                    # UPDATED: Added your specific min and max pixel limits here
                    {
                        "type": "image",
                        "image": img_path,
                        "min_pixels": 256 * 28 * 28,
                        "max_pixels": 640 * 28 * 28
                    },
                    {"type": "text", "text": "Describe this image brief."},
                ],
            }
        ]

        # Prepare Inputs
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to("cuda")

        # Cast to float16 and free cache
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=64)
            generated_ids_trimmed = [
                out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )

        print(f"Result: {output_text[0]}")

        # CLEAR MEMORY AFTER EVERY IMAGE
        del inputs
        torch.cuda.empty_cache()
        gc.collect()

print("\n✅ DONE! All images processed successfully.")

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/729 [00:00<?, ?it/s]

🚀 Starting inference for 35 images...

Processing: INK-TIID-NGG-3001-Rev3-OVERALL PROCESS FLOW - Copy-1 - Copy.jpg.jpeg...
Result: This image is a detailed schematic drawing of a facility, likely related to manufacturing or industrial processes. The drawing includes various components such as tanks, pipelines, valves, and other equipment. Here is a detailed description of the components present in the image:

1. **Solvent Storage Tanks**:
   - **Solvent Storage

Processing: INK-TIID-NGG-3007-Rev3-P&ID VARNISH LINE-2.jpg.jpeg...
Result: This image is a detailed electrical schematic for a building, specifically the first floor. The schematic includes various components such as switches, relays, and wiring diagrams. Here is a detailed description:

1. **Main Electrical Panel:**
   - The main electrical panel is labeled "MAIN BUILDING SCOPE" and includes several

Processing: INK-TIID-NGG-3001-Rev3-OVERALL PROCESS FLOW - Copy-1.jpg.jpeg...
Result: This image is a detailed schematic drawing 