In [1]:
!pip install torch transformers peft datasets accelerate bitsandbytes qwen_vl_utils



In [4]:
import torch
from datasets import load_dataset
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from qwen_vl_utils import process_vision_info
from huggingface_hub import login
import os

# --- 0. LOGIN ---
login(token="") 

# --- 1. CONFIGURATION ---
# Change MODEL_ID to 7B version
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"  
OUTPUT_DIR = "./qwen-vl-7b-finetune"
DATA_JSON = "qa.json"

# --- 2. LOAD PROCESSOR & MODEL ---
from transformers import BitsAndBytesConfig


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    # ADD THIS: Helps with large models on limited memory
    bnb_4bit_use_double_quant=True 
)

# Safe Memory Map (Adjusted for 7B)
# 7B model in 4-bit needs about 6-8GB VRAM. 
# T4 has 16GB, so this mapping is safe.
max_memory_mapping = {
    0: "14GB", 
    "cpu": "30GB"
}

print(f"Loading {MODEL_ID}...")
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    max_memory=max_memory_mapping,
    torch_dtype=torch.float16,
    # ADD THIS: Offloads parts of the model to disk if RAM is full
    low_cpu_mem_usage=True 
)
processor = AutoProcessor.from_pretrained(MODEL_ID)


# --- 3. APPLY PEFT (LORA) ---
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"], # Target both Vision and Language layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    modules_to_save=None
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# --- 4. DATA FORMATTING (Fixed for 'question'/'answer' format) ---
def format_data(example):
    # 1. Extract your specific keys (Handles missing 'image' gracefully)
    q_text = example.get("question", "")
    a_text = example.get("answer", "")
    image_path = example.get("image", None) 
    
    # 2. Build the User Message
    user_content = [{"type": "text", "text": q_text}]
    
    if image_path:
        user_content.insert(0, {"type": "image", "image": image_path})

    # 3. Create the Qwen-style conversation structure
    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": [{"type": "text", "text": a_text}]}
    ]
    
    # 4. Apply the template
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    
    # 5. Process Vision (Loads image if present)
    image_inputs, video_inputs = process_vision_info(messages)
    
    # 6. Tokenize
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding="max_length",
        max_length=512, 
        truncation=True,
        return_tensors="pt",
    )
    
    return {
        "input_ids": inputs["input_ids"].squeeze(0),
        "attention_mask": inputs["attention_mask"].squeeze(0),
        "labels": inputs["input_ids"].squeeze(0),
        "pixel_values": inputs.get("pixel_values", None) # Safely get pixel_values, default to None if not present
    }

# --- 5. LOAD & TRAIN ---
print("Loading dataset...")
dataset = load_dataset("json", data_files=DATA_JSON, split="train")

print("Formatting data...")
dataset = dataset.map(format_data, batched=False)

# 6. Training Arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,   # Must be 1 for images
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    save_strategy="epoch",
    report_to="none",
    remove_unused_columns=False
)

# 7. Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset,
    data_collator=lambda data: {
        'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in data]),
        'attention_mask': torch.stack([torch.tensor(f['attention_mask']) for f in data]),
        'labels': torch.stack([torch.tensor(f['labels']) for f in data]),
        'pixel_values': torch.cat([f['pixel_values'] if isinstance(f['pixel_values'], torch.Tensor) else torch.tensor(f['pixel_values']) for f in data]) if data[0]['pixel_values'] is not None else None
    }
)

# 8. Save
print("Saving model...")
model.save_pretrained(f"{OUTPUT_DIR}/final_adapter")
processor.save_pretrained(f"{OUTPUT_DIR}/final_adapter")
print("✅ DONE! Model saved.")

Loading Qwen/Qwen2-VL-7B-Instruct...


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/730 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

trainable params: 5,046,272 || all params: 8,296,421,888 || trainable%: 0.0608
Loading dataset...
Formatting data...


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Saving model...
✅ DONE! Model saved.


In [7]:
from peft import PeftModel
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
import torch

# --- 1. CONFIGURATION ---
# CHANGE: Updated to 7B model ID
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

# IMPORTANT: This path must contain an adapter trained on the 7B model!
# If you use your old 2B adapter here, it will give the "Size Mismatch" error again.
ADAPTER_PATH = "./qwen-vl-7b-finetune/final_adapter" 

print("✅ DONE! Configuration set for 7B.")

# --- 2. LOAD 7B BASE MODEL (Memory Optimized) ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True # Extra memory saving for 7B
)

print(f"Loading 7B base model: {MODEL_ID}...")
base_model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True # Helps prevent System RAM crashes
)

print("✅ DONE! 7B Base Model loaded.")

# --- 3. LOAD 7B ADAPTER & PROCESSOR ---
print(f"Applying 7B adapter from: {ADAPTER_PATH}...")

# This will only work if ADAPTER_PATH has weights for the 7B model
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
processor = AutoProcessor.from_pretrained(MODEL_ID)

print("✅ DONE! 7B Adapter and Processor loaded.")

# --- 4. INFERENCE ---
print("Model is ready for 7B inference!")
print("✅ DONE! All blocks executed.")

✅ DONE! Configuration set for 7B.
Loading 7B base model: Qwen/Qwen2-VL-7B-Instruct...


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/730 [00:00<?, ?it/s]

✅ DONE! 7B Base Model loaded.
Applying 7B adapter from: ./qwen-vl-7b-finetune/final_adapter...
✅ DONE! 7B Adapter and Processor loaded.
Model is ready for 7B inference!
✅ DONE! All blocks executed.
