In [None]:
from unsloth import FastVisionModel
import torch
from transformers import AutoProcessor

# 1. Load the 4-bit quantized Qwen2-VL checkpoint through Unsloth.
#    Pixel limits are not passed to this call; they are applied to the
#    processor in step 2.
MODEL_NAME = "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit"

model, tokenizer = FastVisionModel.from_pretrained(
    model_name = MODEL_NAME,
    load_in_4bit = True,
    max_seq_length = 32768,
)

# 2. Re-load the processor with explicit pixel limits.
# (The 'tokenizer' returned by Unsloth is actually the Processor for vision models.)
# Previously min_pixels / max_pixels were computed but never handed to anything,
# so images were processed with the checkpoint's default limits.
min_pixels = 256 * 28 * 28
max_pixels = 512 * 512 # or 1280 * 28 * 28 for higher quality

# NOTE(review): this replaces the processor object returned by Unsloth; confirm
# that Unsloth does not customise it in a way later cells rely on.
tokenizer = AutoProcessor.from_pretrained(
    MODEL_NAME,
    min_pixels = min_pixels,
    max_pixels = max_pixels,
)

# Set to inference mode
FastVisionModel.for_inference(model)

print("Qwen2-VL 7B Base model loaded successfully.")

==((====))==  Unsloth 2026.1.3: Fast Qwen2_Vl patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 5090. Num GPUs = 1. Max memory: 31.367 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.11.0.dev20260119+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Qwen2-VL 2B Base model loaded successfully.


In [None]:
import ast
import json
from io import BytesIO

import requests
from PIL import Image
from tqdm import tqdm

# INPUT_FILE is the JSON dataset read by this cell; OUTPUT_FILE is where the
# same records are written back with a "base_model_answer" added to each item.
# Keep the output filename distinct from the one used for fine-tuned results.
INPUT_FILE = "Diagnose_dataset.json"
OUTPUT_FILE = "BaseModel_predictions_output.json"

def download_image(url, timeout=10):
    """Download ``url`` and return it as an RGB PIL image, or ``None`` on failure.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout
            (defaults to 10, the value that was previously hard-coded).

    Returns:
        The decoded image converted to RGB, or ``None`` when the request
        fails, the server answers with an HTTP error status, or the payload
        cannot be opened as an image.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures instead of trying to decode an
        # error page as if it were image data.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as exc:
        # Deliberately best-effort: callers filter out None. Report the URL so
        # that missing images are visible rather than silently dropped.
        print(f"[download_image] skipping {url}: {exc!r}")
        return None

# Number of newly answered items between intermediate saves of OUTPUT_FILE.
SAVE_EVERY = 10

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

# NOTE(review): progress is checkpointed to OUTPUT_FILE, but every run starts
# from INPUT_FILE, so the "already answered" skip below only takes effect when
# the input file itself contains "base_model_answer" values. Confirm whether
# resuming from OUTPUT_FILE is intended.
answered_since_save = 0

for key, item in tqdm(data.items()):
    # Skip items that already carry an answer.
    if item.get("base_model_answer"):
        continue

    instruction = item["question"]["Text"] + " What would be the most likely disease diagnosis(es) for this patient?"

    # The field is parsed from text (presumably the repr of a list of URLs).
    # literal_eval accepts only Python literals, so dataset content can never
    # be executed as code the way it could with eval().
    image_urls = ast.literal_eval(item["question"]["ImageList"])

    # Keep only the images that could actually be downloaded.
    images = [img for img in map(download_image, image_urls) if img is not None]
    if not images:
        continue

    # Qwen2-VL prompt format: one image placeholder per image, then the text.
    messages = [
        {
            "role": "user",
            "content": [
                *({"type": "image"} for _ in images),
                {"type": "text", "text": instruction},
            ],
        },
    ]

    # Use the processor (tokenizer) to handle the images correctly
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    inputs = tokenizer(
        images = images,
        text = input_text,
        add_special_tokens = False,
        return_tensors = "pt",
    ).to("cuda")

    with torch.no_grad():
        # NOTE(review): temperature / top_p only influence the output when
        # sampling is enabled; confirm the generation config sets do_sample.
        outputs = model.generate(
            **inputs,
            max_new_tokens = 512,
            use_cache = True,
            temperature = 0.2,
            top_p = 0.9,
            # Ensure the model knows when to stop
            pad_token_id = tokenizer.tokenizer.pad_token_id,
            eos_token_id = tokenizer.tokenizer.eos_token_id,
        )

    # generate() returns the prompt followed by the continuation. Decode only
    # the new tokens rather than splitting the full text on "assistant\n",
    # which would truncate any answer that itself contains that marker.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    final_answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    item["base_model_answer"] = final_answer

    # Checkpoint by count of answered items; this no longer requires the
    # dataset keys to be numeric strings.
    answered_since_save += 1
    if answered_since_save >= SAVE_EVERY:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
        answered_since_save = 0

# Final save covering every item, including those answered since the last checkpoint.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=4)

  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
  torch._check_is_size(blocksize)
100%|██████████| 16/16 [02:10<00:00,  8.16s/it]
