In [None]:
!pip uninstall -y transformers
!pip install git+https://github.com/huggingface/transformers

!pip install accelerate

!pip install qwen-vl-utils

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the image archive and the CSV files
# read by later cells are addressed through paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
import os
import shutil
from tqdm import tqdm

# Archive on the mounted Drive and the local directory it is unpacked into.
zip_path = '/content/drive/MyDrive/OmniMedVQA/Images.zip'
extract_dir = '/content/Images'
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as archive:
    entries = archive.namelist()
    for entry in tqdm(entries, desc="Unzipping images"):
        # Directory entries carry no data; parent folders are created on demand below.
        if not entry.endswith('/'):
            # Drop a leading 'Images/' folder so files land directly under extract_dir.
            relative = entry[len('Images/'):] if entry.startswith('Images/') else entry
            out_path = os.path.join(extract_dir, relative)
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
            # Stream the member to disk instead of reading it fully into memory.
            with archive.open(entry) as src, open(out_path, "wb") as dst:
                shutil.copyfileobj(src, dst)

Unzipping images: 100%|██████████| 6675/6675 [00:09<00:00, 690.70it/s] 


In [None]:
import torch
import numpy as np
import random

# Single seed shared by every random number generator used in the notebook.
seed = 42

# Python's and NumPy's global generators.
random.seed(seed)
np.random.seed(seed)

# torch generators (default generator plus the CUDA ones).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN: turn off benchmark mode and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
import pandas as pd
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, GenerationConfig
from qwen_vl_utils import process_vision_info
import torch
from tqdm import tqdm
import os
import re

# Paths: one test CSV per imaging modality, all under the same Drive folder.
csv_dir = '/content/drive/MyDrive/OmniMedVQA/CSV_Files'
csv_paths = dict([
    ('MRI', f"{csv_dir}/test_mri.csv"),
    ('CT', f"{csv_dir}/test_ct.csv"),
    ('X-ray', f"{csv_dir}/test_xray.csv"),
])

# Load model and processor from the same checkpoint.
MODEL_PATH = 'JZPeterPan/MedVLM-R1'
_model_load_kwargs = {
    'torch_dtype': torch.bfloat16,
    'device_map': 'auto',
}
model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_PATH, **_model_load_kwargs)
processor = AutoProcessor.from_pretrained(MODEL_PATH)

# Decoding settings passed to every model.generate call.
# NOTE(review): when this notebook was run, transformers logged that
# generation_config values equal to the library defaults were replaced by the
# model's own defaults (do_sample=True, temperature=0.01, top_k=1, top_p=0.001),
# so do_sample/temperature below may not be the values in effect. Confirm.
# 151643 is the id that the same log reports as bos_token_id and as one of the
# eos_token_id values.
_generation_settings = {
    'max_new_tokens': 128,
    'do_sample': False,
    'temperature': 1,
    'num_return_sequences': 1,
    'pad_token_id': 151643,
}
generation_config = GenerationConfig(**_generation_settings)

# Prompt & CoT functions
def construct_prompt(row):
    """Build the multiple-choice prompt text for one question row.

    Args:
        row: Mapping-like object (dict or pandas Series) with the keys
            'question', 'option_A', 'option_B', 'option_C' and 'option_D'.

    Returns:
        The question, one "<letter>) <option>" line per available option, and
        the fixed instruction asking for <think>/<answer> tags.

    Raises:
        KeyError: if any of the expected keys is absent from ``row``.

    Options whose value is missing (NaN/None) or blank are left out, so a
    question with fewer than four choices does not show "C) nan" to the model.
    When all four options are present the result is unchanged.
    """
    lines = [f"{row['question']}"]
    for letter in ("A", "B", "C", "D"):
        option = row[f"option_{letter}"]
        # A missing CSV cell is read by pandas as NaN; skip it rather than print it.
        if pd.isna(option) or str(option).strip() == "":
            continue
        lines.append(f"{letter}) {option}")
    return (
        "\n".join(lines) + "\n"
        "First, think through the question step by step, enclose your reasoning process in <think>...</think> tags. "
        "Then provide the correct single-letter choice (A, B, C, D,...) inside <answer>...</answer> tags."
    )

def extract_answer(text):
    """Pull the chosen option letter out of the model's <answer>...</answer> tag.

    Args:
        text: Decoded model output (or an "ERROR: ..." string).

    Returns:
        One of "A", "B", "C", "D" (always upper case), or "?" when no usable
        answer tag is found or ``text`` is not a string.

    Besides the bare form ``<answer>A</answer>``, this accepts a lower-case
    letter, an optional opening parenthesis, and a letter followed by ")", ".",
    or ":" and optional trailing text (e.g. ``<answer>A) yes</answer>``).
    A letter followed directly by free text (``<answer>a tumor</answer>``) is
    still rejected, since there the letter is the start of a word, not a choice.
    """
    if not isinstance(text, str):
        return "?"
    match = re.search(
        r"<answer>\s*\(?\s*([A-D])\s*(?:[).:][^<]*)?\s*</answer>",
        text,
        flags=re.IGNORECASE,
    )
    return match.group(1).upper() if match else "?"

def generate_cot(image_path, question_text):
    """Run the model on one image/question pair and return the decoded output text.

    Args:
        image_path: Path of the image file; passed to the model as a file:// URI.
        question_text: Full prompt text sent alongside the image.

    Returns:
        The decoded text of the first generated sequence, with the prompt tokens
        removed. If no image could be loaded, or any exception is raised along
        the way, a string starting with "ERROR: " is returned instead of raising.
    """
    try:
        user_turn = {
            'role': 'user',
            'content': [
                {'type': 'image', 'image': f'file://{image_path}'},
                {'type': 'text', 'text': question_text},
            ],
        }
        conversation = [user_turn]

        chat_text = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        images, videos = process_vision_info(conversation)
        if not images or images[0] is None:
            return "ERROR: No valid image"

        model_inputs = processor(
            text=chat_text,
            images=images,
            videos=videos,
            padding=True,
            return_tensors='pt',
        )
        model_inputs = model_inputs.to(model.device)

        with torch.no_grad():
            sequences = model.generate(
                **model_inputs, use_cache=True, generation_config=generation_config
            )

        # generate() returns prompt + completion; keep only the newly generated tail.
        completions = []
        for prompt_ids, sequence in zip(model_inputs.input_ids, sequences):
            completions.append(sequence[len(prompt_ids):])

        texts = processor.batch_decode(
            completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return texts[0]
    except Exception as e:
        # Best effort: report the failure as text so the caller can carry on with the next sample.
        return f"ERROR: {type(e).__name__}: {str(e)}"

def _option_text(value):
    """Return an option cell as stripped, lower-cased text ('' when the cell is missing)."""
    if pd.isna(value):
        return ""
    return str(value).strip().lower()


# Process each modality: run the model on every row whose image exists, score
# the predicted letter against the ground-truth answer text, and save the
# annotated rows next to the input CSV.
for modality, path in csv_paths.items():
    print(f"\n--- Processing {modality} test set ---")

    df = pd.read_csv(path)
    # str() keeps a missing image_path from raising in startswith; such rows
    # resolve to a non-existent file and are dropped by the isfile filter below.
    df['abs_image_path'] = df['image_path'].apply(
        lambda x: str(x) if str(x).startswith('/content/') else '/content/' + str(x)
    )
    df = df[df['abs_image_path'].apply(os.path.isfile)].reset_index(drop=True)

    if df.empty:
        # Without this guard the accuracy computation below divides by zero.
        print(f"No image files found for {modality}; skipping.")
        continue

    predictions = []

    # After reset_index, idx runs 0..n-1, so `idx < 3` selects the first three rows.
    for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"CoT: {modality}"):
        img_path = row['abs_image_path']
        prompt = construct_prompt(row)
        cot = generate_cot(img_path, prompt)
        pred = extract_answer(cot)

        # Map predicted letter to answer text
        option_map = {
            'A': row['option_A'],
            'B': row['option_B'],
            'C': row['option_C'],
            'D': row['option_D'],
        }
        # _option_text tolerates NaN / non-string cells, which a bare .strip() does not.
        pred_answer_text = _option_text(option_map.get(pred, ""))
        gt_answer_text = str(row['gt_answer']).strip().lower()
        # An empty prediction (no parsable letter, or a missing option) is never correct.
        is_correct = bool(pred_answer_text) and pred_answer_text == gt_answer_text

        predictions.append((cot, pred, pred_answer_text, gt_answer_text, is_correct))

        if idx < 3:
            print(f"\n--- Sample {idx+1} ---")
            print(f"Q: {row['question']}")
            print(f"Predicted: {pred} -> {pred_answer_text}")
            print(f"Ground Truth: {gt_answer_text}")
            print(f"Correct: {is_correct}")
            print("Output:\n", cot)

    df['cot_output'] = [x[0] for x in predictions]
    df['prediction'] = [x[1] for x in predictions]
    df['pred_answer_text'] = [x[2] for x in predictions]
    df['gt_answer_text'] = [x[3] for x in predictions]
    df['is_correct'] = [x[4] for x in predictions]
    accuracy = sum(df['is_correct']) / len(df) * 100

    print(f"\nAccuracy on {modality} (n={len(df)}): {accuracy:.2f}%")

    # Insert the suffix before the extension only, not at every ".csv" in the path.
    root, ext = os.path.splitext(path)
    out_path = f"{root}_with_cot_accuracy{ext}"
    df.to_csv(out_path, index=False)
    print(f"Saved annotated results to: {out_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. 


--- Processing MRI test set ---


CoT: MRI:   0%|          | 0/300 [00:00<?, ?it/s]`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'use_cache': None, 'temperature': 0.01, 'top_k': 1, 'top_p': 0.001, 'bos_token_id': 151643, 'eos_token_id': [151645, 151643], 'attn_implementation': 'flash_attention_2'}. If this is not desired, please set these values explicitly.
CoT: MRI:   0%|          | 1/300 [00:06<34:12,  6.86s/it]


--- Sample 1 ---
Q: What can be observed in this image?
Predicted: A -> bone inflammation.
Ground Truth: bone inflammation.
Correct: True
Output:
 <think>
The image is a magnetic resonance imaging (MRI) scan of a knee joint. The scan shows a clear view of the knee, including the patella (knee cap), the femoral condyles (the flat surfaces of the femur bone), and the patellar ligament.
</think>

<answer>A</answer>


CoT: MRI:   1%|          | 2/300 [00:10<25:33,  5.15s/it]


--- Sample 2 ---
Q: What can be observed in this image?
Predicted: D -> achilles tendonitis
Ground Truth: deltoid pathology
Correct: False
Output:
 <think>
The image is a magnetic resonance imaging (MRI) scan of a foot, specifically focusing on the Achilles tendon. The scan shows a thickened and inflamed appearance of the Achilles tendon, which is a common finding in Achilles tendonitis.
</think>

<answer>D</answer>


CoT: MRI:   1%|          | 3/300 [00:14<22:55,  4.63s/it]


--- Sample 3 ---
Q: What can be observed in this image?
Predicted: D -> foraminal pathology
Ground Truth: foraminal pathology
Correct: True
Output:
 <think>
The image is a magnetic resonance imaging (MRI) scan of the lumbar spine. The scan shows a bony structure with a central area of high signal intensity, which is characteristic of an epidural hematoma.
</think>

<answer>D</answer>


CoT: MRI: 100%|██████████| 300/300 [23:03<00:00,  4.61s/it]



Accuracy on MRI (n=300): 94.67%
Saved annotated results to: /content/drive/MyDrive/OmniMedVQA/CSV_Files/test_mri_with_cot_accuracy.csv

--- Processing CT test set ---


CoT: CT:   0%|          | 1/300 [00:04<21:04,  4.23s/it]


--- Sample 1 ---
Q: What is the term referring to the abnormality observed in the image?
Predicted: A -> airspace opacity
Ground Truth: airspace opacity
Correct: True
Output:
 <think>
The image shows a CT scan of the chest, which is commonly used to diagnose various lung conditions. The abnormality observed in the image appears as a consolidation, which is a term used to describe a localized area of increased lung density.
</think>

<answer>A</answer>


CoT: CT:   1%|          | 2/300 [00:07<19:25,  3.91s/it]


--- Sample 2 ---
Q: Is there any presence of abnormalities observed in this image?
Predicted: A -> yes.
Ground Truth: no
Correct: False
Output:
 <think>
The image appears to be a cross-sectional view of a medical scan, likely a CT (Computed Tomography) scan of the chest. The scan shows a dense area in the center, which could be interpreted as a mass or abnormality.
</think>

<answer>A</answer>


CoT: CT:   1%|          | 3/300 [00:13<23:29,  4.75s/it]


--- Sample 3 ---
Q: What abnormal finding can be observed in the radiograph?
Predicted: D -> osseous pathology
Ground Truth: arterial pathology
Correct: False
Output:
 <think>
The radiograph shows a cross-sectional view of the abdomen, which is a common imaging technique used in medical diagnostics. The presence of various anatomical structures such as the liver, spleen, and intestines suggests that the radiograph is likely an abdominal CT scan.
</think>

<answer>D</answer>


CoT: CT: 100%|██████████| 300/300 [37:42<00:00,  7.54s/it]



Accuracy on CT (n=300): 76.67%
Saved annotated results to: /content/drive/MyDrive/OmniMedVQA/CSV_Files/test_ct_with_cot_accuracy.csv

--- Processing X-ray test set ---


CoT: X-ray:   0%|          | 1/300 [00:11<55:13, 11.08s/it]


--- Sample 1 ---
Q: What type of abnormality is depicted in this image?
Predicted: B -> no, it's normal
Ground Truth: no, it's normal
Correct: True
Output:
 <think>
The image is a mammogram, which is a type of X-ray used to detect abnormalities in the breast. The mammogram shows the breast tissue in detail, and the presence of dense tissue and calcifications is typical.
</think>

<answer>B</answer>


CoT: X-ray:   1%|          | 2/300 [00:19<46:17,  9.32s/it]


--- Sample 2 ---
Q: What imaging technique is employed to obtain this picture?
Predicted: C -> x_ray.
Ground Truth: x_ray.
Correct: True
Output:
 <think>
The image is a radiographic view of a knee, which is a common imaging technique used to visualize the bones and soft tissues of the body. The clarity and structure of the bones and soft tissues are indicative of a radiographic procedure.
</think>

<answer>C</answer>


CoT: X-ray:   1%|          | 3/300 [00:24<36:33,  7.39s/it]


--- Sample 3 ---
Q: What modality is used to capture this image?
Predicted: A -> x_ray.
Ground Truth: x_ray.
Correct: True
Output:
 <think>
The image appears to be a radiographic image, likely taken using a radiographic modality. The clarity and structure suggest it is a medical imaging technique used to visualize internal structures.
</think>

<answer>A</answer>


CoT: X-ray: 100%|██████████| 300/300 [55:08<00:00, 11.03s/it]


Accuracy on X-ray (n=300): 77.33%
Saved annotated results to: /content/drive/MyDrive/OmniMedVQA/CSV_Files/test_xray_with_cot_accuracy.csv



