In [None]:
# Mount Google Drive at /content/drive; the dataset and result paths used by the
# run cells further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import userdata
userdata.get('HF_TOKEN')

# !pip install -U transformers==4.48.3
!pip install -U transformers  # Per Qwen2.5-VL
!pip install -U accelerate
!pip install -U bitsandbytes

!pip install triton
!pip install tiktoken

!pip install flash-attn --no-build-isolation

In [None]:
# Model-specific helper packages. Only the install matching the model under
# test is left active; the others are kept commented out for reference.
# !pip install qwen-vl-utils==0.0.4
# !pip install qwen-vl-utils==0.0.8  # For Qwen2.5-VL

# !pip install git+https://github.com/deepseek-ai/DeepSeek-VL.git  # For DeepSeek-VL
# Active install: DeepSeek-VL2 package (provides the `deepseek_vl2` imports used below).
!pip install git+https://github.com/shobhitag11/DeepSeek-VL2-Run-On-Google-Colab.git  # Per DeepSeek-VL2

In [None]:
import os
import re
import json
import torch
import requests
import argparse
import pandas as pd

from PIL import Image
from tqdm import tqdm
from transformers.image_utils import load_image
from transformers import (
    pipeline,
    # PaliGemmaProcessor,
    # PaliGemmaForConditionalGeneration,
    # BitsAndBytesConfig,
    # Qwen2VLForConditionalGeneration,
    # Qwen2_5_VLForConditionalGeneration,  # Per Qwen2.5-VL
    AutoModelForCausalLM,
    AutoTokenizer, AutoProcessor
)

# from qwen_vl_utils import process_vision_info

from deepseek_vl2.models import DeepseekVLV2Processor
from deepseek_vl2.utils.io import load_pil_images

# from deepseek_vl.models import VLChatProcessor
# from deepseek_vl.utils.io import load_pil_images

Python version is above 3.10, patching the collections module.


## LLaVA-1.5-7B-HF - Qwen2.5-VL-7B-Instruct - Phi-3.5-vision-instruct - DeepSeek-VL / DeepSeek-VL2

In [None]:
def clean_text(text):
    """Normalize a cell value so answers can be compared as plain strings.

    Values that ``pd.isna`` reports as missing become ``''``. Anything else is
    converted to ``str``, stripped, lower-cased, has newlines replaced by
    spaces, and has semicolons and double quotes removed.
    """
    if pd.isna(text):
        return ''

    normalized = str(text).strip().lower()
    normalized = normalized.replace('\n', ' ')
    for unwanted in (";", '"'):
        normalized = normalized.replace(unwanted, "")
    return normalized

# Upper bound, in seconds, for downloading a question image.
_IMAGE_REQUEST_TIMEOUT = 30


def _answer_json(generated_text, first_char_only=True):
    """Turn raw model output into a ``{"answer": ...}`` JSON string.

    If the text contains a well-formed ``{"answer": "<letter>"}`` object, that
    object is returned as-is. Otherwise a fallback object is built from the
    whitespace-stripped text (only its first character when
    ``first_char_only`` is true). The fallback goes through ``json.dumps`` so
    that quotes, newlines or empty output can never produce invalid JSON.
    """
    match = re.search(r'{\s*"answer"\s*:\s*"[A-Za-z]"\s*}', generated_text, re.DOTALL)
    if match:
        return match.group(0)

    fallback = generated_text.strip()
    if first_char_only:
        fallback = fallback[:1]
    return json.dumps({"answer": fallback})


def model_generation(model_name, content, image_url):
    """Ask the selected vision-language model one question about one image.

    Relies on the module-level ``image_text_pipeline``, ``model`` and
    ``processor`` objects set up by ``initialize_model``.

    Args:
        model_name: Hugging Face model id; selects the inference code path.
        content: Full text prompt sent to the model.
        image_url: URL of the image the question refers to.

    Returns:
        A JSON string of the form ``{"answer": "..."}``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
        requests.HTTPError: If the image download returns an error status
            (DeepSeek and Phi code paths).
    """
    if model_name in ("llava-hf/llava-1.5-7b-hf", "Qwen/Qwen2-VL-7B-Instruct",
                      "Qwen/Qwen2.5-VL-7B-Instruct"):
        messages = [
            {
              "role": "user",
              "content": [
                  {"type": "image", "url": f"{image_url}"},
                  {"type": "text", "text": content},
                ],
            },
        ]

        # Generate text based on image and prompt
        outputs = image_text_pipeline(text=messages, max_new_tokens=100)
        raw_text = outputs[0]["generated_text"][-1]['content']
        generated_text = _answer_json(raw_text)

    elif model_name in ("deepseek-ai/deepseek-vl-7b-chat", "deepseek-ai/deepseek-vl2-tiny"):
        # The DeepSeek loaders read images from disk, so download to a local file first.
        response = requests.get(image_url, stream=True, timeout=_IMAGE_REQUEST_TIMEOUT)
        response.raise_for_status()
        with open('image.png', 'wb') as handler:
            handler.write(response.content)

        if model_name.endswith("deepseek-vl2-tiny"):
            messages = [
                {
                    "role": "<|User|>",
                    "content": f"<image>\n<|ref|>{content}<|/ref|>",
                    "images": ["./image.png"]
                },
                {
                    "role": "<|Assistant|>",
                    "content": ""
                }
            ]
            vlm = model
        else:
            messages = [
                {
                    "role": "User",
                    "content": f"<image_placeholder>\n{content}",
                    "images": ["./image.png"]
                },
                {
                    "role": "Assistant",
                    "content": ""
                }
            ]
            vlm = model.language_model

        # load images and prepare for inputs
        pil_images = load_pil_images(messages)
        prepare_inputs = processor(
            conversations=messages,
            images=pil_images,
            force_batchify=True
        ).to(model.device, dtype=torch.float16)

        # run image encoder to get the image embeddings
        inputs_embeds = model.prepare_inputs_embeds(**prepare_inputs)

        # run the model to get the response
        outputs = vlm.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=processor.tokenizer.eos_token_id,
            bos_token_id=processor.tokenizer.bos_token_id,
            eos_token_id=processor.tokenizer.eos_token_id,
            max_new_tokens=100,
            do_sample=False,
            use_cache=True
        )

        raw_text = processor.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        print(raw_text)
        generated_text = _answer_json(raw_text)

    elif model_name == "microsoft/Phi-3.5-vision-instruct":
        response = requests.get(image_url, stream=True, timeout=_IMAGE_REQUEST_TIMEOUT)
        response.raise_for_status()
        raw_image = Image.open(response.raw)
        messages = [
            {"role": "user", "content": f"<|image_1|>\n {content}\n ASSISTANT:"},
        ]

        prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(prompt, [raw_image], return_tensors="pt").to(model.device)

        generation_args = {"max_new_tokens": 100, "do_sample": False}
        generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

        # remove input tokens
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        raw_text = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        # This branch keeps the whole reply as the fallback answer (as before),
        # but now JSON-escaped so the caller can always parse it.
        generated_text = _answer_json(raw_text, first_char_only=False)

    else:
        # Without this the function used to fail on an unbound local variable.
        raise ValueError(f"Unsupported model: {model_name}")

    print(generated_text)
    return generated_text

def process_csv(df, csv_path, model_name, json_filename, device):
    """Run the model on every image question in ``df`` and log results to JSON.

    For each row the correct answer text is mapped back to its option letter,
    the prompt is built, ``model_generation`` is called, and one result record
    is appended to the JSON list stored in ``json_filename``. The file is
    rewritten after every row so partial results survive an interrupted run.

    Rows are skipped (with a printed message) when the correct answer does not
    match any option, when there is no image URL, or when generation fails.
    Rows without an image used to fall through and record the previous row's
    answer; they are now skipped explicitly.

    Args:
        df: DataFrame with the columns listed in ``required_columns``.
        csv_path: Name of the source CSV, used only in messages.
        model_name: Model id passed to ``model_generation`` and stored in results.
        json_filename: Path of the JSON results file (created if missing).
        device: Unused here; kept for interface compatibility.
    """
    # Check if all required columns exist
    required_columns = ['Category', 'Question', 'Image url', 'AnswerA', 'AnswerB',
                        'AnswerC', 'AnswerD', 'AnswerE', 'Correct Answer']

    if not all(col in df.columns for col in required_columns):
        print(f"Error: One or more required columns are missing in csv '{csv_path}'.")
        return

    # Ensure the JSON file (and its directory) exists
    output_dir = os.path.dirname(json_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    if not os.path.exists(json_filename):
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump([], f)  # Initialize with an empty list

    for _, row in tqdm(df.iterrows(), total=len(df)):
        category = row['Category']
        question = row['Question']
        image_url = row['Image url']
        answers = {k: clean_text(row[k]) for k in ['AnswerA', 'AnswerB', 'AnswerC', 'AnswerD', 'AnswerE']}
        answer2key = {v: k[-1] for k, v in answers.items()}  # Extract letter from key (e.g., 'A', 'B', ...)
        correct_answer_text = clean_text(row['Correct Answer'])

        correct_answer_key = answer2key.get(correct_answer_text)
        if correct_answer_key is None:
            print("ERROR!")
            print("   Question: ", question)
            print("   Answers: ", answers)
            print("   Correct answer: ", correct_answer_text)
            continue

        # Only image questions can be answered here. Missing URLs arrive as NaN
        # from pandas or as empty strings.
        if pd.isna(image_url) or str(image_url).strip() == "":
            print(f"Skipping question without an image: '{question}'")
            continue

        try:
            # NOTE: the indentation inside the prompt is part of the string sent
            # to the model and is deliberately left untouched.
            content = f"""You are a medical student who must answer a multiple-choice test.\n
                Given a medical image and a question related to {category}, choose the correct answer from the options.\n
                Question: {question}\n
                A: {answers['AnswerA']}\n
                B: {answers['AnswerB']}\n
                C: {answers['AnswerC']}\n
                D: {answers['AnswerD']}\n
                E: {answers['AnswerE']}\n
                You MUST return an answer EXACTLY in JSON format: {{"answer": "letter"}}.\n
                In ANY CASE, assign a letter equal to the most appropriate option among those provided.\n
                Do not make arguments or reasoning in your response.
                """

            generated_text = model_generation(model_name, content, image_url)

        except Exception as e:
            print(f"Error processing image for question '{question}': {e}")
            continue

        try:
            # Convert the generated text to a dictionary
            response_dict = json.loads(generated_text)
            model_answer = response_dict.get('answer', '').upper()
            is_correct = model_answer == correct_answer_key.upper()

            result = {
                'Category': category,
                'Task': 'Multimodal' if image_url else 'Text',
                'Model': model_name,
                'Question': question,
                'Image URL': image_url,
                'Answer A': answers['AnswerA'],
                'Answer B': answers['AnswerB'],
                'Answer C': answers['AnswerC'],
                'Answer D': answers['AnswerD'],
                'Answer E': answers['AnswerE'],
                'Correct Answer': correct_answer_key.upper(),
                'Model Answer': model_answer,
                'Is Correct': is_correct,
            }

            # Append result to JSON file
            with open(json_filename, 'r+', encoding='utf-8') as f:
                data = json.load(f)
                data.append(result)
                f.seek(0)
                json.dump(data, f, ensure_ascii=False, indent=4)
                f.truncate()  # drop leftover bytes if the new dump is shorter

        except Exception as e:
            print(f"Error processing model's output: {e}")

    # Report by file name: `category` is a per-row value and is unbound when
    # the frame has no rows.
    print(f"Updated results for csv '{csv_path}' in {json_filename}")

def _build_quantization_config():
    """Create the 4-bit bitsandbytes configuration used by the quantized branches."""
    # Imported here because the file-level import of BitsAndBytesConfig is commented out.
    from transformers import BitsAndBytesConfig

    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )


def initialize_model(model_name, device):
    """Load the requested model into the module-level inference globals.

    Sets the globals ``image_text_pipeline``, ``model`` and ``processor``
    (resetting all three to ``None`` first) and returns them.

    Args:
        model_name: Hugging Face model id; selects which loader is used.
        device: Target device for the DeepSeek models (e.g. ``"cuda"``).

    Returns:
        Tuple ``(image_text_pipeline, model, processor)``; entries that the
        selected branch does not use are ``None``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
        ImportError: If ``deepseek-ai/deepseek-vl-7b-chat`` is requested but
            the ``deepseek_vl`` package is not installed.
    """
    global image_text_pipeline, model, processor
    image_text_pipeline, model, processor = None, None, None
    print("Initializing the model...")

    # Determine the model type based on the task
    if model_name in ("llava-hf/llava-1.5-7b-hf", "Qwen/Qwen2-VL-7B-Instruct",
                      "Qwen/Qwen2.5-VL-7B-Instruct"):
        image_text_pipeline = pipeline("image-text-to-text", model=model_name,
                                       model_kwargs={"quantization_config": _build_quantization_config()})

    elif model_name in ("deepseek-ai/deepseek-vl-7b-chat", "deepseek-ai/deepseek-vl2-tiny"):
        if model_name.endswith("deepseek-vl2-tiny"):
            processor = DeepseekVLV2Processor.from_pretrained(model_name)
        else:
            # DeepSeek-VL (v1) lives in a separate package whose file-level
            # import is commented out; import it only when actually needed.
            from deepseek_vl.models import VLChatProcessor
            processor = VLChatProcessor.from_pretrained(model_name)

        # Loaded unquantized in float16, then moved to the requested device.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16
        ).to(device).eval()

    elif model_name == "microsoft/Phi-3.5-vision-instruct":
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=_build_quantization_config(),
            trust_remote_code=True,
            _attn_implementation='eager'
        )
        processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True
        )

    else:
        # Fail here rather than returning three Nones that only break later.
        raise ValueError(f"Unsupported model: {model_name}")

    print("Model initialized.")
    return image_text_pipeline, model, processor

def main(csv_path, colab_path, model_name):
    """Benchmark ``model_name`` on one CSV unless its results file already exists.

    The results path is derived from the CSV base name and the model name. The
    existence check runs first, so a skipped run no longer loads the model.

    Args:
        csv_path: Path of the dataset CSV ('/'-separated).
        colab_path: Base folder (with trailing '/') that holds the
            ``TASK - MULTIMODAL/LLM/`` output directory.
        model_name: Hugging Face model id to evaluate.
    """
    csv_name = csv_path.split('/')[-1]
    model_spec = model_name.split("/")[-1]

    json_filename = f"{colab_path}TASK - MULTIMODAL/LLM/{csv_name.split('.')[0]}_{model_spec}_MC.json"
    if os.path.exists(json_filename):
        print(f"Skipping csv '{csv_name}' as output file '{json_filename}' already exists.")
        return

    # Read the (cheap) CSV before the (expensive) model load so a bad path fails fast.
    df = pd.read_csv(csv_path)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # initialize_model publishes the loaded objects as module-level globals.
    initialize_model(model_name, device)

    print(f"Processing csv '{csv_name}'...")
    process_csv(df, csv_name, model_name, json_filename, device)

In [None]:
# Evaluate DeepSeek-VL2 (tiny) on the English multimodal dataset stored on Drive.
colab_path = "/content/drive/MyDrive/Benchmarking LLM/"
model_name = "deepseek-ai/deepseek-vl2-tiny"
csv_path = colab_path + "Datasets/Multimodal/MIR_Multimodal_Dataset_English_Prepro.csv"

main(csv_path=csv_path, colab_path=colab_path, model_name=model_name)

## PaliGemma 2 - 3B Mix 448x448

In [None]:
def clean_text(text):
    """Return a normalized, comparison-friendly string for a cell value.

    Missing values (per ``pd.isna``) yield ``''``. Other values are turned
    into ``str``, stripped, lower-cased, have newlines replaced by spaces, and
    lose any semicolons and double quotes.
    """
    if pd.isna(text):
        return ''

    cleaned = str(text).strip().lower()
    cleaned = cleaned.replace('\n', ' ')
    for char in (";", '"'):
        cleaned = cleaned.replace(char, "")
    return cleaned

def process_csv(df, csv_name, model_name, json_filename, device):
    """Run the PaliGemma model on every image question in ``df`` and log results.

    Uses the module-level ``model`` and ``processor`` set by
    ``initialize_model``. For each row the correct answer text is mapped back
    to its option letter, the prompt and image are fed to the model, and one
    result record is appended to the JSON list in ``json_filename``. The file
    is rewritten after every row so partial results survive an interruption.

    Rows are skipped (with a printed message) when the correct answer does not
    match any option, when there is no image URL, or when generation fails.
    Rows without an image used to fall through and record the previous row's
    answer; they are now skipped explicitly.

    Args:
        df: DataFrame with the columns listed in ``required_columns``.
        csv_name: Name of the source CSV, used only in messages.
        model_name: Model id stored in each result record.
        json_filename: Path of the JSON results file (created if missing).
        device: Device the processor outputs are moved to before generation.
    """
    required_columns = ['Category', 'Question', 'Image url', 'AnswerA', 'AnswerB',
                        'AnswerC', 'AnswerD', 'AnswerE', 'Correct Answer']

    if not all(col in df.columns for col in required_columns):
        print(f"Error: One or more required columns are missing in sheet '{csv_name}'.")
        return

    # Make sure the results file (and its directory) exists, starting as an empty list.
    output_dir = os.path.dirname(json_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    if not os.path.exists(json_filename):
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump([], f)

    for _, row in tqdm(df.iterrows(), total=len(df)):
        category = row['Category']
        original_question = row['Question']
        image_url = row['Image url']
        answers = {k: clean_text(row[k]) for k in ['AnswerA', 'AnswerB', 'AnswerC', 'AnswerD', 'AnswerE']}
        answer2key = {v: k[-1] for k, v in answers.items()}  # Extract letter from key (e.g., 'A', 'B', ...)
        correct_answer_text = clean_text(row['Correct Answer'])

        correct_answer_key = answer2key.get(correct_answer_text)
        if correct_answer_key is None:
            print("ERROR!")
            print("   Question: ", original_question)
            print("   Answers: ", answers)
            print("   Correct answer: ", correct_answer_text)
            continue

        # Only image questions can be answered here. Missing URLs arrive as NaN
        # from pandas or as empty strings.
        if pd.isna(image_url) or str(image_url).strip() == "":
            print(f"Skipping question without an image: '{original_question}'")
            continue

        try:
            raw_image = load_image(image_url)

            ################ PROMPT: REVIEW AND ADJUST THE LANGUAGE ################
            ################ ACCORDING TO THE DATASET ##############################
            # NOTE: the indentation inside the prompt is part of the string sent
            # to the model and is deliberately left untouched.
            question = f"""You are a medical student who must answer a multiple-choice test.\n
                Given a medical image and a question related to {category}, choose the correct answer from the options.\n
                Question: {original_question}\n
                A: {answers['AnswerA']}\n
                B: {answers['AnswerB']}\n
                C: {answers['AnswerC']}\n
                D: {answers['AnswerD']}\n
                E: {answers['AnswerE']}\n
                You MUST return an answer in JSON format: {{"answer": "letter"}}.\n
                In ANY CASE, assign a letter equal to the most appropriate option among those provided.
                """

            lang = "en"
            prompt = f"<image> answer {lang} {question}"
            ################ END OF PROMPT #########################################

            model_inputs = processor(text=prompt, images=raw_image, return_tensors="pt").to(torch.float16).to(device)
            input_len = model_inputs["input_ids"].shape[-1]

            generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
            generation = generation[0][input_len:]  # keep only the newly generated tokens
            decoded = processor.decode(generation, skip_special_tokens=True)

            # Prefer a well-formed {"answer": "<letter>"} object when the model
            # produced one; otherwise fall back to the first non-blank character.
            # json.dumps keeps the fallback valid JSON for any character.
            match = re.search(r'{\s*"answer"\s*:\s*"[A-Za-z]"\s*}', decoded, re.DOTALL)
            if match:
                generated_text = match.group(0)
            else:
                generated_text = json.dumps({"answer": decoded.strip()[:1]})
            print(generated_text)

        except Exception as e:
            print(f"Error processing image for question '{original_question}': {e}")
            continue

        try:
            response_dict = json.loads(generated_text)
            model_answer = response_dict.get('answer', '').upper()
            is_correct = model_answer == correct_answer_key.upper()

            result = {
                'Category': category,
                'Task': 'Multimodal' if image_url else 'Text',
                'Model': model_name,
                'Question': original_question,
                'Image URL': image_url,
                'Answer A': answers['AnswerA'],
                'Answer B': answers['AnswerB'],
                'Answer C': answers['AnswerC'],
                'Answer D': answers['AnswerD'],
                'Answer E': answers['AnswerE'],
                'Correct Answer': correct_answer_key.upper(),
                'Model Answer': model_answer,
                'Is Correct': is_correct,
            }

            # Append the record by rewriting the whole JSON list in place.
            with open(json_filename, 'r+', encoding='utf-8') as f:
                data = json.load(f)
                data.append(result)
                f.seek(0)
                json.dump(data, f, ensure_ascii=False, indent=4)
                f.truncate()  # drop leftover bytes if the new dump is shorter

        except Exception as e:
            print(f"Error processing model's output: {e}")

    # Report by file name: `category` is a per-row value and is unbound when
    # the frame has no rows.
    print(f"Updated results for csv '{csv_name}' in {json_filename}")

def initialize_model(model_name, device):
    """Load a 4-bit quantized PaliGemma model and its processor.

    Stores both in the module-level ``model`` and ``processor`` globals (read
    by ``process_csv``) and also returns them.

    Args:
        model_name: Hugging Face id of the PaliGemma checkpoint.
        device: Kept for interface compatibility; placement is delegated to
            ``device_map="auto"``.

    Returns:
        Tuple ``(model, processor)``.
    """
    global model, processor
    # Imported here because the file-level imports of these names are commented out.
    from transformers import (
        BitsAndBytesConfig,
        PaliGemmaForConditionalGeneration,
        PaliGemmaProcessor,
    )

    print("Initializing the PaliGemma model...")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    # device_map="auto" already places the quantized weights, so no extra
    # .to(device) is applied: it is redundant here and is rejected for
    # bitsandbytes-quantized models in some library versions.
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
    )
    processor = PaliGemmaProcessor.from_pretrained(model_name)

    print("Model initialized.")
    return model, processor

def main(csv_path, colab_path, model_name):
    """Benchmark a PaliGemma model on one CSV unless its results file already exists.

    The results path is derived from the CSV base name and the model name. The
    existence check runs first, so a skipped run no longer loads the model.

    Args:
        csv_path: Path of the dataset CSV ('/'-separated).
        colab_path: Base folder (with trailing '/') that holds the
            ``TASK - MULTIMODAL/LLM/`` output directory.
        model_name: Hugging Face model id to evaluate.
    """
    csv_name = csv_path.split('/')[-1]
    model_spec = model_name.split("/")[-1]

    json_filename = f"{colab_path}TASK - MULTIMODAL/LLM/{csv_name.split('.')[0]}_{model_spec}_MC.json"
    if os.path.exists(json_filename):
        print(f"Skipping csv '{csv_name}' as output file '{json_filename}' already exists.")
        return

    # Read the (cheap) CSV before the (expensive) model load so a bad path fails fast.
    df = pd.read_csv(csv_path)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # initialize_model publishes the loaded objects as module-level globals.
    initialize_model(model_name, device)

    print(f"Processing csv '{csv_name}'...")
    process_csv(df, csv_name, model_name, json_filename, device)


In [None]:
# Evaluate PaliGemma 2 (3B, mix, 448px) on the Spanish multimodal dataset stored on Drive.
colab_path = "/content/drive/MyDrive/Benchmarking LLM/"
model_name = "google/paligemma2-3b-mix-448"
csv_path = colab_path + "Datasets/Multimodal/MIR_Multimodal_Dataset_Spanish_Prepro.csv"

main(csv_path=csv_path, colab_path=colab_path, model_name=model_name)

## PaliGemma 2 - 10B Mix 448x448

In [None]:
# Evaluate PaliGemma 2 (10B, mix, 448px) on the Italian multimodal dataset stored on Drive.
# NOTE(review): presumably relies on the PaliGemma `main` defined earlier in the notebook — confirm cell order.
colab_path = "/content/drive/MyDrive/Benchmarking LLM/"
model_name = "google/paligemma2-10b-mix-448"
csv_path = colab_path + "Datasets/Multimodal/MIR_Multimodal_Dataset_Italian.csv"

main(csv_path=csv_path, colab_path=colab_path, model_name=model_name)