In [2]:
import os
import numpy as np
import pandas as pd
import csv
import base64
import time
from prompt import *
from openai import OpenAI

# Load prompt_df

In [5]:
# Read the prompt table from disk; the trailing expression renders its first rows.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which suggests the
# CSV was written together with its index — confirm whether that column is needed.
prompt_df = pd.read_csv('Data/prompt_df.csv')
prompt_df.head()


Unnamed: 0,index,question,user_prompt,valid_options,answer,groundtruth,image_path
0,985,The image represents a standard English keyboa...,Question: The image represents a standard Engl...,"{'A': 'arsenal', 'B': 'barcelona', 'C': 'arssn...",A,arsenal,/home/sky5341/src/DetectReason/LMUData/images/...
1,10828,Decipher the characters represented in the image.,Question: Decipher the characters represented ...,"{'A': 'ADOPRSSW', 'B': 'ROMANTIC', 'C': 'PEACE...",A,ADOPRSSW,/home/sky5341/src/DetectReason/LMUData/images/...
2,40927,Solve the numeric code based on the image (The...,Question: Solve the numeric code based on the ...,"{'A': '314', 'B': '433', 'C': '519', 'D': '567'}",C,519,/home/sky5341/src/DetectReason/LMUData/images/...
3,53404,There was a car accident. A person was hit at ...,Question: There was a car accident. A person w...,"{'A': 'Car A', 'B': 'Bus B', 'C': 'Bus C', 'D'...",C,Bus C,/home/sky5341/src/DetectReason/LMUData/images/...
4,56803,"The image shows a simple two-layer cipher, one...",Question: The image shows a simple two-layer c...,"{'A': 'THANKYOU', 'B': 'THINKYOU', 'C': 'CATCH...",A,THANKYOU,/home/sky5341/src/DetectReason/LMUData/images/...


# OpenAI

## o4-mini

In [None]:
def run_o4mini_vision_reasoning(
    prompt_df,
    model,
    system_prompt,
    save_path,
    api_key='',
    sleep_time=1.5
):
    """Send every row of ``prompt_df`` to a chat-completions vision model and checkpoint the replies.

    For each row the image at ``row["image_path"]`` is base64-encoded into a data
    URL and sent together with ``row["user_prompt"]``; ``system_prompt`` goes in
    the system message. The reply text is stored in the ``raw_response`` column.

    The results CSV at ``save_path`` is rewritten after every row so an
    interrupted run can be resumed: rows whose ``index`` is already present in
    the file are skipped.

    Args:
        prompt_df: DataFrame with the columns ``index``, ``question``,
            ``user_prompt``, ``answer``, ``groundtruth`` and ``image_path``.
        model: Model name passed to ``client.chat.completions.create``.
        system_prompt: Content of the system message.
        save_path: CSV file used both as checkpoint and as final output.
        api_key: Key handed to the ``OpenAI`` client.
        sleep_time: Seconds to sleep after each processed row.

    Notes:
        * Any exception raised while reading the image or calling the API is
          recorded as ``"[ERROR] <message>"`` in ``raw_response`` instead of
          aborting the run. Such rows count as done on a later resume.
        * Rows whose ``raw_response`` contains ``"invalid_image_url"`` are
          dropped before saving, so those indices are attempted again on the
          next run.
    """
    import mimetypes  # function-scope import keeps this cell self-contained

    result_columns = ["index", "question", "answer", "groundtruth", "raw_response"]
    client = OpenAI(api_key=api_key)

    if os.path.exists(save_path):
        results_df = pd.read_csv(save_path)
        done_indices = set(results_df["index"])
    else:
        results_df = pd.DataFrame(columns=result_columns)
        done_indices = set()

    total = len(prompt_df)
    # Count positions with enumerate: the DataFrame index label is not guaranteed
    # to be an integer (or contiguous), so it cannot serve as a progress counter.
    for position, (_, row) in enumerate(prompt_df.iterrows(), start=1):
        idx = row["index"]
        if idx in done_indices:
            continue

        user_prompt = row['user_prompt']

        try:
            # Reading the image happens inside the try block so that a single
            # missing or unreadable file is logged as an error row.
            image_path = row['image_path']
            mime_type = mimetypes.guess_type(image_path)[0] or ""
            if not mime_type.startswith("image/"):
                # Unknown extension: keep the historical default.
                mime_type = "image/jpeg"
            with open(image_path, "rb") as img_file:
                base64_img = base64.b64encode(img_file.read()).decode('utf-8')
            image_url = f"data:{mime_type};base64,{base64_img}"

            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": user_prompt},
                            {"type": "image_url", "image_url": {'url': image_url}},
                        ],
                    },
                ],
            )
            raw_response = response.choices[0].message.content
        except Exception as e:
            raw_response = f"[ERROR] {e}"

        new_row = {
            "index": idx,
            "question": row["question"],
            "answer": row["answer"],
            "groundtruth": row["groundtruth"],
            "raw_response": raw_response,
        }
        new_df = pd.DataFrame([new_row], columns=result_columns)

        # Concatenating with an empty frame is deprecated in recent pandas.
        if results_df.empty:
            results_df = new_df
        else:
            results_df = pd.concat([results_df, new_df], ignore_index=True)

        # Drop "invalid_image_url" failures so they are retried on the next run.
        # astype(str) keeps the filter working when the column holds non-strings.
        is_invalid_url = results_df["raw_response"].astype(str).str.contains(
            "invalid_image_url", regex=False
        )
        results_df = results_df[~is_invalid_url]
        results_df.to_csv(save_path, index=False)
        print(f"[{position}/{total}] Saved index {idx}")
        time.sleep(sleep_time)

## GPT models

In [None]:
def run_openai_vision_reasoning(
    prompt_df,
    model="gpt-4o",
    system_prompt="",
    api_key=openai_key,
    save_path="",
    sleep_time=0.5  # to respect rate limits
):
    """Query a model through ``client.responses.create`` for every row of ``prompt_df``.

    For each row the image at ``row["image_path"]`` is base64-encoded into a data
    URL and sent with ``row["user_prompt"]``; ``system_prompt`` is passed as
    ``instructions``. The reply text is stored in the ``raw_response`` column.

    The results CSV at ``save_path`` is rewritten after every row so an
    interrupted run can be resumed: rows whose ``index`` is already present in
    the file are skipped.

    Args:
        prompt_df: DataFrame with the columns ``index``, ``question``,
            ``user_prompt``, ``answer``, ``groundtruth`` and ``image_path``.
        model: Model name passed to the API.
        system_prompt: Text passed as ``instructions``.
        api_key: Key handed to the ``OpenAI`` client. The default is the
            module-level ``openai_key``, looked up once when this function is
            defined.
        save_path: CSV file used both as checkpoint and as final output. The
            default empty string is not a writable path, so a real path has to
            be supplied.
        sleep_time: Seconds to sleep after each processed row.

    Notes:
        Any exception raised while reading the image or calling the API is
        stored as ``str(exception)`` in ``raw_response``; such rows count as
        done on a later resume.
    """
    import mimetypes  # function-scope import keeps this cell self-contained

    result_columns = ["index", "question", "answer", "groundtruth", "raw_response"]
    client = OpenAI(api_key=api_key)

    if os.path.exists(save_path):
        results_df = pd.read_csv(save_path)
        done_indices = set(results_df["index"])
    else:
        results_df = pd.DataFrame(columns=result_columns)
        done_indices = set()

    for _, row in prompt_df.iterrows():
        idx = row["index"]
        if idx in done_indices:
            continue

        user_prompt = row['user_prompt']

        try:
            # Reading the image happens inside the try block so that a single
            # missing or unreadable file is logged as an error row.
            image_path = row['image_path']
            mime_type = mimetypes.guess_type(image_path)[0] or ""
            if not mime_type.startswith("image/"):
                # Unknown extension: keep the historical default.
                mime_type = "image/jpeg"
            with open(image_path, "rb") as img_file:
                base64_img = base64.b64encode(img_file.read()).decode('utf-8')
            image_url = f"data:{mime_type};base64,{base64_img}"

            response = client.responses.create(
                model=model,
                instructions=system_prompt,
                input=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": user_prompt},
                            {"type": "input_image", "image_url": image_url},
                        ],
                    }
                ],
            )
            # Use the SDK's text accessor instead of indexing output[0].content[0]:
            # the first output item is not always a text message (e.g. a
            # reasoning item), which made the positional lookup fail.
            raw_response = response.output_text
        except Exception as e:
            raw_response = str(e)

        new_row = {
            "index": idx,
            "question": row["question"],
            "answer": row["answer"],
            "groundtruth": row["groundtruth"],
            "raw_response": raw_response,
        }
        new_df = pd.DataFrame([new_row], columns=result_columns)

        # Concatenating with an empty frame is deprecated in recent pandas.
        if results_df.empty:
            results_df = new_df
        else:
            results_df = pd.concat([results_df, new_df], ignore_index=True)
        results_df.to_csv(save_path, index=False)
        time.sleep(sleep_time)

In [None]:
# Chain-of-thought run; alternative model tried here: "gpt-4.1-mini".
run_openai_vision_reasoning(
    prompt_df,
    api_key=openai_key,
    model="gpt-4o-mini",
    system_prompt=SYSTEM_PROMPT_CoT,
    save_path="openai_4o_cot_results.csv",
    sleep_time=1.0,  # pause between requests to respect rate limits
)

# Qwen

In [None]:
# Launch a vLLM server for Qwen/Qwen2.5-VL-3B-Instruct on 0.0.0.0:8000 in bfloat16,
# limited to one image per prompt.
# NOTE(review): `vllm serve` presumably keeps running in the foreground, in which case
# this cell never returns and the cells below cannot run — confirm, and consider
# starting the server from a separate terminal instead.
!vllm serve Qwen/Qwen2.5-VL-3B-Instruct --port 8000 --host 0.0.0.0 --dtype bfloat16 --limit-mm-per-prompt image=1

In [None]:
# OpenAI-compatible client aimed at the server on localhost:8000;
# the API key is only a placeholder string.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")


In [None]:
def run_qwen_reasoning(
    prompt_df,
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    system_prompt="",
    save_path="qwen_results.csv"
):
    """Query a chat-completions endpoint for every row of ``prompt_df`` and checkpoint the replies.

    The request goes through the module-level ``client`` object, which must
    exist before this function is called. For each row the image at
    ``row["image_path"]`` is base64-encoded into a data URL; the user message
    holds the image followed by ``system_prompt + row["user_prompt"]``. The
    stripped reply text is stored in the ``raw_response`` column.

    The results CSV at ``save_path`` is rewritten after every row so an
    interrupted run can be resumed: rows whose ``index`` is already present in
    the file are skipped.

    Args:
        prompt_df: DataFrame with the columns ``index``, ``question``,
            ``user_prompt``, ``answer``, ``groundtruth`` and ``image_path``.
        model: Model name passed to the endpoint.
        system_prompt: Sent as the system message and also prefixed to the
            user text.
        save_path: CSV file used both as checkpoint and as final output.

    Notes:
        Any exception raised while reading the image or calling the endpoint is
        stored as ``str(exception)`` in ``raw_response``; such rows count as
        done on a later resume.
    """
    import mimetypes  # function-scope import keeps this cell self-contained

    result_columns = ["index", "question", "answer", "groundtruth", "raw_response"]

    if os.path.exists(save_path):
        results_df = pd.read_csv(save_path)
        done_indices = set(results_df["index"])
    else:
        results_df = pd.DataFrame(columns=result_columns)
        done_indices = set()

    for _, row in prompt_df.iterrows():
        idx = row["index"]
        if idx in done_indices:
            continue

        user_prompt = row["user_prompt"]

        try:
            # Reading the image happens inside the try block so that a single
            # missing or unreadable file is logged as an error row.
            image_path = row['image_path']
            mime_type = mimetypes.guess_type(image_path)[0] or ""
            if not mime_type.startswith("image/"):
                # Unknown extension: keep the historical default.
                mime_type = "image/jpeg"
            with open(image_path, "rb") as img_file:
                base64_img = base64.b64encode(img_file.read()).decode('utf-8')
            image_url = f"data:{mime_type};base64,{base64_img}"

            # NOTE(review): the system prompt is sent twice — once as the system
            # message and once prefixed to the user text. Left as is because
            # removing either copy would change the evaluated prompt; confirm
            # which one is intended.
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "image_url", "image_url": {"url": image_url}},
                            {"type": "text", "text": system_prompt + user_prompt},
                        ],
                    },
                ],
            )
            # A reply without text content is stored as an empty string rather
            # than as an AttributeError message from calling .strip() on None.
            content = response.choices[0].message.content
            raw_response = (content or "").strip()
        except Exception as e:
            raw_response = str(e)

        new_row = {
            "index": idx,
            "question": row["question"],
            "answer": row["answer"],
            "groundtruth": row["groundtruth"],
            "raw_response": raw_response,
        }
        new_df = pd.DataFrame([new_row], columns=result_columns)

        # Concatenating with an empty frame is deprecated in recent pandas.
        if results_df.empty:
            results_df = new_df
        else:
            results_df = pd.concat([results_df, new_df], ignore_index=True)
        results_df.to_csv(save_path, index=False)

In [None]:
# Chain-of-thought run against the 3B Qwen2.5-VL model.
run_qwen_reasoning(
    prompt_df,
    save_path="qwen_3b_cot_results.csv",
    system_prompt=SYSTEM_PROMPT_CoT,
    model="Qwen/Qwen2.5-VL-3B-Instruct",
)

# Llama-3.2-11B-Vision-Instruct

In [None]:
import os
import time
import torch
import pandas as pd
from PIL import Image
import requests
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Load model and processor
# (This line was bare prose inside the code cell, which is a SyntaxError; it is now a comment.)
model_id = "unsloth/Llama-3.2-11B-Vision-Instruct"
# device_map="auto" and bfloat16 are passed straight to from_pretrained.
model = MllamaForConditionalGeneration.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(model_id)



In [None]:

def run_llama_chat_template_inference(
    prompt_df,
    system_prompt,
    save_path="llama_chat_results.csv",
    max_new_tokens=8192
):
    """Run the module-level ``model``/``processor`` on every row of ``prompt_df``.

    For each row the image at ``row["image_path"]`` is base64-encoded into a data
    URL and placed in a single user message together with
    ``system_prompt + row["user_prompt"]``. The message is tokenized with the
    processor's chat template, passed to ``model.generate`` and the newly
    generated tokens are decoded into the ``raw_response`` column.

    The results CSV at ``save_path`` is rewritten after every row so an
    interrupted run can be resumed: rows whose ``index`` is already present in
    the file are skipped.

    Args:
        prompt_df: DataFrame with the columns ``index``, ``question``,
            ``user_prompt``, ``answer``, ``groundtruth`` and ``image_path``.
        system_prompt: Text prefixed to each user prompt (no separate system
            message is sent).
        save_path: CSV file used both as checkpoint and as final output.
        max_new_tokens: Generation budget forwarded to ``model.generate``.

    Notes:
        Any exception raised while preparing inputs or generating is stored as
        ``str(exception)`` in ``raw_response``; such rows count as done on a
        later resume.
    """
    import mimetypes  # function-scope import keeps this cell self-contained

    result_columns = ["index", "question", "answer", "groundtruth", "raw_response"]

    if os.path.exists(save_path):
        results_df = pd.read_csv(save_path)
        done_indices = set(results_df["index"])
    else:
        results_df = pd.DataFrame(columns=result_columns)
        done_indices = set()

    for _, row in prompt_df.iterrows():
        idx = row["index"]
        if idx in done_indices:
            continue

        try:
            user_prompt = row["user_prompt"]
            image_path = row['image_path']
            mime_type = mimetypes.guess_type(image_path)[0] or ""
            if not mime_type.startswith("image/"):
                # Unknown extension: keep the historical default.
                mime_type = "image/jpeg"
            with open(image_path, "rb") as img_file:
                base64_img = base64.b64encode(img_file.read()).decode('utf-8')
            image_url = f"data:{mime_type};base64,{base64_img}"

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": image_url},
                        {"type": "text", "text": system_prompt + user_prompt},
                    ],
                }
            ]
            # Tokenize with chat template
            inputs = processor.apply_chat_template(
                messages,
                add_generation_prompt=True,
                tokenize=True,
                return_tensors="pt",
                return_dict=True
            ).to(model.device)
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)

            # The generated sequence starts with the prompt tokens; drop them so
            # raw_response holds only the model's answer and not an echo of
            # the question and its options.
            prompt_length = inputs["input_ids"].shape[-1]
            generated_tokens = output[0][prompt_length:]
            raw_response = processor.decode(generated_tokens, skip_special_tokens=True).strip()
        except Exception as e:
            raw_response = str(e)

        new_row = {
            "index": idx,
            "question": row["question"],
            "answer": row["answer"],
            "groundtruth": row["groundtruth"],
            "raw_response": raw_response,
        }
        new_df = pd.DataFrame([new_row], columns=result_columns)

        # Concatenating with an empty frame is deprecated in recent pandas.
        if results_df.empty:
            results_df = new_df
        else:
            results_df = pd.concat([results_df, new_df], ignore_index=True)
        results_df.to_csv(save_path, index=False)

In [None]:
# Chain-of-thought run with the loaded Llama model and processor.
run_llama_chat_template_inference(
    prompt_df,
    system_prompt=SYSTEM_PROMPT_CoT,
    save_path="llama_11b_cot.csv",
)