In [1]:
import os
import requests
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from datasets import load_dataset
import transformers
from peft import get_peft_model, PrefixTuningConfig, TaskType
from transformers import Trainer, TrainingArguments
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from transformers import AutoTokenizer
import numpy as np
from torchvision import transforms
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import re
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import json
from matplotlib import pyplot as plt
import torch

In [None]:
# import requests
# from PIL import Image
# from transformers import BlipProcessor, BlipForQuestionAnswering
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
# model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to("cuda")


In [None]:
# Resolve the target device first: it is needed when the model is moved below.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Use half precision only on GPU; fall back to float32 when running on CPU.
model_dtype = torch.float16 if device == "cuda" else torch.float32

# Single source of truth for the checkpoint shared by the model and its processor.
MODEL_ID = "Salesforce/instructblip-vicuna-7b"

model = InstructBlipForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=model_dtype,
).to(device)
processor = InstructBlipProcessor.from_pretrained(MODEL_ID)

In [None]:
# Load the zero-shot test split: one JSON object per line.
# The encoding is explicit so the result does not depend on the platform default.
with open("split_result/test_zeroshot.jsonl", "r", encoding="utf-8") as fr:
    # Blank lines (e.g. a trailing newline) are skipped; json.loads would reject them.
    arxiv_qa = [json.loads(line) for line in fr if line.strip()]

# Re-key every raw record into the field names used by the rest of the notebook.
test_dataset = []
for i, sample in enumerate(arxiv_qa):
    try:
        if sample['question'] is None:
            print(f"第 {i} 个样本没有问题，跳过")
            continue

        test_dataset.append({
            'id': sample['id'],
            "image": sample['image'],
            "question": sample["question"],
            "choices": sample["options"],
            "answer": sample["label"],
            "solution": sample["rationale"]
        })
    except (KeyError, TypeError) as e:
        # A record with a missing field (or one that is not a JSON object) is
        # reported and skipped instead of aborting the whole load.
        print(f"跳过第 {i} 个样本，错误：{e}")

In [None]:
def build_message(sample, image_root="/root/IC_MLLM_VQA/ArxivQA/pure_testing/"):
    """Build a single-turn chat message asking for a step-by-step explanation.

    Args:
        sample: Mapping with keys ``'image'`` (file name relative to
            ``image_root``, or ``None``), ``'question'`` (str) and
            ``'choices'`` (iterable of option strings).
        image_root: Directory the image file name is resolved against.
            Defaults to the location used by this notebook.

    Returns:
        A one-element list ``[{"role": "user", "content": [...]}]`` whose
        content holds an optional image entry followed by one text entry.
        Options are labelled ``A.``, ``B.``, ... in their given order.
    """
    content = []
    if sample['image'] is not None:
        # os.path.join tolerates an image_root with or without a trailing slash.
        content.append({"type": "image", "image": os.path.join(image_root, sample['image'])})

    option_lines = "".join(
        f"{chr(65 + idx)}. {choice}\n" for idx, choice in enumerate(sample['choices'])
    )
    question_text = f"Question: {sample['question']}\nChoices:\n{option_lines}explain why in steps."
    content.append({"type": "text", "text": question_text})

    return [{"role": "user", "content": content}]

In [None]:
from torchvision import transforms
# Preview the first test sample: print its text prompt and display its image.
if not test_dataset:
    raise ValueError("test_dataset is empty; there is no sample to preview")

sample = test_dataset[0]
messages = build_message(sample)
image_path = '/root/IC_MLLM_VQA/ArxivQA/pure_testing/' + sample["image"]
image_inputs = Image.open(image_path).convert("RGB")

# Only the text parts of the message make up the prompt; the image is shown separately.
text_prompt = "\n".join(item["text"] for item in messages[0]["content"] if item["type"] == "text")
print(text_prompt)

plt.imshow(image_inputs)
plt.axis("off")
plt.title(sample["image"])
plt.show()

In [None]:
# Map each option letter to its zero-based index (A -> 0, ..., E -> 4).
answer_mapping = {letter: position for position, letter in enumerate("ABCDE")}

# Extract the option letter from a label (accepts "C", "c", "C.", "C)", "(C)", ...).
def extract_choice_letter(ans):
    """Return the upper-case option letter that a label starts with.

    Args:
        ans: The label text. Anything that is not a ``str`` yields ``None``.

    Returns:
        ``'A'``..``'E'`` when the stripped label begins with a single option
        letter (optionally wrapped in or followed by punctuation such as
        ``"C."``, ``"C)"`` or ``"(C)"``); otherwise ``None``.
    """
    if not isinstance(ans, str):
        return None
    # A-E matches every key of the letter->index mapping used alongside this
    # helper. The negative lookahead rejects ordinary words that merely start
    # with one of those letters (e.g. "Both" must not be read as "B").
    match = re.match(r"\(?([A-Ea-e])(?![A-Za-z])", ans.strip())
    return match.group(1).upper() if match else None

# Build the chat-style input message for the model.
def build_message(sample):
    """Assemble a single user turn from a sample's image, question and choices.

    The text part lists the question, then every choice on its own line
    labelled ``A.``, ``B.``, ..., followed by the instruction to pick an
    answer and explain it. An image entry is placed first when
    ``sample['image']`` is not ``None``.
    """
    image_entries = [] if sample['image'] is None else [{"type": "image", "image": sample['image']}]
    header = f"Question: {sample['question']}\nChoices:\n"
    options = "".join(
        f"{chr(ord('A') + pos)}. {option}\n" for pos, option in enumerate(sample['choices'])
    )
    prompt = header + options + "Please select the correct answer and explain why."
    return [{"role": "user", "content": [*image_entries, {"type": "text", "text": prompt}]}]

# Parse the model output (returns option index + explanation).
def parse_output(output, mapping=None):
    """Extract the chosen option index and the explanation that follows it.

    Args:
        output: Raw text produced by the model. Non-string input is treated
            as "no answer found".
        mapping: Optional letter -> index mapping. When omitted, the
            module-level ``answer_mapping`` is used.

    Returns:
        ``(answer, explanation)``. ``answer`` is the mapped index of the first
        stand-alone option letter, or ``-1`` when none is found or the letter
        is not in the mapping. ``explanation`` is the stripped text after the
        matched letter (and its optional ``.``, ``:`` or ``)``), or ``""``
        when no answer was found.
    """
    if mapping is None:
        mapping = answer_mapping
    if not isinstance(output, str):
        return -1, ""
    output = output.strip()

    # The letter must be a stand-alone token (word boundary on both sides);
    # otherwise the "a" of "answer" or the "B" of "Because" would be read as a
    # choice. Upper-case letters are tried first so that a lower-case article
    # ("a graph") does not shadow an explicit "B" later in the text.
    answer_match = (
        re.search(r"\b([A-E])\b[\.\:\)]?", output)
        or re.search(r"\b([a-e])\b[\.\:\)]?", output)
    )
    if answer_match is None:
        return -1, ""

    answer = mapping.get(answer_match.group(1).upper(), -1)
    if answer == -1:
        return -1, ""

    # Slice from the end of the actual match. Searching for the matched text
    # again could hit an earlier, unrelated occurrence of the same characters.
    return answer, output[answer_match.end():].strip()

# Set up the helpers used to score generated explanations.
smoothie = SmoothingFunction().method1  # smoothing function passed to sentence_bleu
rouge = Rouge()

# Fit a vocabulary (English stop words removed) on all reference solutions;
# its feature names act as the keyword set for the overlap score.
reference_solutions = [entry["solution"] for entry in test_dataset]
vectorizer = CountVectorizer(stop_words="english")
vectorizer.fit(reference_solutions)
keywords = set(vectorizer.get_feature_names_out())

# Collect one result record per evaluated sample.
all_records = []
# Two-sample slice kept for quick smoke tests; the loop below iterates the full test set.
subset = test_dataset[0:2]
for idx, sample in enumerate(tqdm(test_dataset, desc="Evaluating", ncols=100, leave=False)):
    # 1. Build the text prompt from the text parts of the message.
    messages = build_message(sample)
    text_prompt = "\n".join(item["text"] for item in messages[0]["content"] if item["type"] == "text")
    image_path = '/root/IC_MLLM_VQA/ArxivQA/pure_testing/' + sample["image"]
    # Open the file in a context manager so its handle is released every
    # iteration instead of accumulating until garbage collection.
    # NOTE(review): the image is downscaled to 160x160 before the processor
    # sees it — confirm this is intentional.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB").resize((160, 160))

    # 2. Preprocess. Floating-point tensors are cast to the model's dtype so
    #    the pixel values match the (possibly float16) weights.
    inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(device, model.dtype)

    # 3. Generate with beam search. Sampling-only options (top_p, temperature)
    #    are not passed because do_sample=False.
    generated_ids = model.generate(
        **inputs,
        do_sample=False,
        num_beams=5,
        max_length=512,
        min_length=1,
        repetition_penalty=1.5,
        length_penalty=1.0,
    )

    # 4. Decode the first returned sequence.
    output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    # 5. Split the output into the predicted option index and its explanation.
    pred_answer_id, pred_explanation = parse_output(output)

    # BLEU between the reference solution and the explanation (0 when there is none).
    if pred_explanation:
        bleu = sentence_bleu([sample["solution"].split()], pred_explanation.split(), smoothing_function=smoothie)
    else:
        bleu = 0.0

    # ROUGE-L F-score; scoring failures (e.g. an empty explanation) count as 0.
    # `except Exception` keeps the best-effort behaviour without swallowing
    # KeyboardInterrupt the way a bare `except:` would.
    try:
        rouge_score = rouge.get_scores(pred_explanation, sample["solution"])[0]["rouge-l"]["f"]
    except Exception:
        rouge_score = 0.0

    # Keyword overlap: share of the reference's keywords that also occur in the explanation.
    gt_tokens = set(sample["solution"].lower().split())
    pred_tokens = set(pred_explanation.lower().split())
    overlap = len(gt_tokens & pred_tokens & keywords)
    keyword_score = overlap / max(len(gt_tokens & keywords), 1)

    # Ground-truth option index (-1 when the label cannot be parsed).
    true_letter = extract_choice_letter(sample["answer"])
    true_answer_id = answer_mapping.get(true_letter, -1)

    # Store everything needed for the results table.
    all_records.append({
        "Question": sample["question"],
        "Choices": "\n".join(sample["choices"]),
        "True Answer": sample["answer"],
        "True Answer ID": true_answer_id,
        "Predicted Answer ID": pred_answer_id,
        "Correct": int(true_answer_id == pred_answer_id),
        "Model Output": output,
        "Model Explanation": pred_explanation,
        "Reference Solution": sample["solution"],
        "BLEU": bleu,
        "ROUGE-L": rouge_score,
        "Keyword Overlap": keyword_score
    })

# Build the results table.
results_df = pd.DataFrame(all_records)
if results_df.empty:
    # Fail before touching the CSV: an empty table would overwrite any existing
    # results file and the metric lookups below would fail on missing columns.
    raise RuntimeError("No evaluation records were produced; nothing to save or score.")

# Single definition of the output path, shared by the writer and the final message.
# NOTE(review): the file name mentions "blip2-opt-2.7b" while the checkpoint
# loaded earlier in this notebook is "Salesforce/instructblip-vicuna-7b" —
# confirm the name before comparing result files across models.
OUTPUT_CSV = "blip2-opt-2.7b_ArxivQA_testing_physics.csv"
results_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

# Aggregate metrics (accuracy / macro-F1 are computed on the option indices).
acc = accuracy_score(results_df["True Answer ID"], results_df["Predicted Answer ID"])
f1 = f1_score(results_df["True Answer ID"], results_df["Predicted Answer ID"], average='macro')
avg_bleu = results_df["BLEU"].mean()
avg_rouge = results_df["ROUGE-L"].mean()
avg_keyword_overlap = results_df["Keyword Overlap"].mean()

# Print the summary.
print("\n📊 Summary Metrics:")
print(f"✅ Accuracy: {acc:.4f}")
print(f"✅ F1 Score: {f1:.4f}")
print(f"✅ Avg BLEU: {avg_bleu:.4f}")
print(f"✅ Avg ROUGE-L: {avg_rouge:.4f}")
print(f"✅ Avg Keyword Overlap: {avg_keyword_overlap:.4f}")
print(f"\n📁 结果已保存为 {OUTPUT_CSV}（UTF-8 with BOM）")

