In [2]:
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

In [3]:
data = load_dataset('derek-thomas/ScienceQA', split='test') # choose the test set

In [4]:
N = data.num_rows   # 你可以自由改这个数
# N = 10
# 构建 test_dataset
test_dataset = []

for i in range(N):
    sample = data[i]
    try:
        if sample['question'] is None:
            print(f"第 {i} 个样本没有问题，跳过")
            continue

        test_dataset.append({
            "image": sample.get("image", None), 
            "question": sample["question"],
            "choices": sample["choices"],
            "answer": sample["answer"],
            "solution": sample["solution"]
        })
    except Exception as e:
        print(f"跳过第 {i} 个样本，错误：{e}")
        continue


In [8]:
# 设置设备：如果有 GPU 就用 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 加载模型到 GPU
model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    device_map={"": device},      # 明确放到 cuda 或 cpu
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32  # GPU 使用半精度更快
)

# 加载处理器
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")


# 构造消息
def build_message(sample):
    content = []
    if sample['image'] is not None:
        content.append({"type": "image", "image": sample['image']})
    question_text = f"Question: {sample['question']}\nChoices:\n"
    for idx, choice in enumerate(sample['choices']):
        question_text += f"{chr(65 + idx)}. {choice}\n"
    question_text += "Please select the correct answer and explain why."
    content.append({"type": "text", "text": question_text})
    return [{"role": "user", "content": content}]

# 解析模型输出
def parse_output(output):
    output = output.strip()
    answer_match = re.search(r"\b([A-D])[\.\:]", output)
    if answer_match:
        answer = ord(answer_match.group(1)) - 65
    else:
        answer = -1
    explanation = ""
    if answer != -1:
        idx = output.find(answer_match.group(0))
        if idx != -1:
            explanation = output[idx + len(answer_match.group(0)):].strip()
    return answer, explanation

# 初始化指标容器
rouge = Rouge()
smoothie = SmoothingFunction().method1
vectorizer = CountVectorizer(stop_words="english").fit([s["solution"] for s in test_dataset])
keywords = set(vectorizer.get_feature_names_out())

all_records = []

# 开始评估
for sample in tqdm(test_dataset):
    # 构建输入
    messages = build_message(sample)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs = [sample["image"]] if sample["image"] else None

    inputs = processor(text=[text], images=image_inputs, return_tensors="pt", padding=True).to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    output = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)[0]

    # 解析结果
    pred_answer, pred_explanation = parse_output(output)

    # 计算 BLEU
    if pred_explanation:
        bleu = sentence_bleu([sample["solution"].split()], pred_explanation.split(), smoothing_function=smoothie)
    else:
        bleu = 0.0

    # 计算 ROUGE-L
    try:
        rouge_score = rouge.get_scores(pred_explanation, sample["solution"])[0]["rouge-l"]["f"]
    except:
        rouge_score = 0.0

    # 计算关键词重叠
    gt_tokens = set(sample["solution"].lower().split())
    pred_tokens = set(pred_explanation.lower().split())
    overlap = len(gt_tokens & pred_tokens & keywords)
    keyword_score = overlap / max(len(gt_tokens & keywords), 1)

    # 存入结果
    all_records.append({
        "Question": sample["question"],
        "Choices": "\n".join(sample["choices"]),
        "True Answer": sample["answer"],
        "Predicted Answer": pred_answer,
        "Model Output": output,
        "Model Explanation": pred_explanation,
        "Reference Solution": sample["solution"],
        "BLEU": bleu,
        "ROUGE-L": rouge_score,
        "Keyword Overlap": keyword_score
    })

# 构建结果表格
results_df = pd.DataFrame(all_records)

# 汇总指标
acc = accuracy_score(results_df["True Answer"], results_df["Predicted Answer"])
f1 = f1_score(results_df["True Answer"], results_df["Predicted Answer"], average='macro')
avg_bleu = results_df["BLEU"].mean()
avg_rouge = results_df["ROUGE-L"].mean()
avg_keyword_overlap = results_df["Keyword Overlap"].mean()

# 打印结果
print("\nSummary Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Avg BLEU: {avg_bleu:.4f}")
print(f"Avg ROUGE-L: {avg_rouge:.4f}")
print(f"Avg Keyword Overlap: {avg_keyword_overlap:.4f}")

# 保存为 CSV
results_df.to_csv("model_evaluation_results.csv", index=False)
print("\n✅ 结果已保存为 model_evaluation_results.csv")


Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.09s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
100%|██████████| 4241/4241 [4:33:27<00:00,  3.87s/it]  


Summary Metrics:
Accuracy: 0.7484
F1 Score: 0.5114
Avg BLEU: 0.0787
Avg ROUGE-L: 0.2971
Avg Keyword Overlap: 0.4165

✅ 结果已保存为 model_evaluation_results.csv



