In [2]:
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch

In [3]:
data = load_dataset('derek-thomas/ScienceQA', split='test') # choose the test set

In [None]:
# N = data.num_rows   # 你可以自由改这个数
N = 2
# 构建 test_dataset

for i in range(N):
    sample = data[i]
    try:
        if sample['question'] is None:
            print(f"第 {i} 个样本没有问题，跳过")
            continue

        solution = sample.get("solution", "")
        lecture = sample.get("lecture", "")
        solution_lecture = f"{solution}\n\n{lecture}".strip()

        test_dataset.append({
            "image": sample.get("image", None), 
            "question": sample["question"],
            "choices": sample["choices"],
            "hint": sample["hint"],
            
            "answer": sample["answer"],
            "solution_lecture": solution_lecture
        })
    except Exception as e:
        print(f"跳过第 {i} 个样本，错误：{e}")
        continue



In [5]:
test_dataset

[{'image': None,
  'question': 'Which figure of speech is used in this text?\nSing, O goddess, the anger of Achilles son of Peleus, that brought countless ills upon the Achaeans.\n—Homer, The Iliad',
  'choices': ['chiasmus', 'apostrophe'],
  'hint': '',
  'answer': 1,
  'solution_lecture': 'The text uses apostrophe, a direct address to an absent person or a nonhuman entity.\nO goddess is a direct address to a goddess, a nonhuman entity.\n\nFigures of speech are words or phrases that use language in a nonliteral or unusual way. They can make writing more expressive.\nAnaphora is the repetition of the same word or words at the beginning of several phrases or clauses.\nWe are united. We are powerful. We are winners.\nAntithesis involves contrasting opposing ideas within a parallel grammatical structure.\nI want to help, not to hurt.\nApostrophe is a direct address to an absent person or a nonhuman entity.\nOh, little bird, what makes you sing so beautifully?\nAssonance is the repetition 

In [4]:
# 设置设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 加载模型和处理器
model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    device_map={"": device},
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# 构造消息
def build_message(sample):
    content = []
    if sample['image'] is not None:
        content.append({"type": "image", "image": sample['image']})
    
    question_text = f"Question: {sample['question']}\nChoices:\n"
    for idx, choice in enumerate(sample['choices']):
        question_text += f"{chr(65 + idx)}. {choice}\n"
    
    if sample.get("hint"):
        question_text += f"\nHint: {sample['hint']}\n"
    
    question_text += "Please select the correct answer and explain why."
    content.append({"type": "text", "text": question_text})
    
    return [{"role": "user", "content": content}]

# 解析模型输出
def parse_output(output):
    output = output.strip()
    answer_match = re.search(r"\b([A-D])[\.\:]", output)
    if answer_match:
        answer = ord(answer_match.group(1)) - 65
    else:
        answer = -1
    explanation = ""
    if answer != -1:
        idx = output.find(answer_match.group(0))
        if idx != -1:
            explanation = output[idx + len(answer_match.group(0)):].strip()
    return answer, explanation

# 准备测试数据
N = data.num_rows
test_dataset = []

for i in range(N):
    sample = data[i]
    try:
        if sample['question'] is None:
            print(f"第 {i} 个样本没有问题，跳过")
            continue

        solution = sample.get("solution", "")
        lecture = sample.get("lecture", "")
        solution_lecture = f"{solution}\n\n{lecture}".strip()

        test_dataset.append({
            "image": sample.get("image", None),
            "question": sample["question"],
            "choices": sample["choices"],
            "answer": sample["answer"],  # 是选项文本
            "hint": sample.get("hint", None),
            "solution_lecture": solution_lecture
        })
    except Exception as e:
        print(f"跳过第 {i} 个样本，错误：{e}")
        continue

# 初始化工具
rouge = Rouge()
smoothie = SmoothingFunction().method1
vectorizer = CountVectorizer(stop_words="english").fit([s["solution_lecture"] for s in test_dataset])
keywords = set(vectorizer.get_feature_names_out())

# 开始评估
all_records = []

for sample in tqdm(test_dataset):
    messages = build_message(sample)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs = [sample["image"]] if sample["image"] else None

    inputs = processor(text=[text], images=image_inputs, return_tensors="pt", padding=True).to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=512)
    output = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)[0]

    pred_answer, pred_explanation = parse_output(output)
    true_answer = sample["answer"]

    # BLEU-1 和 BLEU-4
    reference = sample["solution_lecture"].split()
    hypothesis = pred_explanation.split()
    bleu1 = sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu4 = sentence_bleu([reference], hypothesis, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

    # ROUGE-L
    try:
        rouge_score = rouge.get_scores(pred_explanation, sample["solution_lecture"])[0]["rouge-l"]["f"]
    except:
        rouge_score = 0.0

    # 关键词重叠
    gt_tokens = set(sample["solution_lecture"].lower().split())
    pred_tokens = set(pred_explanation.lower().split())
    overlap = len(gt_tokens & pred_tokens & keywords)
    keyword_score = overlap / max(len(gt_tokens & keywords), 1)

    all_records.append({
        "Question": sample["question"],
        "Choices": "\n".join(sample["choices"]),
        "True Answer": true_answer,
        "Predicted Answer": pred_answer,
        "Predicted Answer Text": sample["choices"][pred_answer] if 0 <= pred_answer < len(sample["choices"]) else "N/A",
        "Model Output": output,
        "Model Explanation": pred_explanation,
        "Reference Solution": sample["solution_lecture"],
        "BLEU-1": bleu1,
        "BLEU-4": bleu4,
        "ROUGE-L": rouge_score,
        "Keyword Overlap": keyword_score
    })

# 结果表格
results_df = pd.DataFrame(all_records)

# 汇总评估
acc = accuracy_score(results_df["True Answer"], results_df["Predicted Answer"])
f1 = f1_score(results_df["True Answer"], results_df["Predicted Answer"], average='macro')
avg_bleu1 = results_df["BLEU-1"].mean()
avg_bleu4 = results_df["BLEU-4"].mean()
avg_rouge = results_df["ROUGE-L"].mean()
avg_keyword_overlap = results_df["Keyword Overlap"].mean()

# 打印汇总结果
print("\nSummary Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Avg BLEU-1: {avg_bleu1:.4f}")
print(f"Avg BLEU-4: {avg_bleu4:.4f}")
print(f"Avg ROUGE-L: {avg_rouge:.4f}")
print(f"Avg Keyword Overlap: {avg_keyword_overlap:.4f}")

# 保存为 CSV
results_df.to_csv("model_evaluation_results_7B.csv", index=False)
print("\n✅ 结果已保存为 model_evaluation_results.csv")


Using device: cuda


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 5/5 [01:47<00:00, 21.48s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
100%|██████████| 4241/4241 [5:06:48<00:00,  4.34s/it]  



Summary Metrics:
Accuracy: 0.8498
F1 Score: 0.5726
Avg BLEU-1: 0.2187
Avg BLEU-4: 0.0638
Avg ROUGE-L: 0.3241
Avg Keyword Overlap: 0.3158

✅ 结果已保存为 model_evaluation_results.csv
