In [1]:
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch

In [2]:
from PIL import Image
from tqdm import tqdm
from transformers import AutoTokenizer
import numpy as np
from torchvision import transforms

def build_filtered_dataset(dataset_name='derek-thomas/ScienceQA',
                           split='train',
                           keep_grades='1-6'):
    """
    构建按年级和图像存在性过滤的数据集。

    参数:
        dataset_name (str): 数据集名称，例如 'derek-thomas/ScienceQA'。
        split (str): 数据分割，例如 'train', 'test', 'validation'。
        keep_grades (str or None): 筛选的年级段："1-6"、"7-12" 或 None 表示不过滤。

    返回:
        List[Dict]: 筛选后的样本列表。
    """

    def is_grade_allowed(grade_str):
        if keep_grades is None:
            return True
        try:
            grade_num = int(grade_str.replace("grade", ""))
            if keep_grades == "1-6":
                return 1 <= grade_num <= 6
            elif keep_grades == "7-12":
                return 7 <= grade_num <= 12
        except:
            return False
        return False



    data = load_dataset(dataset_name, split=split)
    dataset = []

    for i, sample in enumerate(data):
        try:
            if sample.get('question') is None:
                continue
            
            if sample.get("image", None) is None:
                continue

            if not is_grade_allowed(sample.get("grade", "")):
                continue

            solution = sample.get("solution", "")
            lecture = sample.get("lecture", "")
            solution_lecture = f"{solution}\n\n{lecture}".strip()
            
            image = sample["image"].convert("RGB")
            

            # image = np.array(image)
            # image = torch.tensor(image).permute(2, 0, 1)  # shape: (C, H, W)
            dataset.append({
                "image": image, 
                "question": sample["question"],
                "choices": sample["choices"],
                "hint": sample["hint"],
                "answer": sample["answer"],
                "solution_lecture": solution_lecture,
                'grade':sample["grade"],
            })
            
        except Exception as e:
            print(f"跳过第 {i} 个样本，错误：{e}")
            continue
    return dataset

data = build_filtered_dataset(split='test', keep_grades='1-6')
print(f"\n✅ 筛选后的样本数量: {len(data)}")


✅ 筛选后的样本数量: 1429


In [3]:
data[0]

{'image': <PIL.Image.Image image mode=RGB size=452x595>,
 'question': 'What is the name of the colony shown?',
 'choices': ['Maryland', 'New Hampshire', 'Rhode Island', 'Vermont'],
 'hint': '',
 'answer': 1,
 'solution_lecture': 'The colony is New Hampshire.\nDuring the colonial era, New Hampshire and New York both claimed the territory that would later become the state of Vermont. Vermont was never its own colony.',
 'grade': 'grade5'}

In [12]:
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import PeftModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    device_map={"": device},
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

model = PeftModel.from_pretrained(
    base_model,
    "/root/IC_MLLM_VQA/ScienceQA/Pre_fix/qwen2.5vl-prefix2.03B/checkpoint-1360"
)
model.eval()
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [17]:

# 构造消息
def build_message(sample):
    content = []
    if sample['image'] is not None:
        content.append({"type": "image", "image": sample['image']})
    
    question_text = f"Question: {sample['question']}\nChoices:\n"
    for idx, choice in enumerate(sample['choices']):
        question_text += f"{chr(65 + idx)}. {choice}\n"
    
    if sample.get("hint"):
        question_text += f"\nHint: {sample['hint']}\n"
    
    question_text += "Please select the correct answer. Then, explain your reasoning in detail. "
    content.append({"type": "text", "text": question_text})
    
    return [{"role": "user", "content": content}]

# 解析模型输出
import re

def parse_output(output):
    output = output.strip()

    # case 1: "Answer: A Explanation: xxx"
    match = re.search(r"Answer[:：]?\s*([A-D])\b.*?Explanation[:：]?\s*(.+)", output, re.DOTALL)
    if match:
        answer = ord(match.group(1)) - 65
        explanation = match.group(2).strip()
        return answer, explanation

    # case 2: "A Explanation: xxx"
    match = re.match(r"\b([A-D])\s*Explanation[:：]?\s*(.+)", output, re.DOTALL)
    if match:
        answer = ord(match.group(1)) - 65
        explanation = match.group(2).strip()
        return answer, explanation

    # case 3: "A. xxx" or "B: xxx"
    match = re.match(r"\b([A-D])[\.:]\s*(.+)", output, re.DOTALL)
    if match:
        answer = ord(match.group(1)) - 65
        explanation = match.group(2).strip()
        return answer, explanation

    # case 4: only one letter like "C"
    match = re.match(r"^\s*([A-D])\s*$", output)
    if match:
        answer = ord(match.group(1)) - 65
        return answer, ""

    # fallback: try to find first capital letter A-D (unsafe)
    match = re.search(r"\b([A-D])\b", output)
    if match:
        answer = ord(match.group(1)) - 65
        explanation = output[match.end():].strip()
        return answer, explanation

    return -1, ""


# 准备测试数据
N = len(data)
test_dataset = []

for i in range(N):
    sample = data[i]
    try:
        if sample['question'] is None:
            print(f"第 {i} 个样本没有问题，跳过")
            continue

        solution = sample.get("solution", "")
        lecture = sample.get("lecture", "")
        solution_lecture = f"{solution}\n\n{lecture}".strip()

        test_dataset.append({
            "image": sample.get("image", None),
            "question": sample["question"],
            "choices": sample["choices"],
            "answer": sample["answer"],  # 是选项文本
            "hint": sample.get("hint", None),
            "solution_lecture":sample['solution_lecture']
        })
    except Exception as e:
        print(f"跳过第 {i} 个样本，错误：{e}")
        continue

# 初始化工具
rouge = Rouge()
smoothie = SmoothingFunction().method1
vectorizer = CountVectorizer(stop_words="english").fit([s["solution_lecture"] for s in test_dataset])
keywords = set(vectorizer.get_feature_names_out())

# 开始评估
all_records = []

for sample in tqdm(test_dataset):
    messages = build_message(sample)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs = [sample["image"]] if sample["image"] else None
    
    inputs = processor(
    text=[text],
    images=image_inputs,
    padding="max_length",
    return_tensors="pt").to(device)
    
    input_len = inputs["input_ids"].shape[1]
    
    print("input_ids shape:", inputs["input_ids"].shape)
    print("attention_mask shape:", inputs["attention_mask"].shape)
    print("input_ids:", inputs["input_ids"])
    print("attention_mask:", inputs["attention_mask"])
    
    generated_ids = model.generate(**inputs, max_new_tokens=512)
    output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    pred_answer, pred_explanation = parse_output(output)
    true_answer = sample["answer"]

    # BLEU-1 和 BLEU-4
    reference = sample["solution_lecture"].split()
    hypothesis = pred_explanation.split()
    bleu1 = sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0), smoothing_function=smoothie)
    bleu4 = sentence_bleu([reference], hypothesis, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)

    # ROUGE-L
    try:
        rouge_score = rouge.get_scores(pred_explanation, sample["solution_lecture"])[0]["rouge-l"]["f"]
    except:
        rouge_score = 0.0

    # 关键词重叠
    gt_tokens = set(sample["solution_lecture"].lower().split())
    pred_tokens = set(pred_explanation.lower().split())
    overlap = len(gt_tokens & pred_tokens & keywords)
    keyword_score = overlap / max(len(gt_tokens & keywords), 1)

    all_records.append({
        "Question": sample["question"],
        "Choices": "\n".join(sample["choices"]),
        "True Answer": true_answer,
        "Predicted Answer": pred_answer,
        "Predicted Answer Text": sample["choices"][pred_answer] if 0 <= pred_answer < len(sample["choices"]) else "N/A",
        "Model Output": output,
        "Model Explanation": pred_explanation,
        "Reference Solution": sample["solution_lecture"],
        "BLEU-1": bleu1,
        "BLEU-4": bleu4,
        "ROUGE-L": rouge_score,
        "Keyword Overlap": keyword_score
    })

# 结果表格
results_df = pd.DataFrame(all_records)

# 汇总评估
acc = accuracy_score(results_df["True Answer"], results_df["Predicted Answer"])
f1 = f1_score(results_df["True Answer"], results_df["Predicted Answer"], average='macro')
avg_bleu1 = results_df["BLEU-1"].mean()
avg_bleu4 = results_df["BLEU-4"].mean()
avg_rouge = results_df["ROUGE-L"].mean()
avg_keyword_overlap = results_df["Keyword Overlap"].mean()

# 打印汇总结果
print("\nSummary Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Avg BLEU-1: {avg_bleu1:.4f}")
print(f"Avg BLEU-4: {avg_bleu4:.4f}")
print(f"Avg ROUGE-L: {avg_rouge:.4f}")
print(f"Avg Keyword Overlap: {avg_keyword_overlap:.4f}")

# 保存为 CSV
results_df.to_csv("model_evaluation_results_7B_1_6_lora.csv", index=False)
print("\n✅ 结果已保存为 model_evaluation_results.csv")


  0%|          | 0/1429 [00:00<?, ?it/s]

input_ids shape: torch.Size([1, 131072])
attention_mask shape: torch.Size([1, 131072])
input_ids: tensor([[151644,   8948,    198,  ..., 151643, 151643, 151643]],
       device='cuda:0')
attention_mask: tensor([[1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')





IndexError: The shape of the mask [131084] at index 0 does not match the shape of the indexed tensor [131072] at index 0