In [2]:
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

2025-04-19 14:38:28.750681: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-19 14:38:29.343752: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-19 14:38:29.343836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-19 14:38:29.448520: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-19 14:38:29.660283: I tensorflow/core/platform/cpu_feature_guar

In [None]:
# Load the ScienceQA test split
data = load_dataset(
    'derek-thomas/ScienceQA',
    split='test',
)

In [None]:
from datasets import load_dataset

def build_filtered_test_dataset(dataset_name='derek-thomas/ScienceQA',
                                 split='test',
                                 keep_grades='1-6'):
    """
    Build a grade-filtered list of question/answer samples (no cap on sample count).

    This variant intentionally leaves the hint and the solution/lecture text
    out of each returned sample.

    Args:
        dataset_name (str): Dataset identifier passed to ``load_dataset``,
            e.g. 'derek-thomas/ScienceQA'.
        split (str): Split to load, e.g. 'train', 'test', 'validation'.
        keep_grades (str or None): Grade band to keep: "1-6", "7-12", or
            None for no filtering. Any other value keeps no samples.

    Returns:
        List[Dict]: One dict per kept sample with the keys
        'image', 'question', 'choices', 'answer' and 'grade'.
    """

    def is_grade_allowed(grade_str):
        """Return True when ``grade_str`` (e.g. "grade3") is inside the requested band."""
        if keep_grades is None:
            return True
        try:
            grade_num = int(grade_str.replace("grade", "").strip())
        except (AttributeError, ValueError):
            # Missing / non-string grade, or no parseable number: treat as not allowed.
            return False
        if keep_grades == "1-6":
            return 1 <= grade_num <= 6
        if keep_grades == "7-12":
            return 7 <= grade_num <= 12
        return False

    data = load_dataset(dataset_name, split=split)
    test_dataset = []

    for i, sample in enumerate(data):
        try:
            # Samples without a question cannot be evaluated.
            if sample.get('question') is None:
                continue

            if not is_grade_allowed(sample.get("grade", "")):
                continue

            test_dataset.append({
                "image": sample.get("image", None),
                "question": sample["question"],
                "choices": sample["choices"],
                "answer": sample["answer"],
                'grade': sample["grade"],
            })

        except Exception as e:
            # Best effort: report the malformed sample and keep going.
            print(f"跳过第 {i} 个样本，错误：{e}")
            continue

    return test_dataset

In [None]:
# Build the grade 1-6 subset of the validation split.
# The evaluation cell further down iterates over `test_dataset`, which is not
# assigned anywhere before it in top-to-bottom order (NameError on
# Restart & Run All), so bind the result to that name here. `data` is kept as
# an alias so any existing use of the old name still works.
test_dataset = build_filtered_test_dataset(dataset_name='derek-thomas/ScienceQA',
                                           split='validation',
                                           keep_grades='1-6')
data = test_dataset

In [4]:
# Pick the compute device: use the GPU when one is available
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print("Using device:", device)

# Half precision is faster on GPU; fall back to float32 on CPU
load_dtype = torch.float16 if has_cuda else torch.float32

# Load the model directly onto the chosen device (cuda or cpu)
model = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype=load_dtype,
    device_map={"": device},
)

# Load the processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

Using device: cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:49<00:00, 24.54s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

# Build the model input
def build_message(sample):
    """Turn one sample into a single-turn chat message list.

    The user turn holds an optional image entry (only when ``sample['image']``
    is not None) followed by one text entry with the question, the lettered
    choices (A, B, C, ...) and the instruction to pick the correct answer.
    """
    parts = []
    image = sample['image']
    if image is not None:
        parts.append({"type": "image", "image": image})

    # Assemble the prompt from its pieces instead of growing a string in place.
    lettered = [
        f"{chr(65 + position)}. {option}\n"
        for position, option in enumerate(sample['choices'])
    ]
    prompt = "".join(
        [f"Question: {sample['question']}\nChoices:\n"]
        + lettered
        + ["Please select the correct answer."]
    )
    parts.append({"type": "text", "text": prompt})

    return [{"role": "user", "content": parts}]

# Parse the chosen option letter out of the model output
def parse_output(output):
    """Map the option letter found in ``output`` to a 0-based choice index.

    Patterns are tried from most to least specific so that an incidental
    capital letter (e.g. the article "A" opening a sentence) does not win over
    an explicit statement such as "The answer is B":

    1. a cue word ("answer" / "option", any case) followed by a letter,
    2. a letter directly followed by '.', ':' or ')',
    3. the first standalone capital letter.

    Option letters are matched case-sensitively in the range A-E.

    Args:
        output (str): Decoded model response.

    Returns:
        int: 0 for A, 1 for B, ...; -1 when no option letter is found.
    """
    output = output.strip()
    patterns = (
        r"(?i:answer|option)(?i:\s+is)?[\s:\-\(\*]*([A-E])\b",
        r"(?<![A-Za-z])([A-E])[\.\:\)]",
        r"\b([A-E])\b",
    )
    for pattern in patterns:
        match = re.search(pattern, output)
        if match:
            return ord(match.group(1)) - 65  # A -> 0, B -> 1, etc.
    return -1

# Per-sample evaluation records
all_records = []

# Run the model on every sample and score the parsed prediction
for sample in tqdm(test_dataset):
    messages = build_message(sample)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Same `is not None` test as build_message, so an image is passed exactly
    # when the prompt contains an image entry.
    image_inputs = [sample["image"]] if sample["image"] is not None else None

    # Follow the model's device instead of hard-coding "cuda" (the model may be on CPU).
    inputs = processor(text=[text], images=image_inputs, return_tensors="pt", padding=True).to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    output = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)[0]

    pred_answer = parse_output(output)

    all_records.append({
        "Question": sample["question"],
        "Choices": "\n".join(sample["choices"]),
        "True Answer": sample["answer"],
        "Predicted Answer": pred_answer,
        "Model Output": output,
        "Correct": pred_answer == sample["answer"]
    })

# Aggregate and export
results_df = pd.DataFrame(all_records)
acc = accuracy_score(results_df["True Answer"], results_df["Predicted Answer"])
# -1 is the "could not parse" sentinel, not an answer class: keep it out of the
# macro average (such predictions still count as misses for the true class).
answer_labels = sorted((set(results_df["True Answer"]) | set(results_df["Predicted Answer"])) - {-1})
f1 = f1_score(results_df["True Answer"], results_df["Predicted Answer"],
              labels=answer_labels, average='macro')

print("\nSummary Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

# Save the full results
results_df.to_csv("Qwen3B_1_6_just_QA.csv", index=False)

# Export the wrong predictions for error analysis
error_df = results_df[~results_df["Correct"]]
error_df.to_csv("Qwen3B_1_6_errors.csv", index=False)

# Plot how many predictions were right vs. wrong (optional).
# value_counts() orders by frequency, so pin the order to [False, True]
# to keep the bars aligned with the fixed tick labels below.
correct_counts = results_df["Correct"].value_counts().reindex([False, True], fill_value=0)
correct_counts.plot(kind='bar')
plt.xticks([0, 1], ['Wrong', 'Correct'], rotation=0)
plt.ylabel("Count")
plt.title("Prediction Correctness Distribution")
plt.tight_layout()
plt.show()


# with Hint

In [5]:
from datasets import load_dataset

def build_filtered_test_dataset(dataset_name='derek-thomas/ScienceQA',
                                 split='test',
                                 keep_grades='1-6'):
    """
    Load a dataset split and keep the samples whose grade is in the requested band
    (the number of samples is not capped).

    Args:
        dataset_name (str): Dataset name, e.g. 'derek-thomas/ScienceQA'.
        split (str): Split to load, e.g. 'train', 'test', 'validation'.
        keep_grades (str or None): Grade band to keep: "1-6", "7-12", or
            None to disable filtering.

    Returns:
        List[Dict]: The kept samples, each with the keys
        'image', 'question', 'choices', 'hint', 'answer' and 'grade'.
    """
    # Inclusive (lowest, highest) grade for every supported band.
    grade_bands = {"1-6": (1, 6), "7-12": (7, 12)}

    def is_grade_allowed(grade_str):
        # No band requested: every sample passes.
        if keep_grades is None:
            return True
        try:
            grade_num = int(grade_str.replace("grade", "").strip())
            lowest, highest = grade_bands[keep_grades]
        except Exception:
            # Unparseable grade or unsupported band: reject the sample.
            return False
        return lowest <= grade_num <= highest

    print(f"Loading dataset: {dataset_name}, split: {split}")
    raw_split = load_dataset(dataset_name, split=split)
    kept = []

    for idx, row in enumerate(raw_split):
        try:
            has_question = row.get('question') is not None
            if has_question and is_grade_allowed(row.get("grade", "")):
                record = {
                    "image": row.get("image", None),
                    "question": row["question"],
                    "choices": row["choices"],
                    "hint": row.get("hint", ""),
                    "answer": row["answer"],
                    "grade": row.get("grade", ""),
                }
                kept.append(record)
        except Exception as err:
            # Report the broken sample and move on to the next one.
            print(f"跳过第 {idx} 个样本，错误：{err}")

    print(f"最终保留样本数量: {len(kept)}")
    return kept


In [3]:
# Build the grade 1-6 subset of the test split used by the evaluation loop
test_dataset = build_filtered_test_dataset(
    dataset_name='derek-thomas/ScienceQA',
    split='test',
    keep_grades='1-6',
)

Loading dataset: derek-thomas/ScienceQA, split: test
最终保留样本数量: 2724


In [6]:
# Spot-check one filtered sample; kept as the cell's bare last expression so the notebook displays it.
test_dataset[10]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=312x237>,
 'question': 'Which better describes the Daintree rain forest ecosystem?',
 'choices': ['It has year-round rain. It also has soil that is poor in nutrients.',
  'It has cold winters. It also has many different types of organisms.'],
 'hint': 'Figure: Daintree rain forest.\nThe Daintree rain forest is a tropical rain forest ecosystem in northeastern Australia.',
 'answer': 0,
 'grade': 'grade3'}

In [8]:
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

# Build the model input (the hint is added automatically when present)
def build_message(sample):
    """Turn one sample into a single-turn chat message list.

    The user turn holds an optional image entry (only when ``sample['image']``
    is not None) followed by one text entry made of: an optional
    "Hint: ..." paragraph (only when the sample has a non-empty 'hint'),
    the question, the lettered choices (A, B, C, ...) and the instruction to
    pick the correct answer.
    """
    parts = []
    image = sample['image']
    if image is not None:
        parts.append({"type": "image", "image": image})

    # Collect the prompt pieces in order and join them once at the end.
    pieces = []
    hint = sample["hint"] if "hint" in sample else None
    if hint:
        pieces.append(f"Hint: {hint}\n\n")
    pieces.append(f"Question: {sample['question']}\nChoices:\n")
    pieces.extend(
        f"{chr(65 + position)}. {option}\n"
        for position, option in enumerate(sample['choices'])
    )
    pieces.append("Please select the correct answer.")

    parts.append({"type": "text", "text": "".join(pieces)})
    return [{"role": "user", "content": parts}]

# Parse the chosen option letter out of the model output
def parse_output(output):
    """Map the option letter found in ``output`` to a 0-based choice index.

    Patterns are tried from most to least specific:

    1. a cue word ("answer" / "option", any case) followed by a letter,
       e.g. "Answer: A.", "The correct answer is B", "Option C",
    2. a letter directly followed by '.', ':' or ')', e.g. "B.",
    3. the first standalone capital letter.

    Only the cue words are case-insensitive. The option letter itself must be
    a capital A-E that is not the tail of a longer word; matching it
    case-insensitively would turn any word ending in a-d before a period
    (e.g. "cold.") into an answer.

    Args:
        output (str): Decoded model response.

    Returns:
        int: 0 for A, 1 for B, ...; -1 when no option letter is found.
    """
    output = output.strip()
    patterns = (
        r"(?i:answer|option)(?i:\s+is)?[\s:\-\(\*]*([A-E])\b",
        r"(?<![A-Za-z])([A-E])[\.\:\)]",
        r"\b([A-E])\b",
    )
    for pattern in patterns:
        match = re.search(pattern, output)
        if match:
            return ord(match.group(1)) - 65  # A -> 0, B -> 1, etc.

    return -1

# Per-sample evaluation records
all_records = []

# Debug cap on the number of evaluated samples; set to None to run the full set.
# The slice goes into its own variable so `test_dataset` is not truncated and
# re-running this cell stays idempotent.
max_eval_samples = 2
eval_samples = test_dataset if max_eval_samples is None else test_dataset[:max_eval_samples]

# Keep the hint run in its own file: the no-hint run already writes
# "Qwen3B_1_6_just_QA.csv", and this is the name the message below reports.
output_csv = "Qwen3B_1_6_just_QA1.csv"

# Run the model on every sample and score the parsed prediction
for sample in tqdm(eval_samples):
    messages = build_message(sample)
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Same `is not None` test as build_message, so an image is passed exactly
    # when the prompt contains an image entry.
    image_inputs = [sample["image"]] if sample["image"] is not None else None

    # Follow the model's device instead of hard-coding "cuda" (the model may be on CPU).
    inputs = processor(text=[text], images=image_inputs, return_tensors="pt", padding=True).to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    output = processor.batch_decode(generated_ids[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)[0]

    pred_answer = parse_output(output)

    all_records.append({
        "Question": sample["question"],
        "Choices": "\n".join(sample["choices"]),
        "True Answer": sample["answer"],
        "Predicted Answer": pred_answer,
        "Model Output": output,
        "Correct": pred_answer == sample["answer"]
    })

# Aggregate and export
results_df = pd.DataFrame(all_records)
acc = accuracy_score(results_df["True Answer"], results_df["Predicted Answer"])
# -1 is the "could not parse" sentinel, not an answer class: keep it out of the
# macro average (such predictions still count as misses for the true class).
answer_labels = sorted((set(results_df["True Answer"]) | set(results_df["Predicted Answer"])) - {-1})
f1 = f1_score(results_df["True Answer"], results_df["Predicted Answer"],
              labels=answer_labels, average='macro')

print("\nSummary Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
results_df.to_csv(output_csv, index=False)
print(f"\n✅ 答案评估结果已保存为 {output_csv}")


100%|██████████| 2/2 [00:01<00:00,  1.51it/s]


Summary Metrics:
Accuracy: 1.0000
F1 Score: 1.0000

✅ 答案评估结果已保存为 Qwen3B_1_6_just_QA1.csv



