In [None]:
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
from transformers import AutoProcessor
from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the held-out test split of ScienceQA.
data = load_dataset(
    'derek-thomas/ScienceQA',
    split='test',
)

In [None]:
from datasets import load_dataset

def build_filtered_test_dataset(dataset_name='derek-thomas/ScienceQA',
                                split='test',
                                keep_grades='1-6'):
    """
    Build a grade-filtered list of samples from one split of a dataset.

    The number of returned samples is not capped: every sample of the split
    that has a question and passes the grade filter is kept.

    Args:
        dataset_name (str): Dataset identifier passed to ``load_dataset``,
            e.g. ``'derek-thomas/ScienceQA'``.
        split (str): Split to load, e.g. ``'train'``, ``'test'`` or
            ``'validation'``.
        keep_grades (str or None): Inclusive grade range written as
            ``"low-high"`` (e.g. ``"1-6"`` or ``"7-12"``), or ``None`` to
            disable grade filtering. A value that cannot be parsed as two
            integers separated by ``-`` matches no sample, so the result is
            an empty list.

    Returns:
        list[dict]: One dict per kept sample with the keys ``"image"``,
        ``"question"``, ``"choices"``, ``"answer"`` and ``"grade"``. The
        hint, solution and lecture fields are not copied.
    """
    # Parse the requested range once instead of re-inspecting it per sample.
    grade_bounds = None
    if keep_grades is not None:
        try:
            low, high = (int(part) for part in keep_grades.split("-"))
            grade_bounds = (low, high)
        except (AttributeError, ValueError):
            # Non-string or malformed range: keep the filter active but
            # unmatched, so no sample is accepted.
            grade_bounds = None

    def is_grade_allowed(grade_str):
        """Return True if ``grade_str`` falls inside the requested range."""
        if keep_grades is None:
            return True
        if grade_bounds is None:
            return False
        try:
            # The grade is parsed by stripping the literal "grade" and
            # converting the remainder to int (presumably values look like
            # "grade5" -- confirm against the dataset card).
            grade_num = int(grade_str.replace("grade", ""))
        except (AttributeError, ValueError):
            # Missing, non-string or non-numeric grade: reject the sample.
            return False
        return grade_bounds[0] <= grade_num <= grade_bounds[1]

    data = load_dataset(dataset_name, split=split)
    test_dataset = []

    for i, sample in enumerate(data):
        try:
            # Samples without a question cannot be evaluated; skip silently.
            if sample.get('question') is None:
                continue

            if not is_grade_allowed(sample.get("grade", "")):
                continue

            test_dataset.append({
                "image": sample.get("image", None),
                "question": sample["question"],
                "choices": sample["choices"],
                "answer": sample["answer"],
                "grade": sample["grade"],
            })
        except Exception as e:
            # Deliberate best-effort: a malformed sample (e.g. a missing
            # required key) is reported and skipped rather than aborting
            # the whole build.
            print(f"跳过第 {i} 个样本，错误：{e}")
            continue

    return test_dataset

In [None]:
# Rebuild `data` from the validation split, keeping only grades 1 through 6.
data = build_filtered_test_dataset(
    dataset_name='derek-thomas/ScienceQA',
    split='validation',
    keep_grades='1-6',
)