In [None]:
import os

# Read the OpenAI API key from the environment.
# The key is expected to be exported already, either in ~/.bashrc or via
# `export OPENAI_API_KEY=...` in the shell that launched the kernel.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# An empty or whitespace-only value is as unusable as a missing one, so fail
# fast here instead of surfacing an authentication error on the first request.
if OPENAI_API_KEY is None or not OPENAI_API_KEY.strip():
    raise ValueError(
        "OPENAI_API_KEY 环境变量未设置。请运行以下命令之一：\n"
        "1. export OPENAI_API_KEY='your-api-key'\n"
        "2. 或者在 ~/.bashrc 中添加: export OPENAI_API_KEY='your-api-key' 然后重启终端"
    )
print("✓ OpenAI API 密钥已从环境变量加载")

✓ OpenAI API 密钥已从环境变量加载


In [5]:
# !pip install openai transformers datasets accelerate torch pandas pyarrow tqdm
# 'accelerate' 是为了更快地加载和运行模型
# 'pyarrow' 是为了将 DataFrame 保存为 parquet 格式
# 'openai' 用于 GPT-3.5 API

import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from openai import OpenAI
from torch.nn.functional import softmax
from tqdm import tqdm
import os

# Pick the compute device: CUDA when a GPU is visible (Colab or local), CPU otherwise.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# Build the OpenAI client from the key loaded earlier in the notebook
# (the OPENAI_API_KEY environment variable must be set).
client = OpenAI(api_key=OPENAI_API_KEY)

Using device: cuda


In [6]:
# Model used for every request: GPT-3.5 Turbo.
# It is reached through the OpenAI API, so nothing is loaded locally;
# the OPENAI_API_KEY environment variable must already be set.
MODEL_ID = "gpt-3.5-turbo"

print(
    f"Using model: {MODEL_ID} via OpenAI API",
    "Model ready to use.",
    sep="\n",
)

Using model: gpt-3.5-turbo via OpenAI API
Model ready to use.


In [7]:
# Discover every subject (dataset config) available for the MMLU dataset.
# The 'all' config is excluded, then the subjects from index 21 onward are kept for annotation.
print("Fetching all available MMLU subjects...")
from datasets import get_dataset_config_names

# Fetch every available config name (one per subject)
all_subjects = get_dataset_config_names("cais/mmlu")
print(f"Found {len(all_subjects)} subjects in MMLU dataset")

# Exclude the config named 'all'
filtered_subjects = [s for s in all_subjects if s != 'all']
print(f"After filtering out 'all', {len(filtered_subjects)} subjects remain")

# Keep the subjects from index 21 to the end of the list (not the first 20).
# NOTE(review): presumably the earlier subjects are handled in a separate run — confirm the split point.
SUBJECTS = filtered_subjects[21:]
print(f"Selected last {len(SUBJECTS)} subjects for processing:")
print(f"Subjects: {SUBJECTS}")

# Holds every loaded split, keyed by subject name
mmlu_data = {}

for subject in SUBJECTS:
    # Load the "test" split of each subject.
    # NOTE(review): the original note said "validation" is unlabeled — verify against the dataset card.
    try:
        dataset = load_dataset("cais/mmlu", subject, split="test")
        mmlu_data[subject] = dataset
        print(f"Loaded {len(dataset)} questions for subject: {subject}")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining subjects
        print(f"Failed to load {subject}: {e}")

print(f"\nSuccessfully loaded {len(mmlu_data)} subjects out of {len(SUBJECTS)} selected subjects")

# Answer letters used to label the options of a question.
# NOTE(review): six letters are defined here; confirm how many options the questions actually have.
CHOICES = ["A", "B", "C", "D", "E", "F"]

Fetching all available MMLU subjects...
Found 59 subjects in MMLU dataset
After filtering out 'all', 58 subjects remain
Selected last 37 subjects for processing:
Subjects: ['high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']


high_school_computer_science/test-00000-(…):   0%|          | 0.00/27.3k [00:00<?, ?B/s]

high_school_computer_science/validation-(…):   0%|          | 0.00/5.28k [00:00<?, ?B/s]

high_school_computer_science/dev-00000-o(…):   0%|          | 0.00/6.54k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 100 questions for subject: high_school_computer_science


high_school_european_history/test-00000-(…):   0%|          | 0.00/142k [00:00<?, ?B/s]

high_school_european_history/validation-(…):   0%|          | 0.00/31.6k [00:00<?, ?B/s]

high_school_european_history/dev-00000-o(…):   0%|          | 0.00/22.2k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/165 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 165 questions for subject: high_school_european_history


high_school_geography/test-00000-of-0000(…):   0%|          | 0.00/28.2k [00:00<?, ?B/s]

high_school_geography/validation-00000-o(…):   0%|          | 0.00/6.16k [00:00<?, ?B/s]

high_school_geography/dev-00000-of-00001(…):   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/198 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 198 questions for subject: high_school_geography


high_school_government_and_politics/test(…):   0%|          | 0.00/40.2k [00:00<?, ?B/s]

high_school_government_and_politics/vali(…):   0%|          | 0.00/8.27k [00:00<?, ?B/s]

high_school_government_and_politics/dev-(…):   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/193 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 193 questions for subject: high_school_government_and_politics


high_school_macroeconomics/test-00000-of(…):   0%|          | 0.00/54.8k [00:00<?, ?B/s]

high_school_macroeconomics/validation-00(…):   0%|          | 0.00/9.89k [00:00<?, ?B/s]

high_school_macroeconomics/dev-00000-of-(…):   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/390 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/43 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 390 questions for subject: high_school_macroeconomics


high_school_mathematics/test-00000-of-00(…):   0%|          | 0.00/33.7k [00:00<?, ?B/s]

high_school_mathematics/validation-00000(…):   0%|          | 0.00/6.99k [00:00<?, ?B/s]

high_school_mathematics/dev-00000-of-000(…):   0%|          | 0.00/4.50k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/270 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 270 questions for subject: high_school_mathematics


high_school_microeconomics/test-00000-of(…):   0%|          | 0.00/38.8k [00:00<?, ?B/s]

high_school_microeconomics/validation-00(…):   0%|          | 0.00/7.22k [00:00<?, ?B/s]

high_school_microeconomics/dev-00000-of-(…):   0%|          | 0.00/3.83k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/238 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 238 questions for subject: high_school_microeconomics


high_school_physics/test-00000-of-00001.(…):   0%|          | 0.00/33.0k [00:00<?, ?B/s]

high_school_physics/validation-00000-of-(…):   0%|          | 0.00/7.96k [00:00<?, ?B/s]

high_school_physics/dev-00000-of-00001.p(…):   0%|          | 0.00/4.57k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/151 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 151 questions for subject: high_school_physics


high_school_psychology/test-00000-of-000(…):   0%|          | 0.00/92.8k [00:00<?, ?B/s]

high_school_psychology/validation-00000-(…):   0%|          | 0.00/15.2k [00:00<?, ?B/s]

high_school_psychology/dev-00000-of-0000(…):   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/545 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 545 questions for subject: high_school_psychology


high_school_statistics/test-00000-of-000(…):   0%|          | 0.00/58.0k [00:00<?, ?B/s]

high_school_statistics/validation-00000-(…):   0%|          | 0.00/10.9k [00:00<?, ?B/s]

high_school_statistics/dev-00000-of-0000(…):   0%|          | 0.00/6.07k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/216 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 216 questions for subject: high_school_statistics
Loaded 204 questions for subject: high_school_us_history


high_school_world_history/test-00000-of-(…):   0%|          | 0.00/202k [00:00<?, ?B/s]

high_school_world_history/validation-000(…):   0%|          | 0.00/38.5k [00:00<?, ?B/s]

high_school_world_history/dev-00000-of-0(…):   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/237 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/26 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 237 questions for subject: high_school_world_history


human_aging/test-00000-of-00001.parquet:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

human_aging/validation-00000-of-00001.pa(…):   0%|          | 0.00/6.28k [00:00<?, ?B/s]

human_aging/dev-00000-of-00001.parquet:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/223 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 223 questions for subject: human_aging


human_sexuality/test-00000-of-00001.parq(…):   0%|          | 0.00/23.2k [00:00<?, ?B/s]

human_sexuality/validation-00000-of-0000(…):   0%|          | 0.00/5.26k [00:00<?, ?B/s]

human_sexuality/dev-00000-of-00001.parqu(…):   0%|          | 0.00/4.08k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/131 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 131 questions for subject: human_sexuality


international_law/test-00000-of-00001.pa(…):   0%|          | 0.00/29.5k [00:00<?, ?B/s]

international_law/validation-00000-of-00(…):   0%|          | 0.00/7.12k [00:00<?, ?B/s]

international_law/dev-00000-of-00001.par(…):   0%|          | 0.00/4.96k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/121 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 121 questions for subject: international_law


jurisprudence/test-00000-of-00001.parque(…):   0%|          | 0.00/23.3k [00:00<?, ?B/s]

jurisprudence/validation-00000-of-00001.(…):   0%|          | 0.00/6.21k [00:00<?, ?B/s]

jurisprudence/dev-00000-of-00001.parquet:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/108 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 108 questions for subject: jurisprudence


logical_fallacies/test-00000-of-00001.pa(…):   0%|          | 0.00/23.0k [00:00<?, ?B/s]

logical_fallacies/validation-00000-of-00(…):   0%|          | 0.00/6.52k [00:00<?, ?B/s]

logical_fallacies/dev-00000-of-00001.par(…):   0%|          | 0.00/4.12k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/163 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 163 questions for subject: logical_fallacies


machine_learning/test-00000-of-00001.par(…):   0%|          | 0.00/19.7k [00:00<?, ?B/s]

machine_learning/validation-00000-of-000(…):   0%|          | 0.00/6.17k [00:00<?, ?B/s]

machine_learning/dev-00000-of-00001.parq(…):   0%|          | 0.00/5.25k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/112 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 112 questions for subject: machine_learning


management/test-00000-of-00001.parquet:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

management/validation-00000-of-00001.par(…):   0%|          | 0.00/4.50k [00:00<?, ?B/s]

management/dev-00000-of-00001.parquet:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/103 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 103 questions for subject: management


marketing/test-00000-of-00001.parquet:   0%|          | 0.00/37.3k [00:00<?, ?B/s]

marketing/validation-00000-of-00001.parq(…):   0%|          | 0.00/8.21k [00:00<?, ?B/s]

marketing/dev-00000-of-00001.parquet:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/234 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/25 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 234 questions for subject: marketing


medical_genetics/test-00000-of-00001.par(…):   0%|          | 0.00/16.4k [00:00<?, ?B/s]

medical_genetics/validation-00000-of-000(…):   0%|          | 0.00/5.63k [00:00<?, ?B/s]

medical_genetics/dev-00000-of-00001.parq(…):   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 100 questions for subject: medical_genetics


miscellaneous/test-00000-of-00001.parque(…):   0%|          | 0.00/98.6k [00:00<?, ?B/s]

miscellaneous/validation-00000-of-00001.(…):   0%|          | 0.00/13.2k [00:00<?, ?B/s]

miscellaneous/dev-00000-of-00001.parquet:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/783 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/86 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 783 questions for subject: miscellaneous


moral_disputes/test-00000-of-00001.parqu(…):   0%|          | 0.00/60.9k [00:00<?, ?B/s]

moral_disputes/validation-00000-of-00001(…):   0%|          | 0.00/10.7k [00:00<?, ?B/s]

moral_disputes/dev-00000-of-00001.parque(…):   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/346 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/38 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 346 questions for subject: moral_disputes


moral_scenarios/test-00000-of-00001.parq(…):   0%|          | 0.00/89.8k [00:00<?, ?B/s]

moral_scenarios/validation-00000-of-0000(…):   0%|          | 0.00/14.9k [00:00<?, ?B/s]

moral_scenarios/dev-00000-of-00001.parqu(…):   0%|          | 0.00/5.14k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/895 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 895 questions for subject: moral_scenarios


nutrition/test-00000-of-00001.parquet:   0%|          | 0.00/55.0k [00:00<?, ?B/s]

nutrition/validation-00000-of-00001.parq(…):   0%|          | 0.00/9.02k [00:00<?, ?B/s]

nutrition/dev-00000-of-00001.parquet:   0%|          | 0.00/4.99k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/306 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/33 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 306 questions for subject: nutrition
Loaded 311 questions for subject: philosophy


prehistory/test-00000-of-00001.parquet:   0%|          | 0.00/54.3k [00:00<?, ?B/s]

prehistory/validation-00000-of-00001.par(…):   0%|          | 0.00/9.89k [00:00<?, ?B/s]

prehistory/dev-00000-of-00001.parquet:   0%|          | 0.00/4.62k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/35 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 324 questions for subject: prehistory


professional_accounting/test-00000-of-00(…):   0%|          | 0.00/69.5k [00:00<?, ?B/s]

professional_accounting/validation-00000(…):   0%|          | 0.00/12.9k [00:00<?, ?B/s]

professional_accounting/dev-00000-of-000(…):   0%|          | 0.00/4.89k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/282 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/31 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 282 questions for subject: professional_accounting


professional_law/test-00000-of-00001.par(…):   0%|          | 0.00/1.04M [00:00<?, ?B/s]

professional_law/validation-00000-of-000(…):   0%|          | 0.00/116k [00:00<?, ?B/s]

professional_law/dev-00000-of-00001.parq(…):   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1534 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/170 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 1534 questions for subject: professional_law
Loaded 272 questions for subject: professional_medicine


professional_psychology/test-00000-of-00(…):   0%|          | 0.00/133k [00:00<?, ?B/s]

professional_psychology/validation-00000(…):   0%|          | 0.00/22.1k [00:00<?, ?B/s]

professional_psychology/dev-00000-of-000(…):   0%|          | 0.00/4.69k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/612 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/69 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 612 questions for subject: professional_psychology


public_relations/test-00000-of-00001.par(…):   0%|          | 0.00/20.6k [00:00<?, ?B/s]

public_relations/validation-00000-of-000(…):   0%|          | 0.00/6.45k [00:00<?, ?B/s]

public_relations/dev-00000-of-00001.parq(…):   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/110 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 110 questions for subject: public_relations


security_studies/test-00000-of-00001.par(…):   0%|          | 0.00/114k [00:00<?, ?B/s]

security_studies/validation-00000-of-000(…):   0%|          | 0.00/18.7k [00:00<?, ?B/s]

security_studies/dev-00000-of-00001.parq(…):   0%|          | 0.00/7.49k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/27 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 245 questions for subject: security_studies


sociology/test-00000-of-00001.parquet:   0%|          | 0.00/43.9k [00:00<?, ?B/s]

sociology/validation-00000-of-00001.parq(…):   0%|          | 0.00/8.36k [00:00<?, ?B/s]

sociology/dev-00000-of-00001.parquet:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/201 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 201 questions for subject: sociology


us_foreign_policy/test-00000-of-00001.pa(…):   0%|          | 0.00/19.5k [00:00<?, ?B/s]

us_foreign_policy/validation-00000-of-00(…):   0%|          | 0.00/5.27k [00:00<?, ?B/s]

us_foreign_policy/dev-00000-of-00001.par(…):   0%|          | 0.00/4.22k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 100 questions for subject: us_foreign_policy


virology/test-00000-of-00001.parquet:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

virology/validation-00000-of-00001.parqu(…):   0%|          | 0.00/7.05k [00:00<?, ?B/s]

virology/dev-00000-of-00001.parquet:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/166 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/18 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 166 questions for subject: virology


world_religions/test-00000-of-00001.parq(…):   0%|          | 0.00/18.9k [00:00<?, ?B/s]

world_religions/validation-00000-of-0000(…):   0%|          | 0.00/4.94k [00:00<?, ?B/s]

world_religions/dev-00000-of-00001.parqu(…):   0%|          | 0.00/3.30k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/171 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Loaded 171 questions for subject: world_religions

Successfully loaded 37 subjects out of 37 selected subjects


In [None]:
def format_mmlu_prompt(sample, subject_name):
    """
    Render one MMLU record as a zero-shot multiple-choice prompt.

    Args:
        sample: Mapping with a 'question' entry and a 'choices' sequence.
        subject_name: Subject identifier; underscores are shown as spaces.

    Returns:
        The prompt text: an instruction line, the question, one lettered line
        per choice (letters taken from CHOICES), and a trailing "Answer:".
    """
    readable_subject = " ".join(subject_name.split("_"))
    question_text = sample['question']

    # One "<letter>. <text>" line per choice, each terminated by a newline.
    option_lines = "".join(
        f"{CHOICES[position]}. {text}\n"
        for position, text in enumerate(sample['choices'])
    )

    instruction = (
        f"The following is a multiple-choice question about {readable_subject}. "
        "Please choose the single most likely answer."
    )
    return f"{instruction}\n\nQuestion: {question_text}\n{option_lines}\nAnswer:"

def _find_choice_logprob(choice_letter, logprob_dict):
    """Return the logprob of the first returned token that spells ``choice_letter``, or None."""
    # Exact spellings first: " A", "A", "A.", " A."
    for candidate in (
        f" {choice_letter}",
        choice_letter,
        f"{choice_letter}.",
        f" {choice_letter}.",
    ):
        logprob = logprob_dict.get(candidate, None)
        if logprob is not None:
            return logprob

    # Then compare only the alphabetic characters of each token:
    # first case-sensitively, then case-insensitively.
    for ignore_case in (False, True):
        for token, lp in logprob_dict.items():
            letters = ''.join(c for c in token if c.isalpha())
            if ignore_case:
                matched = letters.upper() == choice_letter.upper()
            else:
                matched = letters == choice_letter
            if matched:
                return lp
    return None


def get_choice_probabilities(prompt, model_id, client, num_choices=None):
    """
    Estimate the model's probability distribution over the answer letters.

    The prompt is sent as a single user message, exactly one output token is
    requested together with its top alternatives, and the log-probabilities of
    the answer letters are renormalised with a softmax.

    Args:
        prompt: Prompt text sent as the user message.
        model_id: Model name passed to the API.
        client: OpenAI client exposing ``chat.completions.create``.
        num_choices: Number of leading letters of ``CHOICES`` to score;
            ``None`` scores all of them.

    Returns:
        np.ndarray of probabilities, one per scored letter, summing to 1.
        A uniform distribution is returned when every API attempt fails or
        when none of the letters appears among the returned tokens.
    """
    import time

    # Decide which answer letters are scored
    if num_choices is None:
        num_choices = len(CHOICES)
    actual_choices = CHOICES[:num_choices]

    # The API accepts at most 20 alternatives per position; a larger value makes
    # the request fail, which would silently degrade every answer to uniform.
    max_top_logprobs = 20

    max_retries = 3
    retry_delay = 1  # initial delay in seconds, doubled on every retry

    response = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                logprobs=True,
                top_logprobs=max_top_logprobs,
                max_tokens=1,  # only the first answer token matters
                temperature=0  # deterministic output
            )
            break
        except Exception as e:
            if attempt == max_retries - 1:
                # Last attempt failed as well: fall back to a uniform distribution
                print(f"Error calling OpenAI API after {max_retries} attempts: {e}")
                return np.ones(num_choices) / num_choices
            wait_time = retry_delay * (2 ** attempt)  # exponential backoff
            if "rate limit" in str(e).lower() or "429" in str(e):
                print(f"Rate limit hit, waiting {wait_time}s before retry {attempt + 1}/{max_retries}...")
            time.sleep(wait_time)

    if response is None:
        return np.ones(num_choices) / num_choices

    # Map each candidate token of the single generated position to its logprob
    logprob_dict = {}
    first_choice = response.choices[0]
    if first_choice.logprobs and first_choice.logprobs.content:
        for item in first_choice.logprobs.content[0].top_logprobs:
            logprob_dict[item.token] = item.logprob

    # Look up every answer letter; None marks a letter that was not returned
    raw_logprobs = [
        _find_choice_logprob(choice, logprob_dict) for choice in actual_choices
    ]

    if raw_logprobs and raw_logprobs[0] is None:
        # Debug aid, emitted only when the first letter is missing to avoid flooding the output
        print(f"Warning: Could not find logprob for choice token ' {actual_choices[0]}'")
        print(f"Available tokens (top 10): {list(logprob_dict.keys())[:10]}")

    # Every missing letter gets the same floor value, clearly below the least
    # likely letter that was found. A position-dependent floor would rank the
    # missing letters by their order and, when nothing is found, put almost all
    # of the probability mass on the first letter.
    found_logprobs = [lp for lp in raw_logprobs if lp is not None]
    floor_logprob = min(found_logprobs) - 10.0 if found_logprobs else -100.0
    choice_logits = np.array(
        [floor_logprob if lp is None else lp for lp in raw_logprobs],
        dtype=float,
    )

    # Softmax over the selected letters; subtracting the max keeps exp() stable
    choice_logits_shifted = choice_logits - np.max(choice_logits)
    exp_logits = np.exp(choice_logits_shifted)
    choice_probs = exp_logits / np.sum(exp_logits)

    return choice_probs

In [9]:
def calculate_aps_score(probs, choice_index, tol=1e-9):
    """
    Compute the APS nonconformity score for *one* hypothesised answer.

    S(X, y) = 1 - (sum of every P_j with P_j >= P_y - tol)

    Args:
    - probs (array-like): per-choice probabilities, e.g. [P(A), P(B), P(C), P(D)]
    - choice_index (int): index of the choice being scored (0=A, 1=B, 2=C, ...)
    - tol (float): slack used when comparing probabilities, so that values
      that differ from P_y only by floating-point noise count as ties.

    Returns:
    - 1.0 minus the probability mass of every choice that is at least as
      likely as the scored choice (the scored choice itself included).

    Raises:
    - IndexError: if ``choice_index`` is outside ``probs``.
    """
    probs = np.asarray(probs, dtype=float)

    # Probability of the choice being scored
    prob_y = probs[choice_index]

    # Apply the tolerance in the selection itself. Filtering with a strict
    # `>=` first (as was done before) makes any later tolerance check a no-op.
    at_least_as_likely = probs >= prob_y - tol

    # APS score: one minus the selected probability mass
    return 1.0 - probs[at_least_as_likely].sum()

# --- Quick sanity check of the scoring function ---
test_probs = np.array([0.42, 0.40, 0.10, 0.05, 0.02, 0.01])
# Choice A: S(A) = 1 - P(A)                   = 1 - 0.42 = 0.58
# Choice B: S(B) = 1 - (P(A) + P(B))          = 1 - 0.82 = 0.18
# Choice C: S(C) = 1 - (P(A) + ... + P(C))    = 1 - 0.92 = 0.08
# Choice D: S(D) = 1 - (P(A) + ... + P(D))    = 1 - 0.97 = 0.03
# Choice E: S(E) = 1 - (P(A) + ... + P(E))    = 1 - 0.99 = 0.01
# Choice F: S(F) = 1 - (sum of all six)       = 1 - 1.00 = 0.00

print(f"Test S(A): {calculate_aps_score(test_probs, 0)}") # expected ~0.58
print(f"Test S(B): {calculate_aps_score(test_probs, 1)}") # expected ~0.18
print(f"Test S(C): {calculate_aps_score(test_probs, 2)}") # expected ~0.08
print(f"Test S(D): {calculate_aps_score(test_probs, 3)}") # expected ~0.03
print(f"Test S(E): {calculate_aps_score(test_probs, 4)}") # expected ~0.01
print(f"Test S(F): {calculate_aps_score(test_probs, 5)}") # expected ~0.0

# Fail loudly if the scoring function ever drifts from the hand-computed values
np.testing.assert_allclose(
    [calculate_aps_score(test_probs, k) for k in range(len(test_probs))],
    [0.58, 0.18, 0.08, 0.03, 0.01, 0.0],
    atol=1e-9,
)


Test S(A): 0.5800000000000001
Test S(B): 0.17999999999999994
Test S(C): 0.07999999999999996
Test S(D): 0.029999999999999916
Test S(E): 0.009999999999999898
Test S(F): 0.0


In [10]:
results_list = []  # one record per (question, choice) pair, across all subjects

# Walk through every MMLU subject held in mmlu_data
for subject_name, dataset in mmlu_data.items():
    print(f"\nProcessing subject: {subject_name}...")

    # Visit each question of the current subject (tqdm shows progress)
    for i, sample in enumerate(tqdm(dataset)):

        # Build the prompt text for this question
        prompt = format_mmlu_prompt(sample, subject_name)

        # The number of choices is read per question rather than assumed fixed
        num_choices = len(sample['choices'])

        # Ask the model for the distribution [P(A), P(B), P(C), ...].
        # A failing question is reported and skipped instead of aborting the run.
        try:
            probabilities = get_choice_probabilities(prompt, MODEL_ID, client, num_choices=num_choices)
        except Exception as e:
            print(f"Error processing question {i}: {e}")
            continue

        # Index of the reference answer; compared against each choice index below
        ground_truth_label = sample['answer']

        # Score *every* choice, not only the reference answer
        for j in range(num_choices):
            aps_score = calculate_aps_score(probabilities, j)

            # Flat record: one row of the final table per choice
            row = dict(
                question_id=f"{subject_name}_{i}",
                subject=subject_name,
                question=sample['question'],
                choice_str=CHOICES[j],                      # letter label of this choice
                choice_index=j,
                choice_text=sample['choices'][j],
                probability=probabilities[j],               # model probability for this choice
                aps_score=aps_score,                        # APS nonconformity score of this choice
                is_ground_truth=(j == ground_truth_label),  # bool flag
            )
            results_list.append(row)

print("\nAll processing complete.")


Processing subject: high_school_computer_science...


 17%|█▋        | 17/100 [00:09<00:25,  3.26it/s]

Available tokens (top 10): ['C', 'D', ' C', ' D', 'B', 'Option', '(C', '\n', '(D', 'The']


 58%|█████▊    | 58/100 [00:26<00:14,  2.85it/s]

Available tokens (top 10): ['D', ' D', '\n', 'Program', '\n\n', ' ', 'Option', '(D', '\u200b', '.']


 61%|██████    | 61/100 [00:27<00:11,  3.43it/s]

Available tokens (top 10): ['D', ' D', 'Option', 'C', 'The', 'B', ' ', '\n\n', '4', 'Max']


100%|██████████| 100/100 [00:40<00:00,  2.47it/s]



Processing subject: high_school_european_history...


 82%|████████▏ | 136/165 [00:47<00:08,  3.55it/s]

Available tokens (top 10): ['D', 'C', ' D', 'Ne', ' C', ' Ne', ' ', '\n', '.', 'The']


100%|██████████| 165/165 [00:57<00:00,  2.88it/s]



Processing subject: high_school_geography...


 25%|██▌       | 50/198 [00:15<00:40,  3.61it/s]

Available tokens (top 10): ['B', ' B', 'C', 'Option', '\n', 'The', 'D', 'b', ' \n', ' ']


100%|██████████| 198/198 [01:00<00:00,  3.25it/s]



Processing subject: high_school_government_and_politics...


 12%|█▏        | 23/193 [00:07<00:49,  3.45it/s]

Available tokens (top 10): ['D', ' D', 'Brown', '\n', ' ', 'The', '**', '\n\n', '(D', 'Option']


 56%|█████▋    | 109/193 [00:34<00:27,  3.11it/s]

Available tokens (top 10): ['D', ' D', 'The', ' ', '\n', 'the', '\n\n', '.', 'B', '(D']


 80%|████████  | 155/193 [00:51<00:13,  2.90it/s]

Available tokens (top 10): ['D', ' D', ' ', '\n', 'The', '\n\n', '**', '(D', 'Four', '  ']


 95%|█████████▌| 184/193 [01:04<00:07,  1.23it/s]

Available tokens (top 10): ['D', ' D', ' ', '(D', '\n', 'The', 'John', '\n\n', '.', 'B']


100%|██████████| 193/193 [01:07<00:00,  2.87it/s]



Processing subject: high_school_macroeconomics...


 65%|██████▌   | 254/390 [01:23<00:37,  3.66it/s]

Available tokens (top 10): ['C', 'D', ' C', ' D', '\n', 'are', '\n\n', ' ', 'Option', '\u200b']


100%|██████████| 390/390 [02:08<00:00,  3.03it/s]



Processing subject: high_school_mathematics...


100%|██████████| 270/270 [01:34<00:00,  2.86it/s]



Processing subject: high_school_microeconomics...


 42%|████▏     | 101/238 [00:29<00:43,  3.16it/s]

Available tokens (top 10): ['D', ' D', 'Op', 'Option', ' ', 'op', '\n', '\n\n', 'C', ' Opportunity']


100%|██████████| 238/238 [01:13<00:00,  3.25it/s]



Processing subject: high_school_physics...


100%|██████████| 151/151 [00:46<00:00,  3.25it/s]



Processing subject: high_school_psychology...


 10%|█         | 55/545 [00:16<02:35,  3.16it/s]

Available tokens (top 10): ['C', ' C', 'B', ' B', '\n', 'c', ' ', 'D', '\n\n', 'Post']


 15%|█▌        | 83/545 [00:24<02:39,  2.89it/s]

Available tokens (top 10): ['D', ' D', '\n', 'Human', ' ', '\n\n', 'C', '.', 'The', '(D']


 16%|█▌        | 85/545 [00:25<02:24,  3.18it/s]

Available tokens (top 10): ['D', ' D', 'Social', 'social', '\n', '\n\n', ' ', '(D', 'C', ' \n']


 27%|██▋       | 149/545 [00:44<01:55,  3.43it/s]

Available tokens (top 10): ['C', ' C', 'Syntax', 'B', '\n', '\n\n', 'D', ' ', 'c', 'syntax']


 29%|██▉       | 158/545 [00:47<01:53,  3.40it/s]

Available tokens (top 10): ['D', ' D', '\n', 'hyp', ' ', '\n\n', 'H', 'The', '(D', '**']


 41%|████      | 221/545 [01:06<01:28,  3.66it/s]

Available tokens (top 10): ['D', ' D', '\n', ' ', '\n\n', '(D', '.', 'The', ' \n', 'C']


 50%|█████     | 273/545 [01:24<01:42,  2.65it/s]

Available tokens (top 10): ['D', ' D', '\n', 'Meta', '\n\n', ' ', ' Meta', '.', ' \n', '  ']


 52%|█████▏    | 284/545 [01:28<01:23,  3.13it/s]

Available tokens (top 10): ['D', ' D', 'In', 'C', '\n', '\n\n', '(D', 'in', ' ', 'B']


 63%|██████▎   | 344/545 [01:47<01:53,  1.77it/s]

Available tokens (top 10): ['D', ' D', 'Con', 'C', '\n', 'con', ' ', '\n\n', '(D', 'B']


 72%|███████▏  | 390/545 [02:01<00:58,  2.63it/s]

Available tokens (top 10): ['D', 'C', ' D', ' C', 'Comp', '\n', '\n\n', 'c', 'comp', ' ']


 76%|███████▌  | 413/545 [02:08<00:36,  3.60it/s]

Available tokens (top 10): ['D', ' D', 'Group', 'C', '\n', 'B', '\n\n', 'group', ' ', ' Group']


 93%|█████████▎| 508/545 [02:40<00:10,  3.59it/s]

Available tokens (top 10): ['D', ' D', 'Sleep', '\n', '\n\n', ' ', '**', '\u200b', 'Option', 'The']


 97%|█████████▋| 531/545 [02:47<00:04,  2.90it/s]

Available tokens (top 10): ['D', ' D', ' ', '\n', 'S', '(D', '\n\n', 's', 'Le', '.']


 99%|█████████▉| 542/545 [02:50<00:00,  3.54it/s]

Available tokens (top 10): ['D', ' D', '\n\n', ' ', '\n', 'C', 'Ins', 'ins', 'The', '(D']


100%|██████████| 545/545 [02:51<00:00,  3.18it/s]



Processing subject: high_school_statistics...


  2%|▏         | 5/216 [00:42<37:55, 10.78s/it]

Available tokens (top 10): ['D', ' D', 'B', '\n', 'Option', '\n\n', '(D', ' ', 'Answer', 'An']


100%|██████████| 216/216 [01:48<00:00,  1.98it/s]



Processing subject: high_school_us_history...


 13%|█▎        | 27/204 [00:07<00:48,  3.63it/s]

Available tokens (top 10): ['D', ' D', 'Impro', 'Im', '\n', '.', 'B', 'Option', '**', '(D']


100%|██████████| 204/204 [01:04<00:00,  3.18it/s]



Processing subject: high_school_world_history...


 84%|████████▍ | 200/237 [01:04<00:10,  3.57it/s]

Available tokens (top 10): ['D', ' D', 'The', '\n', '(D', ' ', ' The', '\n\n', 'B', '**']


100%|██████████| 237/237 [01:15<00:00,  3.14it/s]



Processing subject: human_aging...


100%|██████████| 223/223 [01:18<00:00,  2.85it/s]



Processing subject: human_sexuality...


100%|██████████| 131/131 [00:39<00:00,  3.35it/s]



Processing subject: international_law...


100%|██████████| 121/121 [00:38<00:00,  3.11it/s]



Processing subject: jurisprudence...


100%|██████████| 108/108 [00:41<00:00,  2.61it/s]



Processing subject: logical_fallacies...


 33%|███▎      | 53/163 [00:20<00:29,  3.78it/s]

Available tokens (top 10): ['D', ' D', 'False', '\n', ' False', ' ', '\n\n', 'false', 'B', ' \n']


 55%|█████▍    | 89/163 [00:33<00:29,  2.48it/s]

Available tokens (top 10): ['C', 'D', ' C', ' D', '\n', '\n\n', 'ir', 'Ir', 'I', ' ']


 87%|████████▋ | 141/163 [00:49<00:07,  2.95it/s]

Available tokens (top 10): ['D', ' D', 'Equ', '\n', '\n\n', ' Equ', ' ', 'E', ' \n', 'The']


100%|██████████| 163/163 [00:58<00:00,  2.80it/s]



Processing subject: machine_learning...


100%|██████████| 112/112 [00:36<00:00,  3.04it/s]



Processing subject: management...


 89%|████████▉ | 92/103 [00:30<00:04,  2.37it/s]

Available tokens (top 10): ['D', 'C', ' D', ' C', '\n', '\n\n', ' ', 'Option', 'c', 'B']


100%|██████████| 103/103 [00:36<00:00,  2.84it/s]



Processing subject: marketing...


 38%|███▊      | 89/234 [00:29<00:49,  2.92it/s]

Available tokens (top 10): ['D', ' D', 'Trademark', '\n', 'C', 'Trad', ' ', '\n\n', ' Trad', 'Option']


 47%|████▋     | 109/234 [00:35<00:33,  3.75it/s]

Available tokens (top 10): ['D', ' D', 'Group', '\n', 'C', ' ', '\n\n', 'Psych', 'The', '.']


 57%|█████▋    | 133/234 [00:42<00:31,  3.18it/s]

Available tokens (top 10): ['D', ' D', 'Prom', '\n', '\n\n', ' ', ' \n', 'B', ' Promotion', '.']


100%|██████████| 234/234 [01:13<00:00,  3.17it/s]



Processing subject: medical_genetics...


100%|██████████| 100/100 [00:33<00:00,  2.98it/s]



Processing subject: miscellaneous...


 12%|█▏        | 92/783 [00:30<06:50,  1.68it/s]

Available tokens (top 10): ['D', ' D', 'C', 'Sher', ' C', ' ', '\n', ' Sher', '\n\n', 'Option']


 15%|█▌        | 119/783 [00:39<03:46,  2.93it/s]

Available tokens (top 10): ['D', ' D', 'Har', 'The', '\n', ' ', '(D', 'C', '\n\n', ' Harlem']


 19%|█▊        | 145/783 [00:50<04:42,  2.26it/s]

Available tokens (top 10): ['D', ' D', 'B', 'institution', 'In', ' ', '\n', '\n\n', 'C', ' B']


 48%|████▊     | 377/783 [02:15<02:32,  2.65it/s]

Available tokens (top 10): ['C', ' C', 'Cr', '\n', 'c', '\n\n', 'cr', 'D', 'B', 'Option']


 50%|████▉     | 390/783 [02:19<03:23,  1.93it/s]

Available tokens (top 10): ['D', ' D', 'B', ' ', 'On', 'Br', 'br', '\n', 'Option', '\n\n']


 77%|███████▋  | 605/783 [03:31<00:47,  3.76it/s]

Available tokens (top 10): ['D', ' D', 'San', ' ', ' San', '\n', '\n\n', 'C', '(D', '.']


 87%|████████▋ | 680/783 [03:53<00:31,  3.25it/s]

Available tokens (top 10): ['D', ' D', 'Internet', ' ', '\n', 'Option', '(D', ' Internet', '\n\n', ' \n']


100%|██████████| 783/783 [04:25<00:00,  2.94it/s]



Processing subject: moral_disputes...


100%|██████████| 346/346 [02:04<00:00,  2.78it/s]



Processing subject: moral_scenarios...


100%|██████████| 895/895 [05:08<00:00,  2.90it/s]



Processing subject: nutrition...


 51%|█████     | 156/306 [00:46<00:38,  3.93it/s]

Available tokens (top 10): ['D', ' D', 'C', 'Pro', '\n', ' ', '\n\n', ' Prote', ' C', 'B']


100%|██████████| 306/306 [01:34<00:00,  3.24it/s]



Processing subject: philosophy...


100%|██████████| 311/311 [01:50<00:00,  2.82it/s]



Processing subject: prehistory...


 67%|██████▋   | 216/324 [01:13<00:30,  3.58it/s]

Available tokens (top 10): ['D', ' D', 'C', '\n', ' C', ' ', '\n\n', 'Option', 'It', 'B']


100%|██████████| 324/324 [01:46<00:00,  3.03it/s]



Processing subject: professional_accounting...


100%|██████████| 282/282 [01:38<00:00,  2.85it/s]



Processing subject: professional_law...


100%|██████████| 1534/1534 [10:00<00:00,  2.55it/s]  



Processing subject: professional_medicine...


100%|██████████| 272/272 [01:27<00:00,  3.11it/s]



Processing subject: professional_psychology...


  2%|▏         | 13/612 [00:03<02:40,  3.74it/s]

Available tokens (top 10): ['D', ' D', 'C', ' C', '\n', ' ', 'The', '\n\n', '(D', 'Option']


 84%|████████▍ | 517/612 [02:47<00:27,  3.46it/s]

Available tokens (top 10): ['C', ' C', 'B', '\n', 'D', ' B', '\n\n', 'c', ' ', 'Option']


100%|██████████| 612/612 [03:22<00:00,  3.02it/s]



Processing subject: public_relations...


100%|██████████| 110/110 [00:33<00:00,  3.30it/s]



Processing subject: security_studies...


 50%|████▉     | 122/245 [00:41<00:32,  3.84it/s]

Available tokens (top 10): ['C', 'D', ' C', ' D', '\n', 'B', '\n\n', 'c', ' ', 'Option']


 65%|██████▍   | 159/245 [00:56<00:25,  3.37it/s]

Available tokens (top 10): ['D', ' D', 'B', ' B', 'Option', 'C', '\n', ' ', '\n\n', ' Option']


100%|██████████| 245/245 [01:24<00:00,  2.90it/s]



Processing subject: sociology...


 43%|████▎     | 87/201 [00:25<00:30,  3.80it/s]

Available tokens (top 10): ['D', ' D', '\n', '(D', 'B', '\n\n', ' ', 'Social', 'Option', '\u200b']


 44%|████▍     | 89/201 [00:26<00:29,  3.76it/s]

Available tokens (top 10): ['C', ' C', 'D', ' D', 'B', ' ', '\n', 'c', '(C', '\n\n']


 88%|████████▊ | 176/201 [00:51<00:06,  3.70it/s]

Available tokens (top 10): ['D', ' D', 'B', '\n', 'C', '\n\n', 'Option', ' ', '(D', ' B']


100%|██████████| 201/201 [00:59<00:00,  3.41it/s]



Processing subject: us_foreign_policy...


  6%|▌         | 6/100 [00:02<00:47,  1.98it/s]

Available tokens (top 10): ['D', 'B', ' D', ' B', 'C', ' ', 'Option', ' C', '\n', '(D']


100%|██████████| 100/100 [00:32<00:00,  3.09it/s]



Processing subject: virology...


 78%|███████▊  | 129/166 [00:40<00:16,  2.18it/s]

Available tokens (top 10): ['D', ' D', 'C', ' ', '\n', 'Direct', 'The', '\n\n', '(D', '.']


100%|██████████| 166/166 [00:52<00:00,  3.19it/s]



Processing subject: world_religions...


 28%|██▊       | 48/171 [00:15<00:33,  3.70it/s]

Available tokens (top 10): ['D', ' D', 'B', ' ', 'The', '\n', 'Ant', '\n\n', '(D', 'C']


 81%|████████▏ | 139/171 [00:48<00:13,  2.40it/s]

Available tokens (top 10): ['D', ' D', 'C', ' ', '\n', '\n\n', '(D', 'The', ' \n', 'Dao']


100%|██████████| 171/171 [00:58<00:00,  2.93it/s]


All processing complete.





In [11]:
# Convert the accumulated records into a Pandas DataFrame
df_scores = pd.DataFrame(results_list)

# Persist as Parquet (more efficient than CSV), working around the
# PyArrow "filesystem already registered" error seen in this environment.
import os
import pyarrow as pa
import pyarrow.parquet as pq

# Use an absolute path so the final message shows exactly where the file went
output_filename = os.path.abspath("mmlu_with_aps_scores_p2.parquet")

try:
    # Attempt 1: write through the low-level PyArrow API
    table = pa.Table.from_pandas(df_scores, preserve_index=False)
    pq.write_table(table, output_filename)
except Exception as e:
    # Only the known registry error triggers the fallbacks; anything else is re-raised
    if "ArrowKeyError" in type(e).__name__ or "already registered" in str(e):
        try:
            # Attempt 2: pandas writer with the pyarrow engine
            df_scores.to_parquet(output_filename, index=False, engine='pyarrow')
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate
            csv_warning = None
            try:
                # Attempt 3: pandas writer with the fastparquet engine
                df_scores.to_parquet(output_filename, index=False, engine='fastparquet')
            except ImportError:
                csv_warning = "Warning: PyArrow filesystem registry error. Saving as CSV instead: "
            except Exception as e2:
                csv_warning = f"Warning: Could not save as Parquet ({e2}). Saving as CSV instead: "

            if csv_warning is not None:
                # Attempt 4: CSV as the last resort. Swap only the file extension;
                # str.replace would also rewrite '.parquet' inside a directory name.
                csv_filename = os.path.splitext(output_filename)[0] + '.csv'
                print(f"{csv_warning}{csv_filename}")
                df_scores.to_csv(csv_filename, index=False)
                output_filename = csv_filename
    else:
        raise

# Number of rows (i.e. choices) recorded for each question_id
questions_per_row = df_scores.groupby('question_id').size()
avg_choices = questions_per_row.mean()
print(f"Successfully processed {len(df_scores)} rows ({len(questions_per_row)} questions).")
print(f"Average number of choices per question: {avg_choices:.1f}")
print(f"Data saved to {output_filename}")

# --- Eyeball the resulting data ---
print("\n--- DataFrame Head ---")
print(df_scores.head(8))

print("\n--- Example: Scores for one question ---")
print(df_scores[df_scores['question_id'] == 'philosophy_0'][
    ['question_id', 'choice_str', 'probability', 'aps_score', 'is_ground_truth']
])

Successfully processed 43440 rows (10860 questions).
Average number of choices per question: 4.0
Data saved to /egr/research-hintlab/liuxin73/projects/conformal-factual-lm/ACI/MMLU/mmlu_with_aps_scores_p2.csv

--- DataFrame Head ---
                      question_id                       subject  \
0  high_school_computer_science_0  high_school_computer_science   
1  high_school_computer_science_0  high_school_computer_science   
2  high_school_computer_science_0  high_school_computer_science   
3  high_school_computer_science_0  high_school_computer_science   
4  high_school_computer_science_1  high_school_computer_science   
5  high_school_computer_science_1  high_school_computer_science   
6  high_school_computer_science_1  high_school_computer_science   
7  high_school_computer_science_1  high_school_computer_science   

                                            question choice_str  choice_index  \
0             Let x = 1. What is x << 3 in Python 3?          A             0   
1