In [None]:
import json
import math
import os

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------------------------------
# Configuration: model identifier plus input / output file locations
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
input_file = "data/hendrycks_math_train.json"
acc_file = "output/deepseek-r1-1.5b-generated-predictions-detailed-results.jsonl"
output_file = "z_score/deepseek-r1-1.5b_ppl_conf_acc_z_scores_results.json"

# Reference statistics for this model: mean / standard deviation of the
# perplexity and confidence values, used to convert raw scores to z-scores.
model_stats = dict(
    ppl_mean=9.795982360839844,
    ppl_std=22.284496307373047,
    conf_mean=0.6799513101577759,
    conf_std=0.08082679659128189,
)

# -------------------------------
# Load the tokenizer and the causal LM, then put the model in eval mode
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
model.eval()

# -------------------------------
# Load the data: the problems (a single JSON document) and the per-sample
# accuracy records (JSON Lines, one object per line).
# JSON is UTF-8 by specification, so the encoding is pinned rather than
# inherited from the platform's locale default.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

with open(acc_file, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a stray trailing newline) that json.loads rejects.
    acc_data = [json.loads(line) for line in f if line.strip()]

# Samples and accuracy records are paired by position further down, so the two
# lists must have the same length.  An explicit raise is used because `assert`
# statements are stripped when Python runs with -O.
if len(data) != len(acc_data):
    raise ValueError("样本数量不匹配")

# -------------------------------
# 计算 PPL 和 Confidence
def compute_ppl_and_conf(text):
    """Score ``text`` with the module-level ``model`` / ``tokenizer``.

    Args:
        text: The string to tokenize and feed through the model.

    Returns:
        A ``(ppl, conf)`` tuple of Python floats.  ``ppl`` is ``exp`` of the
        loss the model reports when the input ids are also passed as labels.
        ``conf`` is the mean, over all positions except the first and the
        last, of the largest softmax probability at each position; it is NaN
        when no positions remain after that trimming.
    """
    # Send the inputs to the device the model actually lives on.  The model is
    # loaded with device_map="auto", so a hard-coded "mps" fails on any host
    # where the weights end up on CUDA or CPU.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        ppl = math.exp(outputs.loss.item())

        # Largest probability in the vocabulary distribution at each position.
        max_probs = torch.softmax(outputs.logits, dim=-1).max(dim=-1).values
        # Drop the first and last positions (intended to exclude BOS & EOS)
        # before averaging over the single sequence in the batch.
        conf = max_probs[0, 1:-1].mean().item()

    return ppl, conf

# -------------------------------
# Batch processing: score every sample and pair it with its accuracy flag
results = []
progress = tqdm(zip(data, acc_data), total=len(data))
for sample, acc_record in progress:
    input_text = sample["input"]
    # Binary correctness flag: 1 when the recorded accuracy is at least 99.9
    # (presumably a 0-100 percentage -- confirm against the accuracy file).
    acc = int(acc_record.get("accuracy", 0.0) >= 99.9)

    try:
        ppl, conf = compute_ppl_and_conf(input_text)
        z_ppl = (ppl - model_stats["ppl_mean"]) / model_stats["ppl_std"]
        z_conf = (conf - model_stats["conf_mean"]) / model_stats["conf_std"]
    except Exception as exc:
        # Best effort: keep the slot for this sample and record what went wrong.
        record = {"error": str(exc), "acc": acc}
    else:
        record = {"z_ppl": z_ppl, "z_conf": z_conf, "acc": acc}
    results.append(record)

# -------------------------------
# Save the results
# Make sure the target directory exists first; open(..., "w") does not create
# missing parent directories.
output_dir = os.path.dirname(output_file)
if output_dir:  # dirname is "" for a bare filename, which os.makedirs rejects
    os.makedirs(output_dir, exist_ok=True)
with open(output_file, "w") as f:
    json.dump(results, f, indent=2)

print(f"✅ 完成，已保存到: {output_file}")


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Batch-compute PPL / confidence / z-score / acc for multiple models.
"""

import json, math, os, gc, sys, logging
from typing import Dict, List
import torch, torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

# ────────────────────────────────────────────────────────────────────────────────
# 0. Environment / logging
# ────────────────────────────────────────────────────────────────────────────────
# Records at INFO and above go to stdout; the level name is wrapped in ANSI
# escape codes (bold blue) so it stands out in a terminal.
_LOG_FORMAT = "[\033[1;34m%(levelname)s\033[0m] %(message)s"
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format=_LOG_FORMAT,
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Pick the compute backend in order of preference: CUDA, Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

# Half precision is requested only on CUDA.  On every other backend the value
# is None, which is passed through unchanged as the loader's torch_dtype.
# (A bfloat16 fallback guarded by torch.cuda.is_available() could never fire
# here, because this branch is only reached when CUDA is unavailable.)
DTYPE16 = torch.float16 if DEVICE == "cuda" else None

# ────────────────────────────────────────────────────────────────────────────────
# 1. Statistics & model/data configuration
# ────────────────────────────────────────────────────────────────────────────────
# Per-model reference statistics (mean / standard deviation of perplexity and
# confidence), keyed by model identifier; used to compute z-scores.
MODEL_STATS: Dict[str, Dict[str, float]] = {
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": dict(
        ppl_mean=9.795982360839844,
        ppl_std=22.284496307373047,
        conf_mean=0.6799513101577759,
        conf_std=0.08082679659128189,
    ),
    "Qwen/Qwen3-4B": dict(
        ppl_mean=6.160105228424072,
        ppl_std=6.118084907531738,
        conf_mean=0.8231604099273682,
        conf_std=0.07646501809358597,
    ),
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": dict(
        ppl_mean=16.57339096069336,
        ppl_std=50.37682342529297,
        conf_mean=0.6976740956306458,
        conf_std=0.10360505431890488,
    ),
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": dict(
        ppl_mean=8.22177505493164,
        ppl_std=14.440741539001465,
        conf_mean=0.7438507676124573,
        conf_std=0.0863514393568039,
    ),
}

data_name = "aime24"

# Data to process (one problem per entry), plus per-model prediction details
# and output paths.
DATASET = f"data/{data_name}.json"

# Short file-name tag for each model identifier; insertion order is the
# processing order.
_MODEL_TAGS = {
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": "deepseek-r1-1.5b",
    "Qwen/Qwen3-4B": "qwen3-4b",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B": "deepseek-r1-7b",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B": "deepseek-r1-14b",
}

# For every model: where its accuracy records are read from ("acc_file") and
# where its z-score results are written to ("out_file").
MODEL_SPECS: Dict[str, Dict[str, str]] = {
    name: {
        "acc_file": f"output/{data_name}/{tag}-generated-predictions-detailed-results.jsonl",
        "out_file": f"z_score/{data_name}/{tag}_ppl_conf_acc_z_scores_results.json",
    }
    for name, tag in _MODEL_TAGS.items()
}

# ────────────────────────────────────────────────────────────────────────────────
# 2. 公用函数
# ────────────────────────────────────────────────────────────────────────────────
def compute_ppl_conf(model, tokenizer, text: str) -> Dict[str, float]:
    """Compute perplexity and mean max-probability confidence for ``text``.

    Args:
        model: Callable causal LM; called with the tokenizer output plus
            ``labels`` and expected to return an object with ``loss`` and
            ``logits``.  Must expose a ``device`` attribute.
        tokenizer: Callable tokenizer exposing ``bos_token_id`` and
            ``eos_token_id``.
        text: The string to score.

    Returns:
        ``{"ppl": ..., "conf": ...}``.  ``ppl`` is ``exp`` of the model's
        reported loss.  ``conf`` is the mean of the per-position maximum
        softmax probability after trimming the first position when the
        tokenizer defines a BOS id and the last position when it defines an
        EOS id.  ``conf`` is NaN if no positions remain after trimming.
    """
    # Follow the model's own device: with device_map="auto" the weights are
    # not guaranteed to sit on the globally detected backend.
    enc = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        out = model(**enc, labels=enc["input_ids"])
        ppl = math.exp(out.loss.item())
        # Per-position maximum probability for the single sequence in the
        # batch; resulting shape is (seq_len,).
        maxp = torch.softmax(out.logits, dim=-1).max(dim=-1).values[0]

    # Trim each end independently.  Choosing the slice from the BOS flag alone
    # would silently keep the last position whenever the tokenizer has an EOS
    # id but no BOS id.
    start = 0 if tokenizer.bos_token_id is None else 1
    stop = None if tokenizer.eos_token_id is None else -1
    conf = maxp[start:stop].mean().item()
    return {"ppl": ppl, "conf": conf}

# ────────────────────────────────────────────────────────────────────────────────
# 3. Main loop
# ────────────────────────────────────────────────────────────────────────────────
# JSON is UTF-8 by specification, so the encoding is pinned rather than
# inherited from the platform's locale default.
with open(DATASET, "r", encoding="utf-8") as f:
    dataset = json.load(f)                   # each entry carries an "input" field

logger.info(f"Loaded {len(dataset)} problems from {DATASET}")

for model_name, paths in tqdm(MODEL_SPECS.items(), desc="Models"):
    logger.info(f"\n── Processing [{model_name}] ──")

    # Look up the reference statistics first, so that a model missing from
    # MODEL_STATS fails before the expensive weight load rather than after it.
    stats = MODEL_STATS[model_name]

    # 3.1 Read the per-sample accuracy records (JSON Lines).  The encoding is
    # pinned to UTF-8 and blank lines, which json.loads rejects, are skipped.
    with open(paths["acc_file"], "r", encoding="utf-8") as f:
        acc_lines = [json.loads(x) for x in f if x.strip()]
    # Samples and records are paired by position below, so the counts must
    # agree.  Raised explicitly because `assert` is stripped under -O.
    if len(acc_lines) != len(dataset):
        raise ValueError("ACC file length mismatch")

    # 3.2 Load the model / tokenizer.
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally -- confirm it is actually required here.
    torch_dtype = DTYPE16  # None is passed through unchanged to the loader
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch_dtype,
        trust_remote_code=True,
    ).eval()

    # 3.3 Score every sample and pair it with its binary accuracy flag.
    results = []
    for sample, acc_rec in tqdm(zip(dataset, acc_lines), total=len(dataset), leave=False):
        # 1 when the recorded accuracy is at least 99.9 (presumably a 0-100
        # percentage -- confirm against the accuracy file), else 0.
        acc = 1 if acc_rec.get("accuracy", 0.0) >= 99.9 else 0
        try:
            m = compute_ppl_conf(model, tokenizer, sample["input"])
            z_ppl  = (m["ppl"]  - stats["ppl_mean"])  / stats["ppl_std"]
            z_conf = (m["conf"] - stats["conf_mean"]) / stats["conf_std"]
            results.append({"z_ppl": z_ppl, "z_conf": z_conf, "acc": acc})
        except Exception as e:
            # Best effort: one failing sample is recorded, not fatal.
            results.append({"error": str(e), "acc": acc})

    # 3.4 Save, creating the output directory if needed.
    out_dir = os.path.dirname(paths["out_file"])
    if out_dir:  # dirname is "" for a bare filename, which os.makedirs rejects
        os.makedirs(out_dir, exist_ok=True)
    with open(paths["out_file"], "w") as f:
        json.dump(results, f, indent=2)
    logger.info(f"✅ Saved {len(results)} records to {paths['out_file']}")

    # 3.5 Release accelerator memory before the next model is loaded.
    # Collect garbage first: the cache can only hand back blocks whose tensors
    # have already been destroyed, including those held in reference cycles.
    del model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif DEVICE == "mps" and hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
        torch.mps.empty_cache()

logger.info("🟢 All models finished.")
