In [5]:
# Reformat the JSON files generated by llm_steps
# ✅ Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import re

# ✅ Path inside Google Drive (change it to the folder that holds your own files)
FOLDER_PATH = "/content/drive/MyDrive/Cluster-proj/output/llm_steps/original/"  # 👈 change this path

# ✅ 提取 final answer（用于 student sampling）
def extract_final_answer(ans_str):
    """Return the text that follows the first "Answer" marker, or None.

    The marker is matched case-insensitively and may be followed by an
    ASCII or full-width colon. The captured text stops at the end of a
    line and is returned with surrounding whitespace removed.

    Args:
        ans_str: Raw answer text of one sampling.

    Returns:
        The captured answer string, or None when no marker is found.
    """
    pattern = r'\bAnswer\s*[:：]?\s*(.*?)\s*$'
    found = re.search(pattern, ans_str.strip(), flags=re.IGNORECASE | re.MULTILINE)
    if found is None:
        return None
    return found.group(1).strip()

# ✅ 提取 true final result（用于 gold answer）
def extract_true_final_result(ans_str):
    """Pull the gold result token that follows a "####" marker.

    Args:
        ans_str: Reference solution text.

    Returns:
        The first run of non-whitespace characters after the first "####"
        (any whitespace in between is skipped), or None when the marker or
        a following token is absent.
    """
    hit = re.search(r'####\s*(\S+)', ans_str)
    if not hit:
        return None
    token = hit.group(1)
    return token.strip()

# ✅ 主处理函数
def process_file(file_path, num_samplings=3):
    """Reformat one llm_steps JSON file and write the result next to it.

    For every question id in the input, the output entry contains:
      * "question", "true_whole_answer" and "true_final_result" when the
        input entry has a "true_ans" key;
      * one "sampling{i}" dict (whole_answer, token_probs,
        step_token_probs, final_result) for each "sampling{i}" key present
        in the input entry, with i in range(num_samplings).

    The output is written to "<name>_reformatted<ext>" in the same
    directory as the input file.

    Args:
        file_path: Path of the JSON file to read.
        num_samplings: How many "sampling{i}" keys (i = 0..num_samplings-1)
            to look for per question. Defaults to 3.

    Raises:
        KeyError: If an entry has "true_ans" but no "question".
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    results = {}

    for qid, content in data.items():
        entry = {}

        # ✅ Gold answer
        if "true_ans" in content:
            entry["question"] = content["question"]
            true_ans = content["true_ans"]
            entry["true_whole_answer"] = true_ans
            entry["true_final_result"] = extract_true_final_result(true_ans)

        # ✅ Each sampling{i}
        for i in range(num_samplings):
            samp_key = f"sampling{i}"
            if samp_key not in content:
                continue

            sample = content[samp_key]
            full_answer = sample.get("answer", "")

            entry[samp_key] = {
                "whole_answer": full_answer,
                "token_probs": sample.get("token_probs", []),
                # Step-level probabilities live beside the sampling, not inside it.
                "step_token_probs": content.get(f"{samp_key}_step_token_probs", {}),
                "final_result": extract_final_answer(full_answer),
            }

        results[qid] = entry

    # 💾 Save to a new file. Only the extension is split off, so a ".json"
    # elsewhere in the path is left alone and the input is never overwritten.
    root, ext = os.path.splitext(file_path)
    out_path = f"{root}_reformatted{ext}"
    # ensure_ascii=False emits raw non-ASCII characters, so write as UTF-8.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"✅ Processed: {file_path} → {out_path}")

# ✅ Batch-process the first three raw JSON files in the folder.
# Files produced by earlier runs (names ending in "_reformatted.json" or
# "_renumbered.json") are skipped, so re-running this cell does not feed
# generated outputs back into process_file.
GENERATED_SUFFIXES = ("_reformatted.json", "_renumbered.json")
all_json_files = [
    os.path.join(FOLDER_PATH, f)
    for f in sorted(os.listdir(FOLDER_PATH))
    if f.endswith(".json") and not f.endswith(GENERATED_SUFFIXES)
]
for file_path in all_json_files[:3]:  # only the first three
    process_file(file_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Processed: /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-201-300_with_steps.json → /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-201-300_with_steps_reformatted.json
✅ Processed: /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-301-400_with_steps.json → /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-301-400_with_steps_reformatted.json
✅ Processed: /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-401-600_with_steps.json → /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-401-600_with_steps_reformatted.json


In [6]:
import json
import os

# ✅ File configuration
BASE_PATH = "/content/drive/MyDrive/Cluster-proj/output/llm_steps/original/"
RANGES = [(201, 300), (301, 400), (401, 600)]

# ✅ Walk through each file range
for start_index, end_index in RANGES:
    range_tag = f"{start_index}-{end_index}"
    input_path = os.path.join(BASE_PATH, f"deepseek7b-math-{range_tag}_with_steps_reformatted.json")

    # Load the reformatted data (explicit UTF-8, independent of the locale)
    with open(input_path, 'r', encoding='utf-8') as f:
        logits_data = json.load(f)

    # Re-key entries as q_<start_index>, q_<start_index + 1>, ... in file order.
    # NOTE(review): nothing checks that the entry count matches the range
    # implied by (start_index, end_index) — confirm the files line up.
    new_logits_data = {
        f'q_{start_index + i}': value
        for i, value in enumerate(logits_data.values())
    }

    # Save the renumbered data; split off only the extension so a ".json"
    # elsewhere in the path is not rewritten.
    root, ext = os.path.splitext(input_path)
    output_path = f"{root}_renumbered{ext}"
    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False matches how the reformatted files were written.
        json.dump(new_logits_data, f, indent=2, ensure_ascii=False)

    print(f"✅ {input_path} 重新编号并保存到 {output_path}")


✅ /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-201-300_with_steps_reformatted.json 重新编号并保存到 /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-201-300_with_steps_reformatted_renumbered.json
✅ /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-301-400_with_steps_reformatted.json 重新编号并保存到 /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-301-400_with_steps_reformatted_renumbered.json
✅ /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-401-600_with_steps_reformatted.json 重新编号并保存到 /content/drive/MyDrive/Cluster-proj/output/llm_steps/original/deepseek7b-math-401-600_with_steps_reformatted_renumbered.json
