In [4]:
# =================== 1. Mount Google Drive ===================
# Colab-only step: mounts the user's Google Drive at /content/drive, which is
# the prefix of every input/output path configured further down in this cell.
from google.colab import drive
drive.mount('/content/drive')

# =================== 2. Import Dependencies ===================
import json
import requests
from tqdm import tqdm

# =================== 3. Configurations ===================
import os
import warnings

# SECURITY: the API key used to be written here as a string literal. A key that
# has ever been saved in notebook source must be treated as leaked: revoke it
# with the provider and issue a new one. The key is now taken from the
# environment (e.g. set os.environ["AIEARTH_API_KEY"] in a private, unsaved
# cell) so it never ends up in the notebook file or its history.
API_KEY = os.environ.get("AIEARTH_API_KEY", "")
if not API_KEY:
    warnings.warn(
        "AIEARTH_API_KEY is not set; API requests will be sent without a "
        "valid key and are expected to fail."
    )

# Chat-completions endpoint root and the model name sent with every request.
BASE_URL = "https://vip-api-global.aiearth.dev/v1"
MODEL = "gpt-4.1"

# Inputs: per-question sampled answers, and the annotation file listing which
# samples of each question are correct vs. wrong.
INFERENCE_PATH = "/content/drive/MyDrive/Cluster-proj/output/deepseek7b-math-401-600_with_steps.json"
ANNOTATION_PATH = "/content/drive/MyDrive/Cluster-proj/output/1_inconsis_cases/inconsis_deepseek7b-math-401-600.json"
# Output: where the per-pair step comparison results are written as JSON.
OUTPUT_PATH = "/content/drive/MyDrive/Cluster-proj/output/identify_error_step/step_comparison_deepseek7b-math-401-600.json"

# =================== 4. Load Files ===================
# Both inputs are JSON; the encoding is pinned to UTF-8 so the result does not
# depend on the platform's default locale encoding.

# Sampled answers. Later code reads inference_data[qid][sample_id]["answer"].
with open(INFERENCE_PATH, "r", encoding="utf-8") as f:
    inference_data = json.load(f)

# Per-question annotations. Later code iterates annotations.items() and reads
# record["correct_sampling"] / record["wrong_sampling"] from each record.
with open(ANNOTATION_PATH, "r", encoding="utf-8") as f:
    annotations = json.load(f)

# =================== 5. API Call Function ===================
def _parse_json_reply(content):
    """Decode the model's reply text into a Python object.

    Strict JSON is tried first. If that fails (for example because the reply
    is wrapped in a Markdown code fence or surrounded by prose), the text
    between the first "{" and the last "}" is parsed instead.

    Raises:
        json.JSONDecodeError: if no parseable JSON object is found.
    """
    text = content.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


def call_gpt_api(correct_steps, wrong_steps, timeout=120):
    """Ask the chat model which step of a wrong solution introduced the error.

    Args:
        correct_steps: Text of the correct reasoning, inserted into the prompt.
        wrong_steps: Text of the incorrect reasoning, inserted into the prompt.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, a stalled connection would block forever.

    Returns:
        The decoded reply: a dict containing "error_step" and "fixed_step".

    Raises:
        requests.exceptions.RequestException: on network failure, timeout, or
            a non-2xx HTTP status.
        ValueError: if the reply is not JSON (json.JSONDecodeError is a
            ValueError) or lacks one of the two expected keys.
        KeyError / IndexError: if the response body does not have the
            choices[0].message.content structure.
    """
    prompt = f"""You are an expert in step-by-step math reasoning analysis.

Below are two sets of reasoning steps to solve a math problem. One is correct and one is incorrect.

Your task:
1. Identify the NUMBER of the step in the WRONG reasoning that caused the final answer to be incorrect.
2. Identify the NUMBER of the corresponding correct step in the CORRECT reasoning that fixed the error.

Only output a JSON in this format:
{{
  \"error_step\": <number>,
  \"fixed_step\": <number>
}}

Correct reasoning:
{correct_steps}

Incorrect reasoning:
{wrong_steps}
"""

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    body = {
        "model": MODEL,
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }

    response = requests.post(
        f"{BASE_URL}/chat/completions",
        headers=headers,
        json=body,
        timeout=timeout,
    )
    # Surface HTTP errors (bad key, rate limit, server error) with their status
    # code instead of failing later with an opaque KeyError on "choices".
    response.raise_for_status()

    content = response.json()["choices"][0]["message"]["content"]
    parsed = _parse_json_reply(content)

    # Validate the shape here so callers never store half of a malformed reply.
    if not isinstance(parsed, dict) or not {"error_step", "fixed_step"} <= parsed.keys():
        raise ValueError(f"Reply is missing 'error_step'/'fixed_step': {content!r}")
    return parsed

# =================== 6. Main Comparison Loop ===================
# For every annotated question, compare each correct sample against each wrong
# sample and record which step the model blames / credits.
output = {}

for qid, record in tqdm(annotations.items()):
    # The two result buckets are aliases into output[qid]; writing to them
    # fills the final structure in place.
    fix_bucket = {}
    error_bucket = {}
    output[qid] = {
        "correct_sampling": fix_bucket,
        "wrong_sampling": error_bucket
    }

    for correct_id in record["correct_sampling"]:
        correct_steps = inference_data[qid][correct_id]["answer"]

        for wrong_id in record["wrong_sampling"]:
            wrong_steps = inference_data[qid][wrong_id]["answer"]

            # Each pair is stored twice, keyed by the opposite sample's id.
            against_correct = f"vs_{correct_id}"
            against_wrong = f"vs_{wrong_id}"

            try:
                result = call_gpt_api(correct_steps, wrong_steps)

                # Error step goes under the wrong sample ...
                wrong_entry = error_bucket.setdefault(wrong_id, {})
                wrong_entry[against_correct] = {"error_step": result["error_step"]}

                # ... and the matching fix step under the correct sample.
                correct_entry = fix_bucket.setdefault(correct_id, {})
                correct_entry[against_wrong] = {"fix_step": result["fixed_step"]}
            except Exception as exc:
                # Best effort: keep going and store the failure message in
                # place of the error step for this pair.
                error_bucket.setdefault(wrong_id, {})[against_correct] = {"error": str(exc)}

# =================== 7. Save Output ===================
import os

# Create the destination folder first: if it were missing, open() would raise
# only after the whole (slow) API loop has finished and the results would be
# lost. `or "."` covers an OUTPUT_PATH with no directory component.
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print("✅ Saved to:", OUTPUT_PATH)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


100%|██████████| 69/69 [10:55<00:00,  9.50s/it]

✅ Saved to: /content/drive/MyDrive/Cluster-proj/output/identify_error_step/step_comparison_deepseek7b-math-401-600.json



