In [None]:
import os
import json
import re
import requests
from tqdm import tqdm
from google.colab import drive

# ✅ Mount Google Drive so the project files under /content/drive are reachable
drive.mount('/content/drive')

# ========== ✅ CONFIGURATION ==========
# Question-index range for this run. In this cell it is only used to build
# `range_tag`, which selects the input/output file names below.
start_index = 401
end_index = 600
range_tag = f"{start_index}-{end_index}"

# All inputs and outputs live under this Google Drive project folder.
BASE_PATH = "/content/drive/MyDrive/Cluster-proj/"
# Input: sampled model answers with renumbered reasoning steps.
INFERENCE_PATH = os.path.join(BASE_PATH, f"output/llm_steps/original/deepseek7b-math-{range_tag}_with_steps_reformatted_renumbered.json")
# Optional output: the inference data annotated with per-sampling accuracy.
modified_inference_output = os.path.join(BASE_PATH, f"output/llm_steps_acc/acc_deepseek7b-math-{range_tag}.json")
# Output: questions whose samplings disagree on correctness.
ANNOTATION_PATH = os.path.join(BASE_PATH, f"output/1_inconsis_cases/0524/inconsis_deepseek7b-math-{range_tag}.json")
# Output: GPT-identified error/fix steps for each wrong/correct sampling pair.
COMPARISON_OUTPUT_PATH = os.path.join(BASE_PATH, f"output/identify_error_step/0524/step_comparison_deepseek7b-math-{range_tag}.json")
# Output: pairs of samplings that both reached the correct answer.
POSITIVE_PAIR_OUTPUT_PATH = os.path.join(BASE_PATH, f"output/consis_cases/positive_pairs_deepseek7b-math-{range_tag}.json")


# SECURITY: credentials must never be stored in the notebook itself, because
# notebooks (and their outputs) are routinely shared and committed. The key is
# read from the environment instead; set it before running, for example with
# `%env OPENAI_API_KEY=...` in a private cell or through the Colab secrets
# panel. Any key that was previously hard-coded here should be treated as
# compromised and rotated.
API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not API_KEY:
    print("[WARNING] OPENAI_API_KEY is not set; GPT comparison requests will fail.")
BASE_URL = "https://vip-api-global.aiearth.dev/v1"
MODEL = "gpt-4.1"





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ========== ✅ UTILS ==========
def extract_true_final_result(ans_str):
    """Pull the final-answer token out of a reference solution string.

    Searches ``ans_str`` for the first ``####`` marker and captures the run of
    non-whitespace characters that follows it (any whitespace directly after
    the marker is skipped).

    Args:
        ans_str: Text of the reference answer.

    Returns:
        The captured token, or ``None`` when no ``####`` marker followed by a
        token is found.
    """
    found = re.search(r'####\s*(\S+)', ans_str)
    if found is None:
        return None
    token = found.group(1)
    return token.strip()

def load_json(path):
    """Read a JSON file and return the decoded Python object.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8 by convention; pin the encoding so decoding does not
    # depend on the locale of the machine running the notebook.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
import os
import json

# ========== ✅ STEP 1: FIND INCONSISTENT CASES ==========
def get_inconsistent_cases(data, output_path, modified_inference_output=None):
    """Find questions whose sampled answers disagree on correctness.

    For every question in ``data`` the three entries ``sampling0`` ..
    ``sampling2`` are checked against ``true_final_result``. Each sampling
    dict is annotated in place with an ``accuracy`` field: ``'true'``,
    ``'false'``, or ``''`` when it has no ``final_result``. A question is
    reported when it has at least one correct and at least one wrong sampling.

    A numeric reference answer matches only when the prediction contains a
    number of equal value as a whole token, so ``"5"`` is not found inside
    ``"15"`` while ``"18"`` still matches ``"18.0"`` and ``"1000"`` matches
    ``"1,000"``. Non-numeric reference answers fall back to substring search.

    Args:
        data: Mapping of question id to a record holding ``true_final_result``
            and the ``sampling{i}`` dicts. The sampling dicts are mutated.
        output_path: JSON file that receives the inconsistent cases.
        modified_inference_output: Optional JSON file that receives the
            annotated ``data``.

    Returns:
        Dict mapping question id to
        ``{"correct_sampling": [...], "wrong_sampling": [...]}``.
    """
    # Function-scope import keeps this definition self-contained.
    from decimal import Decimal, InvalidOperation

    # A number token: optional sign (only when not glued to a preceding word
    # character), digits with optional thousands separators, optional decimals.
    number_pattern = re.compile(
        r'(?:(?<![\w.])-)?(?<![\d.])(?:\d+(?:,\d{3})*(?:\.\d+)?|\.\d+)'
    )

    def to_number(text):
        # Returns a finite Decimal, or None when `text` is not a plain number.
        try:
            value = Decimal(text.replace(',', ''))
        except InvalidOperation:
            return None
        return value if value.is_finite() else None

    def answer_matches(expected, predicted):
        # Compare by numeric value when possible; otherwise keep substring search.
        expected_value = to_number(expected)
        if expected_value is None:
            return expected in predicted
        return any(
            to_number(token) == expected_value
            for token in number_pattern.findall(predicted)
        )

    def write_json(payload, path):
        # dirname is '' for a bare file name, and os.makedirs('') would raise.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    inconsistent_cases = {}

    for qid in data:
        true_final_result = data[qid].get('true_final_result')
        if true_final_result is None:
            print(f"[WARNING] No true answer found in {qid}")
            continue
        expected = str(true_final_result).strip()

        correct, wrong = [], []

        for i in range(3):
            pred = data[qid].get(f'sampling{i}')
            if isinstance(pred, dict):
                pred['accuracy'] = ''
                final_result = pred.get("final_result", None)
            else:
                # Missing or malformed sampling: nothing to annotate or compare.
                final_result = None

            if final_result is None:
                print(f"[INFO] No final_result in sampling{i} of {qid}, skipping...")
                continue

            if answer_matches(expected, str(final_result).strip()):
                pred['accuracy'] = 'true'
                correct.append(f'sampling{i}')
            else:
                pred['accuracy'] = 'false'
                wrong.append(f'sampling{i}')

        if correct and wrong:
            inconsistent_cases[qid] = {
                "correct_sampling": correct,
                "wrong_sampling": wrong
            }

    # ✅ Save the inconsistent cases
    write_json(inconsistent_cases, output_path)
    print(f"✅ Inconsistent cases saved to: {output_path}")

    # ✅ Optional: save the full inference data with the accuracy annotations
    if modified_inference_output:
        write_json(data, modified_inference_output)
        print(f"✅ Modified inference data saved to: {modified_inference_output}")

    return inconsistent_cases


In [None]:
def get_positive_pairs(data, output_path):
    """Collect, per question, every pair of samplings that are both correct.

    The entries ``sampling0`` .. ``sampling2`` of each question are checked
    against ``true_final_result``. Questions with at least two correct
    samplings contribute all two-element combinations of those samplings.

    A numeric reference answer matches only when the prediction contains a
    number of equal value as a whole token, so ``"8"`` is not found inside
    ``"18"`` while ``"8"`` still matches ``"8.0"``. Non-numeric reference
    answers fall back to substring search.

    Args:
        data: Mapping of question id to a record holding ``true_final_result``
            and the ``sampling{i}`` dicts.
        output_path: JSON file that receives the result.

    Returns:
        Dict mapping question id to ``{"positive_pairs": [(a, b), ...]}``.
    """
    # Function-scope imports keep this definition self-contained.
    from decimal import Decimal, InvalidOperation
    from itertools import combinations

    # A number token: optional sign (only when not glued to a preceding word
    # character), digits with optional thousands separators, optional decimals.
    number_pattern = re.compile(
        r'(?:(?<![\w.])-)?(?<![\d.])(?:\d+(?:,\d{3})*(?:\.\d+)?|\.\d+)'
    )

    def to_number(text):
        # Returns a finite Decimal, or None when `text` is not a plain number.
        try:
            value = Decimal(text.replace(',', ''))
        except InvalidOperation:
            return None
        return value if value.is_finite() else None

    def answer_matches(expected, predicted):
        # Compare by numeric value when possible; otherwise keep substring search.
        expected_value = to_number(expected)
        if expected_value is None:
            return expected in predicted
        return any(
            to_number(token) == expected_value
            for token in number_pattern.findall(predicted)
        )

    positive_pairs = {}

    for qid in data:
        true_final_result = data[qid].get('true_final_result')
        if true_final_result is None:
            continue
        expected = str(true_final_result).strip()

        correct = []
        for i in range(3):
            pred = data[qid].get(f'sampling{i}')
            if not isinstance(pred, dict):
                continue

            final_result = pred.get("final_result")
            if final_result is None:
                continue

            if answer_matches(expected, str(final_result).strip()):
                correct.append(f"sampling{i}")

        if len(correct) >= 2:
            # Build every unordered pair of correct samplings.
            positive_pairs[qid] = {
                "positive_pairs": list(combinations(correct, 2))
            }

    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(positive_pairs, f, indent=2)
    print(f"✅ Positive-positive pairs saved to: {output_path}")

    return positive_pairs


In [None]:
def _parse_json_reply(content):
    """Decode the JSON object contained in a chat-completion reply.

    Accepts a bare JSON object as well as one wrapped in extra text such as a
    Markdown code fence, in which case the span from the first ``{`` to the
    last ``}`` is decoded.

    Raises:
        json.JSONDecodeError: If no JSON object can be decoded.
        ValueError: If the decoded value is not a JSON object.
    """
    text = content.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("model reply is not a JSON object")
    return parsed


def call_gpt_api(stu_steps_a, stu_steps_b, gold_steps):
    """Ask the chat model to judge two student solutions against the gold answer.

    Args:
        stu_steps_a: Reasoning text presented to the model as Student A.
        stu_steps_b: Reasoning text presented to the model as Student B.
        gold_steps: Gold-standard solution text.

    Returns:
        The decoded JSON object from the model reply (expected to hold
        ``student_A`` and ``student_B`` entries as requested in the prompt),
        or ``{"error": <message>}`` when the request or the decoding fails.
    """
    prompt = f"""You are an expert in math education. Your job is to evaluate step-by-step reasoning by comparing student answers.

You will be given:
- A gold standard answer that contains the correct steps and final answer.
- Two student-generated solutions: Student A and Student B.

Your task:
1. Determine whether each student's reasoning is actually correct or incorrect based on the gold standard.
2. If a student's reasoning is incorrect:
   - Set "accuracy" to false.
   - Identify the **first step (by order)** that deviates from the gold standard and leads to the final incorrect answer.
   - This should be the **earliest step** that causes the error, not just any incorrect step.
   - Set "error_step" to that step number.
   - Set "fix_step" to null.
3. If a student's reasoning is correct:
   - Set "accuracy" to true.
   - Set "error_step" to null.
   - Identify the NUMBER of the step in this student's reasoning that fixed the error made by the incorrect student.
   - Set "fix_step" to that number.

Return ONLY a JSON object in the following format:

{{
  "student_A": {{
    "accuracy": true or false,
    "error_step": NUMBER or null,
    "fix_step": NUMBER or null
  }},
  "student_B": {{
    "accuracy": true or false,
    "error_step": NUMBER or null,
    "fix_step": NUMBER or null
  }}
}}

Gold standard:
{gold_steps}

Student A's reasoning:
{stu_steps_a}

Student B's reasoning:
{stu_steps_b}
"""

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    body = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}]
    }

    try:
        response = requests.post(f"{BASE_URL}/chat/completions", headers=headers, json=body, timeout=60)
        # Surface HTTP failures (401, 429, 5xx) with a meaningful message
        # instead of an opaque KeyError on the missing "choices" field.
        response.raise_for_status()
        result = response.json()
        content = result["choices"][0]["message"]["content"]
        return _parse_json_reply(content)
    except Exception as e:
        # Deliberate best-effort: one failed request must not abort a long
        # batch, so the failure is reported to the caller as data.
        return {"error": str(e)}


In [None]:
# ========== ✅ STEP 3: COMPARE WITH VALIDATION ==========
def compare_steps(inference_data, inconsis_data, output_path):
    """Locate error and fix steps for every wrong/correct sampling pair.

    For each question in ``inconsis_data`` every wrong sampling is paired with
    every correct sampling and sent to ``call_gpt_api`` (wrong as Student A,
    correct as Student B). When the model judges exactly one of the two as
    accurate, the error step of the wrong one and the fix step of the correct
    one are recorded. If the model's verdict is the reverse of the answer-based
    labels, the roles are swapped and the entry is flagged ``"modified": True``.

    Pairs are skipped, and their question id appended to the ``omitted`` list,
    when the reply is an error, is malformed, or judges both students alike.

    Args:
        inference_data: Mapping of question id to a record holding
            ``true_whole_answer`` and the ``sampling{i}`` dicts with
            ``whole_answer``.
        inconsis_data: Mapping of question id to
            ``{"correct_sampling": [...], "wrong_sampling": [...]}``.
        output_path: JSON file that receives the comparison result.
    """
    output = {}
    # Collected across ALL questions; resetting it per question would keep
    # only the omissions of the last question in the saved file.
    omitted = []

    for qid, record in tqdm(inconsis_data.items(), desc="Comparing steps"):
        output[qid] = {
            "correct_sampling": {},
            "wrong_sampling": {}
        }
        gold_steps = inference_data[qid]['true_whole_answer']

        for wrong_id in record["wrong_sampling"]:
            for correct_id in record["correct_sampling"]:
                suspected_correct = inference_data[qid][correct_id]["whole_answer"]
                suspected_wrong = inference_data[qid][wrong_id]["whole_answer"]

                result = call_gpt_api(suspected_wrong, suspected_correct, gold_steps)
                verdict_a = result.get("student_A") if isinstance(result, dict) else None
                verdict_b = result.get("student_B") if isinstance(result, dict) else None
                stu_a_acc = verdict_a.get("accuracy") if isinstance(verdict_a, dict) else None
                stu_b_acc = verdict_b.get("accuracy") if isinstance(verdict_b, dict) else None

                # Only a clean True/False split is usable. Requiring real
                # booleans also guarantees `is_swapped` is set for this pair
                # rather than left over from a previous one.
                usable = isinstance(stu_a_acc, bool) and isinstance(stu_b_acc, bool)
                if not usable or stu_a_acc == stu_b_acc:
                    omitted.append(qid)
                    continue

                # Student A was the suspected-wrong sampling; if the model
                # found A accurate, the answer-based labels are reversed.
                is_swapped = stu_a_acc

                # Determine actual IDs
                true_correct_id = wrong_id if is_swapped else correct_id
                true_wrong_id = correct_id if is_swapped else wrong_id

                # Get fix/error steps from whichever student the model judged
                # correct/wrong.
                judged_correct = verdict_a if is_swapped else verdict_b
                judged_wrong = verdict_b if is_swapped else verdict_a
                fix_step = judged_correct.get("fix_step")
                error_step = judged_wrong.get("error_step")

                # Fill wrong_sampling
                output[qid]["wrong_sampling"].setdefault(true_wrong_id, {})
                output[qid]["wrong_sampling"][true_wrong_id][f"vs_{true_correct_id}"] = {
                    "error_step": error_step,
                    "modified": is_swapped
                }
                # Fill correct_sampling
                output[qid]["correct_sampling"].setdefault(true_correct_id, {})
                output[qid]["correct_sampling"][true_correct_id][f"vs_{true_wrong_id}"] = {
                    "fix_step": fix_step,
                    "modified": is_swapped
                }

    # Written once after the loop so the key exists even for empty input.
    output['omitted'] = omitted

    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)
    print(f"✅ Step comparison saved to: {output_path}")



In [None]:
if __name__ == "__main__":
    # Load the sampled inference results selected by INFERENCE_PATH.
    inference_data = load_json(INFERENCE_PATH)

    # Step 2: extract positive-positive pairs (pairs of samplings that both
    # reached the correct answer) and save them to POSITIVE_PAIR_OUTPUT_PATH.
    get_positive_pairs(inference_data, POSITIVE_PAIR_OUTPUT_PATH)



✅ Positive-positive pairs saved to: /content/drive/MyDrive/Cluster-proj/output/consis_cases/positive_pairs_deepseek7b-math-401-600.json


In [None]:
# ========== ✅ STEP 4: RUN EVERYTHING ==========
if __name__ == "__main__":
    # Reload the inference results; get_inconsistent_cases annotates the
    # sampling dicts of this object in place with an 'accuracy' field.
    inference_data = load_json(INFERENCE_PATH)
    # NOTE(review): `modified_inference_output` is defined in the configuration
    # cell but is not passed here, so the accuracy-annotated data is not
    # written to disk by this call — confirm whether that is intended.
    inconsis_data = get_inconsistent_cases(inference_data, ANNOTATION_PATH)
    # Issues one call_gpt_api request per (wrong, correct) sampling pair of
    # every inconsistent question and saves the result to COMPARISON_OUTPUT_PATH.
    compare_steps(inference_data, inconsis_data, COMPARISON_OUTPUT_PATH)

[INFO] No final_result in sampling0 of q_401, skipping...
[INFO] No final_result in sampling0 of q_402, skipping...
[INFO] No final_result in sampling0 of q_404, skipping...
[INFO] No final_result in sampling0 of q_408, skipping...
[INFO] No final_result in sampling1 of q_408, skipping...
[INFO] No final_result in sampling0 of q_409, skipping...
[INFO] No final_result in sampling0 of q_410, skipping...
[INFO] No final_result in sampling1 of q_410, skipping...
[INFO] No final_result in sampling2 of q_410, skipping...
[INFO] No final_result in sampling1 of q_412, skipping...
[INFO] No final_result in sampling0 of q_413, skipping...
[INFO] No final_result in sampling0 of q_414, skipping...
[INFO] No final_result in sampling1 of q_414, skipping...
[INFO] No final_result in sampling0 of q_415, skipping...
[INFO] No final_result in sampling0 of q_417, skipping...
[INFO] No final_result in sampling1 of q_417, skipping...
[INFO] No final_result in sampling2 of q_418, skipping...
[INFO] No fina

Comparing steps: 100%|██████████| 59/59 [02:32<00:00,  2.58s/it]

✅ Step comparison saved to: /content/drive/MyDrive/Cluster-proj/output/identify_error_step/0524/step_comparison_deepseek7b-math-401-600.json



