In [None]:
import json
import os
import requests
from difflib import SequenceMatcher

# ✅ Your custom API config
# The OpenAI key is read from the environment so that the secret never lives in
# the notebook source, its saved outputs, or version control. Set it before
# running this cell, e.g. `os.environ["OPENAI_API_KEY"] = getpass.getpass()`.
# NOTE(review): the key that used to be hard-coded on this line has been exposed
# and must be revoked/rotated in the OpenAI dashboard.
API_KEY = os.environ.get("OPENAI_API_KEY", "")
MODEL = "gpt-4.1"

# ✅ Index range handled by this run; range_tag is embedded in file names below
start_index = 700
end_index = 731
range_tag = f"{start_index}-{end_index}"

# Mount Google Drive (Colab only) so the /content/drive/... paths used by the
# following cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

# Project root on the mounted Drive.
BASE_PATH = "/content/drive/MyDrive/Cluster-proj"
# JSON file later loaded into `logits_data`; the file name embeds range_tag.
LOGITS_PATH = f"{BASE_PATH}/output/llm_steps/whole_logits/deepseek7b-gsm-{range_tag}-hidden.json"

# ✅ Prompt builder
def build_error_prompt(question, true_whole_answer, sample_whole_answer):
    """Build the user prompt asking the model to locate the first mistake.

    Args:
        question: Text of the math problem.
        true_whole_answer: Reference (correct) solution text.
        sample_whole_answer: Sampled solution text to be checked.

    Returns:
        A prompt string that embeds the three inputs and asks for a JSON
        object with the keys "first_error_sentence" and "error_reason".
    """
    # Doubled braces ({{ / }}) render as literal braces in the f-string, so the
    # model sees a plain JSON template.
    return f"""
Here is a math problem, its correct answer, and a sample answer that may contain mistakes.

【Question】:
{question}

【Correct Answer】:
{true_whole_answer}

【Incorrect Answer】:
{sample_whole_answer}

Please help me:
1. Identify the earliest mistake in the incorrect answer and provide the complete sentence from that point.
2. Briefly explain why it is incorrect.

Please output in the following JSON format:
{{
  "first_error_sentence": "<sentence>",
  "error_reason": "<brief explanation>"
}}
"""



# ✅ Call your custom GPT API
def call_custom_gpt_api(prompt, timeout=60):
    """Send `prompt` to the OpenAI chat-completions endpoint and return the reply text.

    Args:
        prompt: User message content.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block indefinitely on a stalled connection.

    Returns:
        The `content` string of the first choice in the response.

    Raises:
        RuntimeError: If the HTTP status code is not 200 (message includes the
            status code and response body).
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a meticulous and precise comparer."},
            {"role": "user", "content": prompt}
        ]
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    if response.status_code != 200:
        # RuntimeError is still caught by any existing `except Exception` handler.
        raise RuntimeError(f"API request failed: {response.status_code}, {response.text}")
    return response.json()["choices"][0]["message"]["content"]



In [9]:
import re
# ✅ Matching helper: returns the first/last token index covered by a fragment
def find_sentence_span_indices_robust(fragment, token_probs):
    """
    Locate `fragment` within the text formed by concatenating the tokens.

    Matching is done on whitespace-free text: all whitespace is removed from
    both the fragment and every token before searching, so differences in
    spacing between the two do not prevent a match.

    Args:
        fragment: Text to look for. None, empty, or whitespace-only input is
            treated as "no match".
        token_probs: Sequence of dicts, each with a "token" string entry.

    Returns:
        (begin_index, end_index): inclusive indices into `token_probs` of the
        first and last token overlapping the first occurrence of the fragment,
        or (-1, -1) when there is nothing to match or no match is found.
    """
    fragment_clean = re.sub(r"\s+", "", fragment or "")
    if not fragment_clean:
        # An empty needle would "match" at offset 0 and yield a bogus span.
        return -1, -1

    tokens_clean = [re.sub(r"\s+", "", entry["token"]) for entry in token_probs]
    decoded_text_clean = "".join(tokens_clean)

    char_start_idx = decoded_text_clean.find(fragment_clean)
    if char_start_idx == -1:
        return -1, -1
    char_end_idx = char_start_idx + len(fragment_clean)

    # Walk the tokens, tracking how many cleaned characters have been consumed,
    # to translate the character offsets back into token indices.
    cumulative_len = 0
    begin_index = -1
    for idx, token_clean in enumerate(tokens_clean):
        cumulative_len += len(token_clean)

        if begin_index == -1 and cumulative_len > char_start_idx:
            begin_index = idx
        if cumulative_len >= char_end_idx:
            return begin_index, idx

    return begin_index, len(token_probs) - 1  # defensive fallback

In [None]:

# ✅ Load logits_data
# Encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default.
with open(LOGITS_PATH, "r", encoding="utf-8") as f:
    logits_data = json.load(f)


In [6]:
results = {}


def _strip_code_fence(raw_output):
    """Remove Markdown code-fence wrapping (e.g. ```json ... ```) from a model reply."""
    cleaned = raw_output.strip().strip("`").strip()
    # After the backticks are gone, a fenced reply still starts with the
    # language tag "json"; a bare JSON object never does.
    if cleaned.startswith("json"):
        cleaned = cleaned[4:].strip()
    return cleaned


# ✅ Main loop: ask the model for the first error of every incorrect sample
for qid, sample in logits_data.items():
    question = sample["question"]
    true_final_result = sample["true_final_result"]
    true_whole_answer = sample["true_whole_answer"]

    for sampling_id in ["sampling0", "sampling1", "sampling2"]:
        if sampling_id not in sample:
            continue
        sampling = sample[sampling_id]
        if sampling["final_result"] == true_final_result:
            continue  # sample reached the correct final result; nothing to locate

        sample_whole_answer = sampling["whole_answer"]

        # Build the prompt and call the API
        prompt = build_error_prompt(question, true_whole_answer, sample_whole_answer)
        output = call_custom_gpt_api(prompt)
        print(f"\n🔍 {qid} / {sampling_id}:\n{output}")

        # Strip a possible ``` / ```json wrapper before parsing
        output = _strip_code_fence(output)

        # Parse the JSON reply; on failure keep the raw text as the reason so
        # the sample is still recorded (with an empty sentence).
        try:
            output_json = json.loads(output)
            sentence = output_json["first_error_sentence"]
            error_reason = output_json["error_reason"]
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            print(f"⚠️ JSON parsing failed: {e}")
            sentence = ""
            error_reason = output

        # ✅ Store the result under results[qid][sampling_id]
        if qid not in results:
            results[qid] = {}
        results[qid][sampling_id] = {
            "first_error_sentence": sentence,
            "error_reason": error_reason,
        }


🔍 q_700 / sampling0:
{
  "first_error_sentence": "To find the cost of a single brown sock, divide the 45 cents cost of two white socks by 2: 45 cents / 2 = 22.5 cents.",
  "error_reason": "This incorrectly assumes that two white socks cost the same as two brown socks, ignoring the 25 cent difference; the cost of a brown sock should be calculated as 45 cents minus 25 cents."
}

🔍 q_700 / sampling2:
{
  "first_error_sentence": "Let's denote the cost of a single brown sock as B, and the cost of two white socks as W.",
  "error_reason": "Here, the error is in the notation: W is defined as the cost of two white socks, but later used as both the cost of two and one white sock, risking confusion and improper equation setup. The problem is solved correctly, but a more precise variable definition (e.g., using w for one white sock and 2w for two) would avoid ambiguity."
}

🔍 q_701 / sampling1:
{
  "first_error_sentence": "Please enter your response here.",
  "error_reason": "The response is a p

In [10]:


# ✅ Second pass: add token indices to each stored result (no new API calls)
for qid, sample_data in results.items():
    for sampling_id in sample_data:
        info = sample_data[sampling_id]
        sentence = info["first_error_sentence"]

        # Map the error sentence onto a token index range of this sample
        begin_idx, end_idx = find_sentence_span_indices_robust(
            sentence,
            logits_data[qid][sampling_id]["token_probs"],
        )

        # Record the span next to the sentence it belongs to
        info.update(
            first_error_token_index=begin_idx,
            last_error_token_index=end_idx,
        )

        # Optional: print for a visual check
        print(f"{qid} / {sampling_id} → [{begin_idx}, {end_idx}] : {sentence}")

q_700 / sampling0 → [48, 87] : To find the cost of a single brown sock, divide the 45 cents cost of two white socks by 2: 45 cents / 2 = 22.5 cents.
q_700 / sampling2 → [30, 53] : Let's denote the cost of a single brown sock as B, and the cost of two white socks as W.
q_701 / sampling1 → [0, 5] : Please enter your response here.
q_703 / sampling1 → [269, 290] : Since each serving of guacamole requires 1 avocado, Georgie can make 9 servings of guacamole.
q_705 / sampling1 → [23, 66] : If she has $51 left, that means she spent half of her initial amount before buying the book. So, we can set up the equation:

$51 = (initial amount) / 2
q_705 / sampling2 → [123, 132] : Let's solve this equation step by step:
q_707 / sampling0 → [44, 57] : Melanie is going to put 4 bread pieces into the blender.
q_707 / sampling2 → [29, 39] : The number of pieces of bread after each step is:
q_708 / sampling0 → [28, 78] : The tap flow rate is 12 liters per minute, but 1 liter of water escapes per minute fr

In [11]:
output_dir = os.path.join(BASE_PATH, "output/error_index")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f"{range_tag}_hidden_index.json")
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform's locale default.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\n✅ 所有结果已保存到 {output_path}")



✅ 所有结果已保存到 /content/drive/MyDrive/Cluster-proj/output/error_index/700-731_hidden_index.json


In [None]:
#check


import json
import os
import re

# ✅ Path configuration
BASE_PATH = "/content/drive/MyDrive/Cluster-proj"
range_tag = "700-731"
# NOTE(review): this name has no "-hidden" suffix, unlike the LOGITS_PATH set
# earlier in the notebook — confirm which logits file the check should use.
LOGITS_PATH = f"{BASE_PATH}/output/llm_steps/whole_logits/deepseek7b-gsm-{range_tag}.json"
# NOTE(review): the save cell above writes "{range_tag}_hidden_index.json";
# this reads "{range_tag}_sentence_with_index.json" — confirm this is the
# intended file and not a stale result from another run.
ERROR_INDEX_PATH = f"{BASE_PATH}/output/error_index/{range_tag}_sentence_with_index.json"

# ✅ Load both data sources (UTF-8 pinned so reads do not depend on the locale)
with open(LOGITS_PATH, "r", encoding="utf-8") as f:
    logits_data = json.load(f)

with open(ERROR_INDEX_PATH, "r", encoding="utf-8") as f:
    error_index_data = json.load(f)

# ✅ For every entry: rebuild the text from its token span and compare it
#    with the stored error sentence
for qid, sample_data in error_index_data.items():
    for sampling_id in sample_data:
        info = sample_data[sampling_id]
        sentence = info.get("first_error_sentence", "").strip()
        start = info.get("first_error_token_index", -1)
        end = info.get("last_error_token_index", -1)

        # -1 marks an index that is missing or was never matched
        if -1 in (start, end):
            print(f"{qid} / {sampling_id} ❌ 缺失 index")
            continue

        span_entries = logits_data[qid][sampling_id]["token_probs"][start:end+1]
        reconstructed = " ".join(entry["token"] for entry in span_entries).strip()

        print("=" * 60)
        print(f"🔍 {qid} / {sampling_id}")
        print(f"📌 Token span [{start}, {end}]:\n{reconstructed}")
        print(f"\n📌 Error sentence:\n{sentence}")

        # Simple comparison: case-insensitive, with all whitespace removed
        same_text = (
            re.sub(r"\s+", "", reconstructed.lower())
            == re.sub(r"\s+", "", sentence.lower())
        )
        if same_text:
            match_status = "✅ MATCH"
        else:
            match_status = "❌ DIFFERENT"

        print(f"\n🔎 比对结果: {match_status}\n")


🔍 q_700 / sampling0
📌 Token span [47, 78]:
If two white socks cost  4 5 cents , we can set up the equation  2 ( x +  0 . 2 5 ) =  4 5 .

📌 Error sentence:
If two white socks cost 45 cents, we can set up the equation 2(x + 0.25) = 45.

🔎 比对结果: ✅ MATCH

🔍 q_700 / sampling1
📌 Token span [19, 56]:
According to the given information , two white socks cost  2 5 cents more than a single brown sock , so the cost of two white socks is ( x +  2 5 ) cents .

📌 Error sentence:
According to the given information, two white socks cost 25 cents more than a single brown sock, so the cost of two white socks is (x + 25) cents.

🔎 比对结果: ✅ MATCH

🔍 q_700 / sampling2
📌 Token span [39, 56]:
2 * white _ sock _ cost +  2 5 cents = brown _ sock _ cost

📌 Error sentence:
2 * white_sock_cost + 25 cents = brown_sock_cost

🔎 比对结果: ✅ MATCH

🔍 q_701 / sampling0
📌 Token span [0, 37]:
Example Question : Sarah has  3 packs of pencils . Each pack contains  1 2 pencils . She gives away  1 5 pencils to her classmates . Ho