In [1]:
from datasets import load_dataset
dataset = load_dataset("HuggingFaceH4/MATH-500", split="test")

MODEL_NAME_MAPPING = {
    "DeepSeek-R1-Distill-Llama-8B": "llama-8B",
    "DeepSeek-R1-Distill-Qwen-7B": "qwen-7B",
    "DeepSeek-R1-Distill-Qwen-1.5B": "qwen-1p5B",
}

def find_dataset_sample(id: str) -> str:
    for item in dataset:
        if item["unique_id"] == id:
            return item

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import re

def extract_boxed_answer(response):
    # Find all \boxed{...} matches
    # This pattern will match nested braces within \boxed{} properly
    matches = re.findall(r'\\boxed{([^{}]*(?:\{[^{}]*\}[^{}]*)*)}', response)
    # If matches found, return the last one, otherwise return None
    if matches:
        return matches[-1]
    return None

In [3]:
from utils.MATH_500_eval.grader import grade_answer

EOS = '<｜end▁of▁sentence｜>'
THINK_OPEN = '<think>'
THINK_CLOSE = '</think>'
THINK_BLOCK_RE = r'<think>.*?</think>'
THINK_BLOCK_GROUPED_RE = r'(<think>)(.*?)(</think>)'
MASK = '\nOkay, I think I have finished thinking.\n'

def process_traces(traces, filename):
    res = []
    parts = filename.split("_")
    model_name = MODEL_NAME_MAPPING[parts[1]]

    for trace in traces:
        sample = find_dataset_sample(trace["unique_id"])
        gold_answer = sample["answer"]
        response = trace["response"].strip()

        # 1) must end with EOS
        if not response.endswith(EOS):
            continue

        # 2) must have exactly one <think> and exactly one </think>
        if response.count(THINK_OPEN) != 1 or response.count(THINK_CLOSE) != 1:
            continue

        # 3) extract and validate final answer
        extracted_answer = extract_boxed_answer(response)
        is_correct = int(grade_answer(extracted_answer, gold_answer))
        if is_correct == 0:
            continue

        # 4) find the single think block
        m = re.search(THINK_BLOCK_RE, response, flags=re.DOTALL)
        if not m:
            continue

        # 4) truncate everything after </think> and append the gold answer line
        res_withR = response[:m.end()] + r"The final answer is: \boxed{" + extracted_answer + "}"

        # 5) mask the inner think content
        res_withoutR = re.sub(
            THINK_BLOCK_GROUPED_RE,
            r'\1' + MASK + r'\3',
            res_withR,
            flags=re.DOTALL
        )

        res.append({
            "model_name": model_name,
            "unique_id": trace["unique_id"],
            "problem": sample["problem"],
            "response_withR": res_withR,
            "response_withoutR": res_withoutR,
            "subject": sample["subject"],
            "level": sample["level"]
        })
    print(f"Processed {len(res)} traces out of {len(traces)} for {model_name}")
    return res

In [4]:
import os
import json

results = []
# loop through all jsonl files in the folder "data/traces"
for filename in os.listdir("data/traces"):
    if filename.endswith(".jsonl"):
        file_path = os.path.join("data/traces", filename)
        traces = []
        with open(file_path, 'r') as f:
            for line in f:
                traces.append(json.loads(line))
        results.extend(process_traces(traces, filename))

Processed 350 traces out of 500 for llama-8B
Processed 438 traces out of 500 for qwen-7B
Processed 369 traces out of 500 for qwen-1p5B


In [26]:
import pandas as pd

df = pd.DataFrame(results)
# keep only those IDs that have results for all models
filter = (
    df.groupby('unique_id')['model_name']
    .nunique()
    .pipe(lambda s: s[s == 3].index)
)
df_final = df[df['unique_id'].isin(filter)].copy()
# save to json
df_final.to_json("data/traces.jsonl", orient='records', lines=True)