In [1]:
from datasets import load_dataset
from utils import utils
import re

In [None]:
# Fetch the test split of MATH-500 and turn its column-oriented dict into a
# list with one plain dict per row (column name -> that row's value).
math_dataset = load_dataset("HuggingFaceH4/MATH-500", split="test")
column_names = math_dataset.column_names
column_values = math_dataset.to_dict().values()
# zip(*column_values) transposes the per-column lists into per-row tuples.
math_dataset_ls = [dict(zip(column_names, row_values)) for row_values in zip(*column_values)]

In [None]:
# Show the first row dict as the cell output to eyeball the record layout.
math_dataset_ls[0]

In [None]:
# Attach a "Name" field of the form math500_<domain>_<problem_id> to every
# record, parsed from a unique_id shaped like "test/<domain>/<digits>.json".
unique_id_pattern = re.compile(r'test/([^/]+)/(\d+)\.json')

for curr in math_dataset_ls:
    curr_unique_id = curr["unique_id"]
    match = unique_id_pattern.search(curr_unique_id)
    if match is None:
        # Fail loudly: without this guard an unmatched id would build the Name
        # from the previous iteration's domain/problem_id (or hit NameError on
        # the very first record), silently yielding a wrong, duplicated Name.
        raise ValueError(f"Unexpected unique_id format: {curr_unique_id!r}")
    domain, problem_id = match.groups()
    curr["Name"] = f"math500_{domain}_{problem_id}"

In [None]:
# Hand the list of row dicts to the project helper utils.write_to_json with the
# target path ./data/Math-500.json (helper not shown here; presumably it
# serializes the list as JSON — confirm in utils).
utils.write_to_json("./data/Math-500.json", math_dataset_ls)

In [16]:
# Load solver outputs through the project helper utils.read_json_in_folder.
# NOTE(review): the helper is not visible here; later cells index the result
# by position and take len() of it, so it is presumably a list of records.
test_dataset = utils.read_json_in_folder("./NL_solver_results_ckpt/MATH-500")

In [17]:
# Sanity check: how many records were loaded.
len(test_dataset)

500

In [22]:
# Persist test_dataset via utils.write_to_json to ./data/NL_answered_Math-500.json.
# NOTE(review): this cell's execution count (22) is higher than that of the
# answer re-extraction loop further down (20), so it appears to have been run
# after that loop; on a top-to-bottom run it would write the records before
# solved_answer is re-extracted — confirm the intended order.
utils.write_to_json("./data/NL_answered_Math-500.json", test_dataset)

In [18]:
# Inspect the first entry in the first record's "solved_answer_results" list.
test_dataset[0]["solved_answer_results"][0]

{'solved_answer': ['\\dfrac{16', '\\dfrac{16', '\\dfrac{16'],
 'response': '<|im_start|>user\nSolve the following question Your answer should be in \\boxed\\{\\} format.\n\nHere is the question:\nSam is carrying a 2 gallon bucket of milk to his house from the barn, which is 3 miles away from his house. However, the bucket has a leak in it. For each mile he walks, there is $\\frac{2}{3}$ as much milk in the bucket as at the beginning of the mile. How many gallons of milk will be in the bucket when Sam gets home?<|im_end|>\n<|im_start|>assistant\n<think>\nOkay, so Sam has this 2-gallon bucket of milk, and he needs to carry it 3 miles from the barn to his house. But there\'s a leak in the bucket. The problem says that for each mile he walks, there\'s 2/3 as much milk in the bucket as at the beginning of the mile. I need to figure out how much milk is left when he gets home. Hmm, let me think about how to approach this.\n\nFirst, let me make sure I understand the problem correctly. The buc

In [19]:
# Spot-check utils.extract_boxed on one raw response string: take the last
# element of what it returns for the first sample of the first record.
test_str = test_dataset[0]["solved_answer_results"][0]["response"]
utils.extract_boxed(test_str)[-1]

'\\dfrac{16}{27}'

In [20]:
# Rebuild each record's "solved_answer_results": re-run utils.extract_boxed on
# every stored response, overwrite "solved_answer" with the last extracted
# item, and keep only the samples for which something was extracted.
for record_idx, record in enumerate(test_dataset):
    kept_samples = []
    for sample in record["solved_answer_results"]:
        boxed_items = utils.extract_boxed(sample["response"])
        if len(boxed_items) == 0:
            # Nothing extracted from this response: the sample is dropped.
            continue
        sample["solved_answer"] = boxed_items[-1]
        kept_samples.append(sample)
    test_dataset[record_idx]["solved_answer_results"] = kept_samples

In [21]:
# Show the first record's "solved_answer_results" list to check its contents.
test_dataset[0]["solved_answer_results"]

[{'solved_answer': '\\dfrac{16}{27}',
  'response': '<|im_start|>user\nSolve the following question Your answer should be in \\boxed\\{\\} format.\n\nHere is the question:\nSam is carrying a 2 gallon bucket of milk to his house from the barn, which is 3 miles away from his house. However, the bucket has a leak in it. For each mile he walks, there is $\\frac{2}{3}$ as much milk in the bucket as at the beginning of the mile. How many gallons of milk will be in the bucket when Sam gets home?<|im_end|>\n<|im_start|>assistant\n<think>\nOkay, so Sam has this 2-gallon bucket of milk, and he needs to carry it 3 miles from the barn to his house. But there\'s a leak in the bucket. The problem says that for each mile he walks, there\'s 2/3 as much milk in the bucket as at the beginning of the mile. I need to figure out how much milk is left when he gets home. Hmm, let me think about how to approach this.\n\nFirst, let me make sure I understand the problem correctly. The bucket starts with 2 gallo

In [12]:
# Score every question: collect its sampled answers, judge them against the
# reference in curr["answer"], and accumulate pass / majority-vote counts.
#
# The two schema keys are named once here so the cell can be pointed at a
# differently-shaped results file without touching the loop.
# NOTE(review): the extraction cells earlier in this notebook work on
# "solved_answer_results" / "solved_answer" — confirm which schema
# test_dataset actually holds when this cell runs.
RESULTS_KEY = "collected_answer_results"
ANSWER_KEY = "collected_answer"

total_question_num = len(test_dataset)
total_pass_correct_num = 0
total_majority_correct_num = 0

# Indices of questions whose pass flag from utils.judge_correctness was falsy.
wrong_answer_idx = []

for i, curr in enumerate(test_dataset):
    curr_answers = utils.collect_answers(curr[RESULTS_KEY], answer_key=ANSWER_KEY)
    pass_correctness, majority_vote_correctness = utils.judge_correctness(curr_answers, curr["answer"])
    total_pass_correct_num += pass_correctness
    total_majority_correct_num += majority_vote_correctness
    if not pass_correctness:
        wrong_answer_idx.append(i)

# With an empty dataset the rates are undefined; report NaN instead of raising
# ZeroDivisionError in the summary.
if total_question_num:
    pass_rate = total_pass_correct_num / total_question_num
    majority_vote_rate = total_majority_correct_num / total_question_num
else:
    pass_rate = majority_vote_rate = float("nan")

print(f"Total number of questions: {total_question_num}")
print(f"Total number of questions passed: {total_pass_correct_num}")
print(f"Total number of questions majority voted: {total_majority_correct_num}")
print(f"Pass rate: {pass_rate}")
print(f"Majority vote rate: {majority_vote_rate}")

Total number of questions: 500
Total number of questions passed: 400
Total number of questions majority voted: 366
Pass rate: 0.8
Majority vote rate: 0.732


In [15]:
# Look at the reference answer stored for the second record.
test_dataset[1]["answer"]

'6r^2-4r-24'

In [16]:
# Look at the second record's "collected_answer_results" list, for comparison
# with the reference answer shown by the previous cell.
test_dataset[1]["collected_answer_results"]

[{'collected_answer': '6r^2 - 4r - 24',
  'response_collected_answer_generation': "<|im_start|>user\nFind the answer to the following question in the provided long CoT content. Your answer should be in \\boxed\\{\\} format.\n\nHere is the question:\nSimplify $4(3r^3+5r-6)-6(2r^3-r^2+4r)$, and express your answer in the form $Ar^2 + Br + C$, where $A$, $B$, and $C$ are integers.\n\nHere is the long CoT content:\n\n# Simplifying an Algebraic Expression in Lean 4\n\nI'll work through the problem of simplifying the expression $4(3r^3+5r-6)-6(2r^3-r^2+4r)$ and express it in the form $Ar^2 + Br + C$ with integer coefficients.\n\nFirst, let's think about how we would solve this by hand. We need to expand each term and then collect like terms.\n\n1. Expand $4(3r^3 + 5r - 6)$:\n   - $4 \\cdot 3r^3 = 12r^3$\n   - $4 \\cdot 5r = 20r$\n   - $4 \\cdot (-6) = -24$\n   - So we get $12r^3 + 20r - 24$\n\n2. Expand $6(2r^3 - r^2 + 4r)$:\n   - $6 \\cdot 2r^3 = 12r^3$\n   - $6 \\cdot (-r^2) = -6r^2$\n   -