### Extracting Results from log/

In [None]:
import os
import json
import numpy as np
from experiment.utils import score_math

# Benchmarks to report. Each entry of dataset_range is the index-range
# sub-directory used for the dataset at the same position in dataset_names.
dataset_names = [
    "math_level5",
    "aime24",
    "aime25",
    "zebra_puzzles",
    "mini_sudoku",
]
dataset_range = [
    "0-681",
    "0-30",
    "0-30",
    "0-100",
    "0-100",
]

# Results are read from log/<model>.
model = "gpt-5-mini"
root_path = f"log/{model}"

def calculate_accuracy(results, dataset_name):
    """Return the mean of the ``"score"`` field over ``results``.

    Args:
        results: Sized collection of mappings, each holding a ``"score"`` entry.
        dataset_name: Accepted for call-site symmetry; not used in the computation.

    Raises:
        ZeroDivisionError: If ``results`` is empty.
    """
    scores = [entry["score"] for entry in results]
    return sum(scores) / len(results)

# Build one report line per (dataset, method): the mean and standard deviation
# of the per-run accuracies, formatted as a LaTeX table cell.
print_log = []

for dataset_name, range_dir in zip(dataset_names, dataset_range):
    print_log.append("-"*100)
    print_log.append(f"{dataset_name}:")
    path = os.path.join(root_path, dataset_name)
    try:
        # os.listdir returns entries in arbitrary order; sort for a stable report.
        methods = sorted(os.listdir(path))
    except OSError as e:
        # A dataset directory that is missing for this model must not abort
        # the report for the remaining datasets.
        print("Error: ", e)
        print_log.append("No result")
        continue

    for method in methods:
        method_path = os.path.join(path, method, range_dir)
        accuracies = []
        try:
            for file_name in sorted(os.listdir(method_path)):
                # Every JSON file is one run; files with "score" in the name are skipped.
                if file_name.endswith(".json") and "score" not in file_name:
                    with open(os.path.join(method_path, file_name), "r", encoding="utf-8") as f:
                        results = json.load(f)
                    accuracies.append(calculate_accuracy(results, dataset_name))
        except (FileNotFoundError, NotADirectoryError):
            # The method has no directory for this index range: expected, stay quiet.
            print_log.append(f"{method}: No result")
            continue
        except Exception as e:
            # Unexpected failures (corrupt JSON, missing "score", empty run) are
            # surfaced instead of being hidden behind "No result".
            print("Error: ", e)
            print_log.append(f"{method}: No result")
            continue

        if not accuracies:
            # np.mean([]) would yield nan (with a RuntimeWarning) rather than fail.
            print_log.append(f"{method}: No result")
            continue

        # Plain-text alternative:
        # print_log.append(f"{method}: {np.mean(accuracies)*100:.2f} ({np.std(accuracies)*100:.2f}), Support of Runs: {len(accuracies)}")
        print_log.append(f"({len(accuracies)}) {method}: & {np.mean(accuracies)*100:.2f}\\scriptsize" + "{$\\pm$" + f"{np.std(accuracies)*100:.2f}" + "}")

for log in print_log:
    print(log)

#### Self-Consistency Evaluation

In [None]:
# Start from an empty log buffer so that lines collected earlier under the
# same name are not printed again at the end of this cell.
print_log = []
# Scorers and the default majority-vote helper used by the evaluation below.
from experiment.utils import score_math, score_mc, majority_vote_answers, sudoku_match_score, exact_match_score
# Used to draw random subsets of runs.
import random

# Methods whose runs are combined by majority vote.
method_names_for_sc = [
    "cot",
    "self-refine",
    "atom",
    "mctsr",
    "ssr",
    "ssr-adaptive",
    "ssr-planning",
]
# Benchmarks and, position by position, the index-range sub-directory of each.
dataset_names = [
    "math_level5",
    "aime24",
    "aime25",
    "zebra_puzzles",
    "mini_sudoku",
]
dataset_range = [
    "0-681",
    "0-30",
    "0-30",
    "0-100",
    "0-100",
]

def reasoning_gym_score(prediction, ground_truth, dataset_name):
    """Score ``prediction`` against ``ground_truth`` with the dataset's scorer.

    Args:
        prediction: Answer to be scored.
        ground_truth: Reference answer.
        dataset_name: ``"mini_sudoku"`` selects ``sudoku_match_score``;
            ``"zebra_puzzles"`` selects ``exact_match_score``.

    Returns:
        Whatever the selected scorer returns.

    Raises:
        ValueError: If ``dataset_name`` has no scorer here.
    """
    if dataset_name == "mini_sudoku":
        return sudoku_match_score(prediction, ground_truth)
    if dataset_name == "zebra_puzzles":
        return exact_match_score(prediction, ground_truth)
    # Falling through used to return None, which only failed later (and
    # opaquely) when the caller added the score to a running total.
    raise ValueError(f"No scorer defined for dataset {dataset_name!r}")

# Self-consistency evaluation: repeatedly draw SC_SAMPLE_SIZE distinct runs of
# a method, majority-vote their answers per problem, score the voted answer,
# and report mean and standard deviation of the accuracy over all draws.
SC_SAMPLE_SIZE = 5   # runs combined into one majority vote
SC_NUM_DRAWS = 50    # random draws used to estimate mean / std


def _majority_vote_assignments(answers):
    """Majority-vote over answers written as comma-separated ``key=value`` pairs.

    Each answer is parsed into a mapping (whitespace around keys and values is
    stripped, fragments without ``=`` are ignored). Two answers count as the
    same vote when their mappings have identical keys and values. The winning
    mapping is returned as ``key=value`` pairs joined by commas, in sorted
    key order.
    """
    from collections import Counter

    def parse_assignments(answer):
        return {k.strip(): v.strip() for k, v in [row.split("=")[:2] for row in answer.split(",") if "=" in row]}

    # Sorted item tuples are hashable and order-independent, so they can be counted.
    canonical = [tuple(sorted(parse_assignments(answer).items())) for answer in answers]
    winner, _ = Counter(canonical).most_common(1)[0]
    return ",".join([f"{k}={v}" for k, v in winner])


for dataset_name, range_dir in zip(dataset_names, dataset_range):
    if dataset_name == "cryptarithm":
        majority_vote_function = _majority_vote_assignments
    else:
        majority_vote_function = majority_vote_answers
    print_log.append("-"*100)
    print_log.append(f"{dataset_name}:")
    path = os.path.join(root_path, dataset_name)
    if dataset_name == "gpqa":
        score_function = score_mc
    elif dataset_name in ["cryptarithm", "mini_sudoku", "zebra_puzzles"]:
        score_function = reasoning_gym_score
    else:
        score_function = score_math
    for method in method_names_for_sc:
        method_path = os.path.join(path, method, range_dir)

        try:
            # Every JSON file is one run; files with "score" in the name are skipped.
            rollouts = []
            for file_name in sorted(os.listdir(method_path)):
                if file_name.endswith(".json") and "score" not in file_name:
                    with open(os.path.join(method_path, file_name), "r", encoding="utf-8") as f:
                        rollouts.append(json.load(f))

            if len(rollouts) < SC_SAMPLE_SIZE:
                # random.sample would fail with a generic "Sample larger than
                # population" message; say what is actually missing.
                raise ValueError(
                    f"need at least {SC_SAMPLE_SIZE} runs in {method_path}, found {len(rollouts)}"
                )

            accuracies = []
            for _ in range(SC_NUM_DRAWS):
                # Drawn WITHOUT replacement: a run never votes twice in one draw.
                # The random module is not seeded here, so numbers vary between executions.
                sampled_runs = random.sample(rollouts, SC_SAMPLE_SIZE)
                correct, total = 0, 0
                # Problems are aligned by position across runs; the first
                # sampled run supplies the ground truth.
                for j, reference in enumerate(sampled_runs[0]):
                    gt = reference["groundtruth"]
                    answers = [run[j]["answer"] for run in sampled_runs]
                    final_answer = majority_vote_function(answers)
                    if dataset_name == "gpqa":
                        if score_mc(final_answer, gt) == 1:
                            correct += 1
                    else:
                        correct += score_function(final_answer, gt, dataset_name)
                    total += 1
                accuracies.append(correct/total)
            print_log.append(f"({len(accuracies)}) {method}: & {np.mean(accuracies)*100:.2f}\\scriptsize" + "{$\\pm$" + f"{np.std(accuracies)*100:.2f}" + "}")
        except Exception as e:
            # Best effort: report the failure and keep going with the next method.
            print("Error: ", e)
            print_log.append(f"{method}: No result")

for log in print_log:
    print(log)
