In [1]:
import os
os.chdir("../")

In [2]:
import collections
import json
from pathlib import Path
from types import SimpleNamespace

import click
import numpy as np
import pandas as pd

In [3]:
def main(input_dir, output_file):
    input_dir = Path(input_dir)

    # recursively find all `acc-eval_results.json`
    result_files = list(input_dir.rglob("acc-eval_results.json"))
    if not result_files:
        click.echo("No acc-eval_results.json files found.")
        return

    click.echo(f"Found {len(result_files)} acc-eval_results.json files.")
    all_results = collections.defaultdict(list)
    for file_path in result_files:
        click.echo(f"Parsing {file_path}")
        results = parse_acc_eval_results(file_path)

        for acc_type, acc_data in results.items():
            all_results[acc_type].append(acc_data)

    for acc_type in all_results:
        # Convert lists of dicts to DataFrame
        df = pd.DataFrame(all_results[acc_type])
        # Set the index to file_path
        df = df.set_index("file_path")
        # Sort by index
        df = df.sort_index()
        # add an column for average accuracy
        df["avg"] = df.mean(axis=1)
        # Rename columns to include the accuracy type
        all_results[acc_type] = df
    

    output_file = Path(output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    is_first = True
    for acc_type, output_df in all_results.items():
        mode = "w" if is_first else "a"
        is_first = False

        output_df.to_csv(output_file, sep="\t", mode=mode)
        click.echo(f"Results saved to {output_file}")

    return all_results

def parse_acc_eval_results(file_path):
    """Parse a single acc-eval_results.json file.
    {
    "accuracy_total": {
        "GBaker/MedQA-USMLE-4-options": 0.10385916359163591,
        "openlifescienceai/headqa": 0.08821770334928229,
        "openlifescienceai/medmcqa": 0.11620646593589566
    },
    "accuracy_pass@total": {
        "GBaker/MedQA-USMLE-4-options": 0.6783517835178352,
        "openlifescienceai/headqa": 0.5980861244019139,
        "openlifescienceai/medmcqa": 0.7236020535590398
    },
    "total_num_rollouts": 375296,
    "total_num_correct": 43197,
    "num_samples": 23456
    }
    """
    with open(file_path, "r") as f:
        data = json.load(f)

    parsed_data_dict = {}
    for acc_type, acc_dict in data.items():
        parsed_data = {}
        if not acc_type.startswith("accuracy_"):
            continue
        for dataset_name, acc_value in acc_dict.items():
            key = dataset_name
            parsed_data[key] = acc_value
        parsed_data["file_path"] = str(file_path)
        parsed_data_dict[acc_type] = parsed_data

    return parsed_data_dict


In [4]:
all_results_list = []
all_results_list.append(main("outputs/greedy/v0", "outputs/results-greedy-v0.tsv"))
all_results_list.append(main("outputs/greedy/v1", "outputs/results-greedy-v1.tsv"))
all_results_list.append(main("outputs/greedy/v2", "outputs/results-greedy-v2.tsv"))

Found 31 acc-eval_results.json files.
Parsing outputs/greedy/v0/7b-m23k-checkpoint-4401/acc-eval_results.json
Parsing outputs/greedy/v0/train-qwen2_5_vl_7b-pmc_vqa-m23k_sft_epoch_5-step_800/acc-eval_results.json
Parsing outputs/greedy/v0/train-qwen2_5_vl_3b-pmc_vqa-easy_to_hard-step_300/acc-eval_results.json
Parsing outputs/greedy/v0/7b-m23k-checkpoint-7335/acc-eval_results.json
Parsing outputs/greedy/v0/3b-m23k-checkpoint-4401/acc-eval_results.json
Parsing outputs/greedy/v0/train-qwen2_5_vl_7b-pmc_vqa-m23k_rl-step_900/acc-eval_results.json
Parsing outputs/greedy/v0/train-qwen2_5_vl_7b-pmc_vqa-easy_to_hard-step_200/acc-eval_results.json
Parsing outputs/greedy/v0/train-qwen2_5_vl_3b-pmc_vqa-m23k_rl-step_1805/acc-eval_results.json
Parsing outputs/greedy/v0/Qwen2.5-VL-7B-Instruct/acc-eval_results.json
Parsing outputs/greedy/v0/train-qwen2_5_vl_3b-pmc_vqa-easy_to_hard-step_200/acc-eval_results.json
Parsing outputs/greedy/v0/train-qwen2_5_vl_3b-pmc_vqa-m23k_sft_epoch_3-step_1150/acc-eval_re

In [5]:
all_results_list[0].keys()
(all_results_list[0]["accuracy_total"] == all_results_list[0]["accuracy_pass@total"]).all()

MMMU-medical      True
MedXpertQA-MM     True
pathvqa_closed    True
pmc_vqa           True
slake_closed      True
vqa_rad_closed    True
avg               True
dtype: bool

In [6]:
all_results_list = [i["accuracy_total"] for i in all_results_list]

In [7]:
import re
# remove "outputs/greedy/v*/" prefix and "/acc-eval_results.json" suffix in index
for i in all_results_list:
    i.index = i.index.str.replace(r"outputs/greedy/v\d+/", "", regex=True)
    i.index = i.index.str.replace(r"/acc-eval_results.json", "", regex=True)


In [8]:
stacked = pd.concat(all_results_list, keys=range(len(all_results_list)))
mean_df = stacked.groupby(level=1).mean()
std_df = stacked.groupby(level=1).std()

In [9]:
# display(stacked)
# display(mean_df)
# display(std_df)

In [10]:
# reorder the index
def reorder_index(df):
    index_order = [
    "Qwen2.5-VL-3B-Instruct",
    "3b-m23k-checkpoint-4401",
    "3b-m23k-checkpoint-7335",
    "train-qwen2_5_vl_3b-m23k-step_320",
    "train-qwen2_5_vl_3b-m23k-easy_to_hard-step_320",
    "train-qwen2_5_vl_3b-m23k-easy_to_hard-1_epoch-step_64",
    "train-qwen2_5_vl_3b-pmc_vqa-step_451",
    "train-qwen2_5_vl_3b-pmc_vqa-easy_to_hard-step_100",
    "train-qwen2_5_vl_3b-pmc_vqa-easy_to_hard-step_200",
    "train-qwen2_5_vl_3b-pmc_vqa-easy_to_hard-step_300",
    "train-qwen2_5_vl_3b-pmc_vqa-easy_to_hard-step_400",
    "train-qwen2_5_vl_3b-pmc_vqa-easy_to_hard-step_451",
    "train-qwen2_5_vl_3b-pmc_vqa-m23k_sft_epoch_3-step_1150",
    "train-qwen2_5_vl_3b-pmc_vqa-m23k_sft_epoch_5-step_1400",
    "train-qwen2_5_vl_3b-pmc_vqa-m23k_rl-step_1805",
    "Qwen2.5-VL-7B-Instruct",
    "7b-m23k-checkpoint-4401",
    "7b-m23k-checkpoint-7335",
    "train-qwen2_5_vl_7b-m23k-step_320",
    "train-qwen2_5_vl_7b-m23k-easy_to_hard-step_320",
    "train-qwen2_5_vl_7b-m23k-easy_to_hard-1_epoch-step_64",
    "train-qwen2_5_vl_7b-pmc_vqa-step_451",
    "train-qwen2_5_vl_7b-pmc_vqa-easy_to_hard-step_100",
    "train-qwen2_5_vl_7b-pmc_vqa-easy_to_hard-step_200",
    "train-qwen2_5_vl_7b-pmc_vqa-easy_to_hard-step_300",
    "train-qwen2_5_vl_7b-pmc_vqa-easy_to_hard-step_400",
    "train-qwen2_5_vl_7b-pmc_vqa-easy_to_hard-step_500",
    "train-qwen2_5_vl_7b-pmc_vqa-m23k_sft_epoch_3-step_1805",
    "train-qwen2_5_vl_7b-pmc_vqa-m23k_sft_epoch_5-step_800",
    "train-qwen2_5_vl_7b-pmc_vqa-m23k_rl-step_900",
    ]
    # get unused indices and append them to the end
    unused_indices = df.index.difference(index_order)
    index_order.extend(unused_indices.tolist())
    df = df.reindex(index_order)
    return df

In [11]:
mean_pct = mean_df * 100
std_pct = std_df * 100

# save mean
output_file = Path("outputs/results-greedy-mean.tsv")
reorder_index(mean_pct).to_csv(output_file, sep="\t")
print(f"Mean results saved to {output_file}")


# Format mean and std as strings with 2 decimal places
mean_str = mean_pct.applymap(lambda x: f"{x:.2f}")
std_str = std_pct.applymap(lambda x: f"{x:.2f}")

# Combine as "mean±std"
combined = mean_str + " ± " + std_str
combined = reorder_index(combined)

# save
output_file = Path("outputs/results-greedy-combined.tsv")
combined.to_csv(output_file, sep="\t")
print(f"Results saved to {output_file}")

Mean results saved to outputs/results-greedy-mean.tsv
Results saved to outputs/results-greedy-combined.tsv


  mean_str = mean_pct.applymap(lambda x: f"{x:.2f}")
  std_str = std_pct.applymap(lambda x: f"{x:.2f}")
