In [None]:
import os

# Record the directory the kernel was in the first time this cell ran.
# A bare `os.chdir("../")` is not idempotent: every re-run of the cell would
# climb one more level and break the relative paths used by later cells.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Work from the parent of the starting directory; later cells use paths such
# as "outputs/greedy" relative to it.
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [None]:
import collections
import json
from pathlib import Path
from types import SimpleNamespace

import click
import numpy as np
import pandas as pd
import tqdm

In [None]:
def read_results_jsonl(results_jsonl):
    """Summarise one results JSONL file as per-dataset mean metrics.

    Every non-blank line of the file must be a JSON object containing the
    keys ``dataset_name``, ``average_num_correct``,
    ``majority_at_num_rollouts`` and ``pass_at_num_rollouts``. The three
    metrics are averaged per dataset and multiplied by 100 (they are assumed
    to be stored as fractions, so the result reads as a percentage).

    Args:
        results_jsonl: Path (``str`` or ``Path``) to the JSONL file.

    Returns:
        A DataFrame with one row per dataset and the columns
        ``dataset_name``, ``average_num_correct``,
        ``majority_at_num_rollouts``, ``pass_at_num_rollouts`` and
        ``model_name``. ``model_name`` is the name of the directory that
        contains the file.

    Raises:
        ValueError: If the file contains no records.
        KeyError: If a record lacks one of the required keys.
    """
    results_jsonl = Path(results_jsonl)
    metric_columns = [
        "average_num_correct",
        "majority_at_num_rollouts",
        "pass_at_num_rollouts",
    ]

    ds_metrics_records = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(results_jsonl, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. extra trailing newlines) instead of
            # crashing inside json.loads.
            if not line.strip():
                continue
            data = json.loads(line)

            record = {"dataset_name": data["dataset_name"]}
            for metric in metric_columns:
                record[metric] = data[metric]
            ds_metrics_records.append(record)

    if not ds_metrics_records:
        # An empty frame has no "dataset_name" column, so groupby would fail
        # with an opaque KeyError; report the actual problem instead.
        raise ValueError(f"No records found in {results_jsonl}")

    df = pd.DataFrame(ds_metrics_records)
    df_mean = df.groupby("dataset_name").mean().reset_index()
    # convert the metrics to percentages
    df_mean[metric_columns] = df_mean[metric_columns] * 100
    df_mean["model_name"] = results_jsonl.parent.name
    return df_mean


In [None]:
# Root directory scanned for evaluation results, relative to the current
# working directory.
base_dir = Path("outputs/greedy")
def get_all_results_jsonl(base_dir):
    """Find every ``regraded_eval_results.jsonl`` file below ``base_dir``.

    Args:
        base_dir: Directory to search recursively (``str`` or ``Path``).

    Returns:
        A list of ``Path`` objects, sorted so the order (and therefore any
        "first file" preview) is reproducible across runs and machines.

    Raises:
        ValueError: If no matching file is found.
    """
    base_dir = Path(base_dir)
    # Directory traversal order is filesystem-dependent; sort for determinism.
    results_jsonl_files = sorted(base_dir.glob("**/regraded_eval_results.jsonl"))
    if not results_jsonl_files:
        raise ValueError(f"No regraded_eval_results.jsonl files found in {base_dir}")
    return results_jsonl_files

# Collect every results file under base_dir (raises ValueError if none exist).
results_jsonl_files = get_all_results_jsonl(base_dir)
# Bare last expression: display the first discovered path as a sanity check.
results_jsonl_files[0]

In [None]:
# Load the per-dataset summary of every results file and tag each one with the
# version it belongs to.
df_list = []
for results_jsonl in tqdm.tqdm(results_jsonl_files):
    # Example path:
    #   outputs/greedy/v0/7b-m23k-checkpoint-4401/regraded_eval_results.jsonl
    # The version is the directory two levels above the file ("v0" here).
    model_dir = results_jsonl.parent
    version = model_dir.parent.name

    df = read_results_jsonl(results_jsonl)
    df["version"] = version
    df_list.append(df)

In [None]:
# Stack the per-file summaries into a single long frame.
df_all = pd.concat(df_list, ignore_index=True)

# For each (model_name, dataset_name) pair, summarise average_num_correct over
# all stacked rows of that pair: its mean and its standard deviation.
mean_std_records = []
for (model_key, dataset_key), group_df in df_all.groupby(["model_name", "dataset_name"]):
    scores = group_df["average_num_correct"]
    mean_std_records.append(
        dict(
            model_name=model_key,
            dataset_name=dataset_key,
            mean_average_num_correct=scores.mean(),
            std_average_num_correct=scores.std(),
        )
    )

In [None]:
# One row per (model_name, dataset_name) pair, with the mean and std of
# average_num_correct collected in mean_std_records.
mean_std_df = pd.DataFrame(mean_std_records)
# Bare last expression so the notebook renders the table.
mean_std_df


In [None]:
def reorder_rows(df, rows, append_unseen_rows=False):
    """Reorder the rows of a DataFrame to follow a given list of index labels.

    Args:
        df: DataFrame whose index holds the row names.
        rows: Iterable of index labels in the desired order. It is never
            modified.
        append_unseen_rows: If True, index labels of ``df`` that are not in
            ``rows`` are appended after the requested ones (in the order
            returned by ``Index.difference``). If False, they are dropped.

    Returns:
        A new DataFrame reindexed to the requested order. Labels in ``rows``
        that are missing from ``df`` become rows filled with NaN.
    """
    # Copy first: extending the caller's list in place would make it grow on
    # every call (and on every re-run of a notebook cell).
    ordered_rows = list(rows)
    if append_unseen_rows:
        ordered_rows.extend(df.index.difference(ordered_rows))
    return df.reindex(ordered_rows)


In [None]:
# Row order of the exported table, grouped by model family. Because the table
# is reindexed with exactly this list, models that are not listed are left out
# and listed models without results show up as empty (NaN) rows.
rows = [
    # 3b
    "Qwen2.5-VL-3B-Instruct",
    "3b-m23k-checkpoint-4401",
    "3b-pmc_vqa-checkpoint-12594",
    "train-qwen2_5_vl_3b-pmc_vqa-m23k_sft_epoch_3-step_1150",
    "train-qwen2_5_vl_3b-m23k-step_320",
    "train-qwen2_5_vl_3b-pmc_vqa-step_451",
    "train-qwen2_5_vl_3b-pmc_vqa-m23k_rl-step_1805",
    # 7b
    "Qwen2.5-VL-7B-Instruct",
    "7b-m23k-checkpoint-4401",
    "7b-pmc_vqa-checkpoint-12594",
    "train-qwen2_5_vl_7b-pmc_vqa-m23k_sft_epoch_3-step_1805",
    "train-qwen2_5_vl_7b-m23k-step_320",
    "train-qwen2_5_vl_7b-pmc_vqa-step_451",
    "train-qwen2_5_vl_7b-pmc_vqa-m23k_rl-step_1805",
    # remaining Qwen2.5-VL entries (judging by the names)
    "HuatuoGPT-Vision-7B-Qwen2.5VL",
    "Qwen2.5-VL-32B-Instruct",
    "train-qwen2_5_vl_32b-m23k-step_645",
    # others
    "llava-med-v1.5-mistral-7b-hf",
    "HuatuoGPT-Vision-7B-hf",
    "HuatuoGPT-Vision-34B-hf",
    "medgemma-4b-it",
    "medgemma-27b-it",
    "gemma-3-4b-it",
    "gemma-3-27b-it",
]

# Long -> wide: one row per model, one column per dataset, each cell holding
# that pair's mean_average_num_correct.
mean_df = mean_std_df.pivot_table(
    values="mean_average_num_correct",
    index="model_name",
    columns="dataset_name",
)
mean_df = reorder_rows(mean_df, rows, append_unseen_rows=False)

# Export the ordered table as a tab-separated file.
mean_df_path = "outputs/mean_average_num_correct.tsv"
mean_df.to_csv(mean_df_path, sep="\t")
print(f"Mean average number correct saved to {mean_df_path}")