In [1]:
import pandas as pd
import numpy as np

In [2]:
# Active experiment configuration: the model whose results are analysed and the
# API variant that appears in that model's results-directory name.
model_name = "openrouter-qwen-qwq-32b"
api_name = "reasoning_api"
# Alternative configuration (swap with the two assignments above to analyse it):
# model_name = "openrouter-microsoft-phi-4-reasoning-plus"
# api_name = "standard_api"

import os

# Metric columns retained from each metric_df.csv; safe_read_csv drops every
# other column when it loads a file.
# NOTE(review): the original note described these as "columns (1-6) from
# gemini_acc.ipynb" — confirm the selection/order against that notebook.
METRICS_COLS = [
    "average_solution_count",
    "filtered_ajd",
    "success_rates",
    "average_verification_rates",
    "overthinking_rates",
    "forgetting_rates",
]

def safe_read_csv(path):
    """Load the metric columns of a CSV file, tolerating a missing file.

    Args:
        path: Filesystem path of the CSV file to read.

    Returns:
        A DataFrame restricted to those METRICS_COLS entries that are present
        in the file (in METRICS_COLS order), or None when `path` does not
        exist. A missing file is reported on stdout rather than raised.
    """
    # Guard clause: report and skip a missing file instead of failing the run.
    if not os.path.exists(path):
        print(f"File not found, skipping: {path}")
        return None

    frame = pd.read_csv(path)
    # Keep only the metrics this file actually provides.
    wanted = []
    for name in METRICS_COLS:
        if name in frame.columns:
            wanted.append(name)
    return frame[wanted]

def get_results(model_name, api_name, results_root="/home/zengyuchen/ReJump/results"):
    """Load the metric tables for one model, grouped by source of variation.

    Five metric_df.csv files are looked up under the same run directory; they
    differ only in the suffix of the final "gemini-2.5-pro-preview-03-25*"
    directory (none, "_1", "_2", "_shuffle", "_rephrase").

    Args:
        model_name: Model directory name under `results_root`.
        api_name: API identifier embedded in the run-directory name.
        results_root: Directory that contains the per-model result folders.
            Defaults to the location the notebook was originally run against.

    Returns:
        dict with two keys, each mapping to a list of DataFrames:
            "seed":   the un-suffixed run plus the "_1" and "_2" runs.
            "prompt": the un-suffixed run plus the "_shuffle" and "_rephrase" runs.
        Files that do not exist are skipped (safe_read_csv returns None for
        them), so either list may contain fewer than three frames.
    """
    # Everything up to the final directory is shared by all five files.
    run_dir = (
        f"{results_root}/{model_name}/"
        f"game24_0_shot_1_query_{api_name}_reslen_404_nsamples_100_noise_None_flip_rate_0.0_mode_default/"
        "temperature_1.00/replicate_0/global_step_0/tree_vis_google"
    )
    judge_dir = "gemini-2.5-pro-preview-03-25"
    # Result key -> suffix appended to the final directory name.
    variant_suffix = {
        "default_0": "",
        "default_1": "_1",
        "default_2": "_2",
        "shuffle": "_shuffle",
        "rephrase": "_rephrase",
    }
    paths = {
        key: f"{run_dir}/{judge_dir}{suffix}/metric_df.csv"
        for key, suffix in variant_suffix.items()
    }
    dfs = {key: safe_read_csv(path) for key, path in paths.items()}

    def _loaded(keys):
        # Drop the variants whose file was missing.
        return [dfs[key] for key in keys if dfs[key] is not None]

    return {
        "seed": _loaded(["default_0", "default_1", "default_2"]),
        "prompt": _loaded(["default_0", "shuffle", "rephrase"]),
    }

def average_std_across_rows(dfs, metric_cols=None):
    """Summarise how much each metric varies across repeated runs.

    The frames are aligned on the row labels they all share and on the metric
    columns they all contain. For every remaining (row, column) cell the
    standard deviation across the frames is computed (NumPy's default,
    ddof=0); those per-cell values are then reduced per column.

    Args:
        dfs: List of DataFrames, one per run.
        metric_cols: Column names to consider. Defaults to the module-level
            METRICS_COLS when omitted.

    Returns:
        pd.DataFrame indexed by metric column with two float columns:
            "avg_std":    mean of the per-cell standard deviations.
            "std_of_std": standard deviation of the per-cell standard deviations.
        When `dfs` is empty, or the frames share no row labels, an empty
        DataFrame with those same two columns is returned so callers can
        index it by column name on every path.
    """
    # Same shape on every exit path: callers do result["avg_std"].
    empty_result = pd.DataFrame(columns=["avg_std", "std_of_std"], dtype=float)
    if not dfs:
        return empty_result

    if metric_cols is None:
        metric_cols = METRICS_COLS
    # Keep only metrics present in *every* frame; selecting a column that is
    # missing from a later frame would raise KeyError during alignment.
    cols = [c for c in metric_cols if all(c in df.columns for df in dfs)]

    # Row labels shared by all frames.
    common_index = dfs[0].index
    for df in dfs[1:]:
        common_index = common_index.intersection(df.index)
    if len(common_index) == 0:
        print("No overlapping indices across all dataframes.")
        return empty_result

    # Stack to shape (n_runs, n_rows, n_cols) so axis 0 runs across the frames.
    aligned_dfs = [df.loc[common_index, cols].astype(float) for df in dfs]
    arr = np.stack([df.to_numpy() for df in aligned_dfs])
    per_cell_std = np.std(arr, axis=0)

    return pd.DataFrame({
        "avg_std": per_cell_std.mean(axis=0),
        "std_of_std": per_cell_std.std(axis=0),
    }, index=cols)



In [3]:
# Load both groups of runs for the configured model and summarise their spread.
results = get_results(model_name, api_name)
prompt_result = average_std_across_rows(results["prompt"])
seed_result = average_std_across_rows(results["seed"])

# For each summary statistic, print the prompt-variation / seed-variation ratio
# per metric (one line each: separator, statistic name, ratio Series).
for metric in ("avg_std", "std_of_std"):
    print(
        "--------------------------------",
        metric,
        prompt_result[metric] / seed_result[metric],
        sep="\n",
    )

--------------------------------
avg_std
average_solution_count        1.451181
filtered_ajd                  1.445590
success_rates                 1.342598
average_verification_rates    1.395387
overthinking_rates            1.207379
forgetting_rates              1.651613
Name: avg_std, dtype: float64
--------------------------------
std_of_std
average_solution_count        0.929559
filtered_ajd                  1.039391
success_rates                 1.083406
average_verification_rates    0.907450
overthinking_rates            0.950614
forgetting_rates              0.985263
Name: std_of_std, dtype: float64
