In [1]:
import pandas as pd

In [2]:
def get_csv_path(model_name, model_type, dataset_name,
                 results_root="/home/zengyuchen/ReJump/results"):
    """Build the path of the ``metric_df.csv`` file for one model/dataset run.

    Args:
        model_name: Model identifier; every ``/`` is replaced by ``-`` to form
            the per-model directory name.
        model_type: Inserted verbatim into the run-directory name
            (e.g. ``"reasoning_api"``).
        dataset_name: ``"math500"`` selects the math500 run layout. Any other
            value falls through to the game24 layout.
        results_root: Directory that contains the per-model result folders.
            Defaults to the location originally hard-coded here; pass another
            directory to read results stored elsewhere. A trailing ``/`` is
            ignored.

    Returns:
        The CSV path as a ``str``. The file is not checked for existence.
    """
    root = str(results_root).rstrip("/")
    model_dir = model_name.replace('/', '-')

    if dataset_name == "math500":
        run_dir = f"math500_0_shot_1_query_{model_type}_reslen_404_nsamples_-1_noise_None_flip_rate_0.0_mode_default"
        vis_dir = "tree_vis_v3"
    else:
        # Everything that is not math500 uses the game24 layout.
        run_dir = f"game24_0_shot_1_query_{model_type}_reslen_404_nsamples_100_noise_None_flip_rate_0.0_mode_default"
        vis_dir = "tree_vis_google/gemini-2.5-pro-preview-03-25"

    # Both layouts share the same tail below the run directory.
    return f"{root}/{model_dir}/{run_dir}/temperature_1.00/replicate_0/global_step_0/{vis_dir}/metric_df.csv"

In [3]:
dataset = "game24"

# (model_name, model_type) pairs whose per-model metric CSVs are combined below.
model_specs = [
    ("deepseek-ai/deepseek-reasoner", "reasoning_api"),
    ("xai/grok-3-mini-beta", "reasoning_api"),
    ("claude/claude-3-7-sonnet-20250219-thinking", "reasoning_api"),
    ("openrouter-qwen/qwq-32b", "reasoning_api"),
    ("openrouter-microsoft/phi-4-reasoning-plus", "standard_api"),
]

# Load one metric table per model; `dfs` keeps them in the order of `model_specs`.
dfs = []
for model_name, model_type in model_specs:
    csv_path = get_csv_path(model_name, model_type, dataset)
    # Append directly so the loop does not bind `df`, which is reserved for
    # the combined frame built below.
    dfs.append(pd.read_csv(csv_path))

# Stack the rows of all models into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each file's own row labels,
# so label-based lookups such as `df.loc[i]` stay unambiguous.
df = pd.concat(dfs, ignore_index=True)


In [4]:
# Metric column name -> display metadata (currently only a human-readable label).
# Insertion order defines the order in which the metrics are processed later on.
metrics = {
    column: {"label": label}
    for column, label in [
        ("forgetting_rates", "Forgetting Rate"),
        ("average_solution_count", "Average Solution Count"),
        ("average_verification_rates", "Average Verification Rate"),
        ("filtered_ajd", "Average Jump Distance"),
        ("success_rates", "Average Success Rate"),
        ("overthinking_rates", "Average Overthinking Rate"),
    ]
}

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import mutual_info_score

# Shannon entropy (in bits) of the empirical distribution of a pandas Series
def entropy(values):
    """Return the plug-in Shannon entropy, in bits, of the values in a Series.

    Args:
        values: A ``pandas.Series`` (anything exposing ``value_counts``). Each
            distinct value is treated as one outcome; elements may be scalars
            or hashable objects such as tuples.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequencies ``p`` of the
        distinct values.
    """
    frequencies = values.value_counts(normalize=True).to_numpy()
    # Keep strictly positive frequencies only, so log2 never sees a zero.
    positive = frequencies[frequencies > 0]
    return -(positive * np.log2(positive)).sum()

# Estimate how redundant each metric is given all the other metrics, using
# plug-in (histogram) entropies of the discretized values:
#     redundancy(M) = 1 - H(M | others) / H(M)
#     H(M | others) = H(M, others) - H(others)
# Discretize each metric column (since entropy for continuous variables requires discretization)
# NOTE(review): the joint of the "other" metrics can occupy up to
# n_bins ** (len(metrics) - 1) cells. With few rows most occupied cells hold a
# single sample, which pushes H(M | others) towards 0 and the redundancy
# towards 1 -- confirm the sample size supports this bin count before
# interpreting the numbers.
n_bins = 10  # You may want to adjust this bin count based on your metric distributions

metrics_list = list(metrics.keys())
metrics_labels = [metrics[m]['label'] for m in metrics_list]
X = df[metrics_list].to_numpy()
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
X_disc = discretizer.fit_transform(X)

# H(M, others) is the joint entropy over *all* metric columns. It does not
# depend on which column is singled out as M (reordering the columns only
# relabels the joint outcomes), so it is computed once instead of once per metric.
joint_tuples = [tuple(row) for row in X_disc]
H_joint = entropy(pd.Series(joint_tuples))

redundancy_results = {}

for idx, metric in enumerate(metrics_list):
    M_disc = X_disc[:, idx]
    # All other metrics, one tuple per row, so they can be treated as a single joint variable
    others_indices = [i for i in range(len(metrics_list)) if i != idx]
    others_disc = X_disc[:, others_indices]
    others_tuples = [tuple(row) for row in others_disc]
    # Marginal entropy H(M)
    H_M = entropy(pd.Series(M_disc))
    # Conditional entropy H(M | others) = H(M, others) - H(others).
    # Mathematically non-negative; clamp so floating-point round-off cannot
    # produce a tiny negative value (and a redundancy slightly above 1).
    H_others = entropy(pd.Series(others_tuples))
    H_M_given_others = max(H_joint - H_others, 0.0)

    # Redundancy ratio; undefined (NaN) when the metric falls into a single bin
    if H_M > 0:
        redundancy_ratio = 1 - (H_M_given_others / H_M)
    else:
        redundancy_ratio = np.nan

    redundancy_results[metric] = {
        "label": metrics[metric]["label"],
        "H(M)": H_M,
        "H(M|others)": H_M_given_others,
        "redundancy": redundancy_ratio,
    }

# Display the results as a DataFrame (one row per metric)
redundancy_df = pd.DataFrame.from_dict(redundancy_results, orient="index")
display(redundancy_df)


Unnamed: 0,label,H(M),H(M|others),redundancy
forgetting_rates,Forgetting Rate,0.866166,0.138955,0.839574
average_solution_count,Average Solution Count,1.994366,0.537093,0.730695
average_verification_rates,Average Verification Rate,2.671699,0.836795,0.686793
filtered_ajd,Average Jump Distance,2.266411,0.542449,0.760657
success_rates,Average Success Rate,2.813175,0.332449,0.881824
overthinking_rates,Average Overthinking Rate,2.387393,0.381327,0.840275
