In [1]:
import scipy.stats as stats

# Two small, independent samples used to demonstrate the test
group_A = [85, 90, 88, 75, 95]
group_B = [70, 65, 78, 80, 72]

# Independent two-sample t-test with SciPy's default settings
ttest_result = stats.ttest_ind(group_A, group_B)
t_stat, p_value = ttest_result.statistic, ttest_result.pvalue

# Report the raw statistics
print("t-statistic:", t_stat)
print("p-value:", p_value)

# Compare the p-value against the significance level and state the conclusion
alpha = 0.05
verdict = (
    "There is a significant difference between the group means (reject the null hypothesis)."
    if p_value < alpha
    else "There is no significant difference between the group means (fail to reject the null hypothesis)."
)
print(verdict)

t-statistic: 3.165359635986947
p-value: 0.013287211494533598
There is a significant difference between the group means (reject the null hypothesis).


The W&B API setup (entity, project, and the `runs` collection used below) is redacted for anonymity; it will be pushed upon publication.

In [3]:
import warnings

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from tqdm import tqdm


# Each reference run (first element of every pair) mapped to the runs it is
# compared against, in the order the comparisons should appear in the results.
baselines_by_reference = {
    "blocks.8.hook_resid_pre_12288_topafa_0_0.041666666666666664": (
        "blocks.8.hook_resid_pre_12288_topk_2048_0.0",
        "blocks.8.hook_resid_pre_12288_batchtopk_1024_0.0",
    ),
    "blocks.7.hook_resid_pre_12288_topafa_0_0.015625": (
        "blocks.7.hook_resid_pre_12288_topk_8192_0.0",
        "blocks.7.hook_resid_pre_12288_batchtopk_8192_0.0",
    ),
    "blocks.6.hook_resid_pre_12288_topafa_0_0.03125": (
        "blocks.6.hook_resid_pre_12288_topk_1024_0.0",
        "blocks.6.hook_resid_pre_12288_batchtopk_1024_0.0",
    ),
}

# Flatten into (reference, baseline) tuples; dict insertion order keeps the
# pairs in the same sequence as they are written above.
run_pairs = [
    (reference_name, baseline_name)
    for reference_name, baseline_names in baselines_by_reference.items()
    for baseline_name in baseline_names
]

# Index the fetched runs by name so fetch_l2 can look them up directly.
# NOTE(review): assuming `runs` holds W&B run objects, display names are not
# guaranteed to be unique (only run IDs are) — confirm. A repeated name would
# silently replace the earlier run, so collisions are reported explicitly.
runs_dict = {}
duplicate_run_names = set()
for run in runs:
    if run.name in runs_dict:
        duplicate_run_names.add(run.name)
    runs_dict[run.name] = run  # last occurrence wins, exactly as before

if duplicate_run_names:
    warnings.warn(
        "Multiple runs share a name; the last one listed is used for: "
        f"{sorted(duplicate_run_names)}"
    )

def fetch_l2(run_name, n_last=100):
    """Return the most recent ``l2_loss`` values logged by a run.

    The run is looked up by name in the module-level ``runs_dict``. Its
    history is read with ``scan_history``, ordered by ``_step``, and the last
    ``n_last`` non-missing ``l2_loss`` values are returned.

    Parameters
    ----------
    run_name : str
        Key of the run in ``runs_dict``.
    n_last : int, default 100
        Maximum number of trailing values to return. Fewer are returned when
        the run logged fewer valid values.

    Returns
    -------
    numpy.ndarray
        ``l2_loss`` values in increasing ``_step`` order.

    Raises
    ------
    ValueError
        If ``n_last`` is smaller than 1, the run name is unknown, or the run
        has no history rows for the requested keys.
    """
    if n_last < 1:
        raise ValueError("n_last must be a positive integer.")

    run = runs_dict.get(run_name)
    if run is None:
        raise ValueError(f"Run '{run_name}' not found.")

    # Use scan_history to get all steps
    rows = list(run.scan_history(keys=["l2_loss", "_step"]))
    if not rows:
        # An empty frame has no "_step" column, so sorting would fail with an
        # unhelpful KeyError; report the actual problem instead.
        raise ValueError(f"Run '{run_name}' has no logged 'l2_loss' history.")

    history = pd.DataFrame(rows)

    # Order by step, drop missing losses, keep only the trailing window
    l2_values = history.sort_values(by="_step")["l2_loss"].dropna().tail(n_last).values
    return l2_values

# Fetch every distinct run once. A run may appear in several pairs, and
# fetch_l2 scans the run's whole history, so re-fetching the same run per pair
# only adds wall-clock time without changing any result.
unique_run_names = list(dict.fromkeys(name for pair in run_pairs for name in pair))
l2_by_run = {name: fetch_l2(name) for name in tqdm(unique_run_names)}

# NOTE(review): the samples are consecutive loss values from a single run and
# are presumably autocorrelated, while the t-test assumes independent
# observations — the p-values may be optimistic; confirm before reporting.
# NOTE(review): in the previously saved output the two block-7 comparisons
# (topk_8192 and batchtopk_8192) were identical in every printed digit —
# verify that these names resolve to genuinely different runs.

# Collect results
results = []

for run1, run2 in run_pairs:
    l2_1 = l2_by_run[run1]
    l2_2 = l2_by_run[run2]

    # equal_var=False selects Welch's t-test (no equal-variance assumption)
    t_stat, p_value = ttest_ind(l2_1, l2_2, equal_var=False)
    results.append({
        "Run A": run1,
        "Run B": run2,
        "Mean A": l2_1.mean(),
        "Var A": np.var(l2_1, ddof=1),  # ddof=1: unbiased sample variance
        "Mean B": l2_2.mean(),
        "Var B": np.var(l2_2, ddof=1),
        "t-stat": t_stat,
        "p-value": p_value
    })

# Create result DataFrame
df_results = pd.DataFrame(results)
df_results


100%|██████████| 6/6 [01:14<00:00, 12.46s/it]


Unnamed: 0,Run A,Run B,Mean A,Var A,Mean B,Var B,t-stat,p-value
0,blocks.8.hook_resid_pre_12288_topafa_0_0.04166...,blocks.8.hook_resid_pre_12288_topk_2048_0.0,0.000197,2.243817e-09,0.000228,6.157501e-10,-5.768852,4.435151e-08
1,blocks.8.hook_resid_pre_12288_topafa_0_0.04166...,blocks.8.hook_resid_pre_12288_batchtopk_1024_0.0,0.000197,2.243817e-09,0.000222,1.931904e-09,-3.784619,0.0002042609
2,blocks.7.hook_resid_pre_12288_topafa_0_0.015625,blocks.7.hook_resid_pre_12288_topk_8192_0.0,0.000221,7.918709e-10,0.000239,3.958173e-10,-5.192418,5.62448e-07
3,blocks.7.hook_resid_pre_12288_topafa_0_0.015625,blocks.7.hook_resid_pre_12288_batchtopk_8192_0.0,0.000221,7.918709e-10,0.000239,3.958173e-10,-5.192418,5.62448e-07
4,blocks.6.hook_resid_pre_12288_topafa_0_0.03125,blocks.6.hook_resid_pre_12288_topk_1024_0.0,0.000179,1.96038e-09,0.000199,4.04517e-10,-4.243012,4.018208e-05
5,blocks.6.hook_resid_pre_12288_topafa_0_0.03125,blocks.6.hook_resid_pre_12288_batchtopk_1024_0.0,0.000179,1.96038e-09,0.000202,4.607886e-10,-4.669512,6.885479e-06
