# The notebook calculates how much time it takes for a PAS pipeline

In [None]:
# Show all columns and don't truncate long strings
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Show all rows 
pd.set_option("display.max_rows", None)

In [42]:
import os
import re
import pandas as pd
import numpy as np

log_dir = "slurm_logs"

# Regex patterns
split_pattern = re.compile(r"with split\s+([0-9/]+)")
elapsed_pattern = re.compile(r"\[Elapsed\]\s+([0-9:.]+)")

records = []

for file in os.listdir(log_dir):
    if file.startswith("timer") and file.endswith(".out"):
        filepath = os.path.join(log_dir, file)
        with open(filepath, "r") as f:
            text = f.read()

        # Find all splits and elapsed times
        splits = split_pattern.findall(text)
        elapsed = elapsed_pattern.findall(text)

        # Pad elapsed list so it matches splits (missing ones → NaN)
        while len(elapsed) < len(splits):
            elapsed.append(np.nan)

        # Combine each experiment
        for s, e in zip(splits, elapsed):
            records.append({
                "filename": file,
                "split": s,
                "elapsed": e
            })

# Build DataFrame
df = pd.DataFrame(records)
df = df.dropna(subset = ['elapsed'])
df

Unnamed: 0,filename,split,elapsed
0,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_10_1334394.out,12/4/800,0:00:26.520580
1,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_10_1334394.out,2400/800/800,0:01:47.531551
2,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_11_1334395.out,12/4/800,0:00:28.070493
3,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_11_1334395.out,2400/800/800,0:01:46.481745
4,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_12_1334396.out,12/4/800,0:00:26.629491
5,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_12_1334396.out,2400/800/800,0:01:49.160280
6,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_13_1334397.out,12/4/800,0:00:26.772897
7,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_13_1334397.out,2400/800/800,0:01:48.794469
8,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_14_1334398.out,12/4/800,0:00:26.408740
9,timerscale_up_DeepSeek-R1-Distill-Llama-8B_seed_14_1334398.out,2400/800/800,0:01:46.653161


In [43]:
import pandas as pd
import numpy as np

# Convert elapsed strings (HH:MM:SS.microseconds) into timedelta
df["elapsed_timedelta"] = pd.to_timedelta(df["elapsed"], errors="coerce")

# Convert to seconds as float
df["elapsed_seconds"] = df["elapsed_timedelta"].dt.total_seconds()

# Example: experiments with '12' in split
mean_fast = df[df['split'].str.contains('12', na=False)]["elapsed_seconds"].mean()
var_fast  = df[df['split'].str.contains('12', na=False)]["elapsed_seconds"].var()

# Experiments without '12'
mean_slow = df[~df['split'].str.contains('12', na=False)]["elapsed_seconds"].mean()
var_slow  = df[~df['split'].str.contains('12', na=False)]["elapsed_seconds"].var()

print("Mean (12/4/800 split):", mean_fast)
print("Variance (12/4/800 split):", var_fast)
print("Mean (2400/800/800 split):", mean_slow)
print("Variance (2400/800/800 split):", var_slow)

Mean (12/4/800 split): 32.805121622222224
Variance (12/4/800 split): 463.53847536602717
Mean (2400/800/800 split): 103.42685363414634
Variance (2400/800/800 split): 15.127384624419985


In [None]:
from pathlib import Path
import re
import pandas as pd

BASE = Path("/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity")

# ---------- helpers ----------
def parse_meta(csv_path: Path):
    rel = csv_path.relative_to(BASE)
    parts = rel.parts

    model = parts[0] if len(parts) >= 1 else None
    rep   = parts[1] if len(parts) >= 2 else None

    # find seed
    seed = None
    for p in parts:
        m = re.match(r"seed[_-](\d+)", p)
        if m:
            seed = int(m.group(1))
            break

    # find sample_size
    sample_size = None
    for p in parts:
        m = re.match(r"max(\d+)", p)
        if m:
            sample_size = int(m.group(1))
            break
    if sample_size is None:
        # if there's no max#### folder, treat as default sample size
        sample_size = 4000

    return model, rep, seed, sample_size


def read_one_csv(csv_path: Path) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    df.columns = [c.strip() for c in df.columns]
    expected = {"Benchmark", "Method", "Layer", "Strength", "Val-Acc", "Test-Acc"}
    missing = expected - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns {missing} in {csv_path}")

    model, rep, seed, sample_size = parse_meta(csv_path)
    df = df.assign(
        model=model,
        rep=rep,
        seed=seed,
        sample_size=sample_size,
        source_csv=str(csv_path),
    )
    for col in ["Layer", "Strength", "Val-Acc", "Test-Acc"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df


# ---------- collect ----------
all_csvs = sorted(BASE.rglob("cross_benchmark_summary.csv"))
if not all_csvs:
    raise FileNotFoundError("No cross_benchmark_summary.csv files found under BASE.")

frames = []
for p in all_csvs:
    try:
        frames.append(read_one_csv(p))
    except Exception as e:
        print(f"[WARN] Skipping {p}: {e}")

all_df = pd.concat(frames, ignore_index=True)
all_df = all_df.dropna(subset=["sample_size"]).drop(columns=["rep"])

# Save all raw rows for debugging
all_df.to_csv("all_rows_with_meta.csv", index=False)

# ---------- best Val-Acc per (model, seed, sample_size, Benchmark, Method) ----------
group_keys = ["model", "seed", "sample_size", "Benchmark", "Method"]
idx = all_df.groupby(group_keys, dropna=False)["Val-Acc"].idxmax()
best_df = all_df.loc[idx].copy()

# ---------- map sample_size sums if needed ----------
split_map = {
    816: 12, 832: 24, 860: 48, 900: 75, 1000: 150, 1200: 300,
    1600: 600, 2400: 1200, 4000: 2400, 
}
def sum_to_first(sample_sum):
    return split_map.get(int(sample_sum), int(sample_sum))

best_df["sample_size"] = best_df["sample_size"].apply(sum_to_first)

# ---------- attach unsteered baseline ----------
baseline = (
    best_df[best_df["Method"].str.lower() == "unsteered"]
    .rename(columns={"Test-Acc": "baseline_Test-Acc",
                     "Val-Acc": "baseline_Val-Acc"})
    [["model","seed","sample_size","Benchmark","baseline_Test-Acc","baseline_Val-Acc"]]
)

best_df = best_df.merge(
    baseline,
    on=["model","seed","sample_size","Benchmark"],
    how="left",
    validate="many_to_one"
)

# ---------- compute improvement ----------
best_df["improve_Test-Acc"] = best_df["Test-Acc"] - best_df["baseline_Test-Acc"]
best_df["improve_Val-Acc"]  = best_df["Val-Acc"] - best_df["baseline_Val-Acc"]

# ---------- tidy order ----------
best_df = best_df[
    ["model","seed","sample_size","Benchmark","Method","Strength",
     "Val-Acc","baseline_Val-Acc","improve_Val-Acc",
     "Test-Acc","baseline_Test-Acc","improve_Test-Acc","source_csv"]
].sort_values(["model","Benchmark","Method","sample_size","seed"])

best_df.head()

Unnamed: 0,model,seed,sample_size,Benchmark,Method,Strength,Val-Acc,baseline_Val-Acc,improve_Val-Acc,Test-Acc,baseline_Test-Acc,improve_Test-Acc,source_csv
312,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,17,12,ETHICS_Commonsense,BAS_full_mcq,16.0,0.833333,0.75,0.083333,0.705882,0.692115,0.013767,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_17/max816/results/tables/cross_benchmark_summary.csv
628,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,18,12,ETHICS_Commonsense,BAS_full_mcq,0.25,0.916667,0.833333,0.083333,0.708385,0.703379,0.005006,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_18/max816/results/tables/cross_benchmark_summary.csv
940,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,19,12,ETHICS_Commonsense,BAS_full_mcq,13.0,0.833333,0.666667,0.166667,0.64831,0.703379,-0.055069,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_19/max816/results/tables/cross_benchmark_summary.csv
1256,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,20,12,ETHICS_Commonsense,BAS_full_mcq,0.5,0.666667,0.5,0.166667,0.639549,0.687109,-0.047559,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_20/max816/results/tables/cross_benchmark_summary.csv
1572,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,21,12,ETHICS_Commonsense,BAS_full_mcq,32.0,0.833333,0.666667,0.166667,0.564456,0.707134,-0.142678,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_21/max816/results/tables/cross_benchmark_summary.csv


In [108]:
best_df.head()

Unnamed: 0,model,seed,sample_size,Benchmark,Method,Strength,Val-Acc,baseline_Val-Acc,improve_Val-Acc,Test-Acc,baseline_Test-Acc,improve_Test-Acc,source_csv
312,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,17,12,ETHICS_Commonsense,BAS_full_mcq,16.0,0.833333,0.75,0.083333,0.705882,0.692115,0.013767,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_17/max816/results/tables/cross_benchmark_summary.csv
628,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,18,12,ETHICS_Commonsense,BAS_full_mcq,0.25,0.916667,0.833333,0.083333,0.708385,0.703379,0.005006,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_18/max816/results/tables/cross_benchmark_summary.csv
940,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,19,12,ETHICS_Commonsense,BAS_full_mcq,13.0,0.833333,0.666667,0.166667,0.64831,0.703379,-0.055069,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_19/max816/results/tables/cross_benchmark_summary.csv
1256,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,20,12,ETHICS_Commonsense,BAS_full_mcq,0.5,0.666667,0.5,0.166667,0.639549,0.687109,-0.047559,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_20/max816/results/tables/cross_benchmark_summary.csv
1572,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,21,12,ETHICS_Commonsense,BAS_full_mcq,32.0,0.833333,0.666667,0.166667,0.564456,0.707134,-0.142678,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_21/max816/results/tables/cross_benchmark_summary.csv


In [107]:
best_df[best_df['sample_size']==2400].head()

Unnamed: 0,model,seed,sample_size,Benchmark,Method,Strength,Val-Acc,baseline_Val-Acc,improve_Val-Acc,Test-Acc,baseline_Test-Acc,improve_Test-Acc,source_csv
9080,deepseek-ai-DeepSeek-R1-Distill-Llama-8B,16,2400,ETHICS_Commonsense,BAS_full_mcq,19.0,0.916667,0.25,0.666667,0.635795,0.505632,0.130163,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/deepseek-ai-DeepSeek-R1-Distill-Llama-8B/residual/seed_16/tables/cross_benchmark_summary.csv
9440,deepseek-ai-DeepSeek-R1-Distill-Llama-8B,17,2400,ETHICS_Commonsense,BAS_full_mcq,0.75,0.75,0.5,0.25,0.633292,0.498123,0.135169,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/deepseek-ai-DeepSeek-R1-Distill-Llama-8B/residual/seed_17/tables/cross_benchmark_summary.csv
9800,deepseek-ai-DeepSeek-R1-Distill-Llama-8B,18,2400,ETHICS_Commonsense,BAS_full_mcq,10.0,0.833333,0.5,0.333333,0.530663,0.573217,-0.042553,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/deepseek-ai-DeepSeek-R1-Distill-Llama-8B/residual/seed_18/tables/cross_benchmark_summary.csv
10160,deepseek-ai-DeepSeek-R1-Distill-Llama-8B,19,2400,ETHICS_Commonsense,BAS_full_mcq,28.0,0.833333,0.583333,0.25,0.543179,0.589487,-0.046308,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/deepseek-ai-DeepSeek-R1-Distill-Llama-8B/residual/seed_19/tables/cross_benchmark_summary.csv
10516,deepseek-ai-DeepSeek-R1-Distill-Llama-8B,20,2400,ETHICS_Commonsense,BAS_full_mcq,32.0,0.75,0.5,0.25,0.523154,0.555695,-0.032541,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/deepseek-ai-DeepSeek-R1-Distill-Llama-8B/residual/seed_20/tables/cross_benchmark_summary.csv


In [109]:
best_df.to_csv("best_valacc_per_group.csv", index=False)

print("Done.")
print("all_rows_with_meta.csv -> every row + metadata")
print("best_valacc_per_group.csv -> 1 row per (model,rep,seed,sample_size,benchmark,method) at best Val-Acc")
print(best_df.head(12))

Done.
all_rows_with_meta.csv -> every row + metadata
best_valacc_per_group.csv -> 1 row per (model,rep,seed,sample_size,benchmark,method) at best Val-Acc
                                          model  seed  sample_size  \
312   NousResearch-Nous-Hermes-2-Mistral-7B-DPO    17           12   
628   NousResearch-Nous-Hermes-2-Mistral-7B-DPO    18           12   
940   NousResearch-Nous-Hermes-2-Mistral-7B-DPO    19           12   
1256  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    20           12   
1572  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    21           12   
1892  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    22           12   
2208  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    23           12   
2516  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    24           12   
2832  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    25           12   
3152  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    26           12   
3464  NousResearch-Nous-Hermes-2-Mistral-7B-DPO    27           12   
3776  

In [110]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import TheilSenRegressor
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# I/O
IN_CSV  = "best_valacc_per_group.csv"   # path to your combined file
OUT_DIR = Path("sample_size_analysis")
FIG_DIR = OUT_DIR / "figures"
OUT_DIR.mkdir(exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [118]:
df = pd.read_csv(IN_CSV)

# normalize column names we use
df = df.rename(columns={
    "Benchmark": "benchmark",
    "Method": "method",
    "improve_Test-Acc": "improve_Test-Acc",
    "Val-Acc": "val_acc",
    "Strength": "strength",
    "sample_size": "sample_size",
    "model": "model",
    "seed": "seed"
})

# keep only what we need
need = ["model","benchmark","method","seed","sample_size","improve_Test-Acc"]
missing = [c for c in need if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

df["sample_size"] = pd.to_numeric(df["sample_size"], errors="coerce")
df["improve_Test-Acc"]    = pd.to_numeric(df["improve_Test-Acc"], errors="coerce")
df = df.dropna(subset=["sample_size","improve_Test-Acc"]).copy()

# Treat seeds as categorical IDs for mixed models
df["seed"] = df["seed"].astype(str)

# Per-doubling regressor
df["log2_size"] = np.log2(df["sample_size"])

df.head()

Unnamed: 0,model,seed,sample_size,benchmark,method,strength,val_acc,baseline_Val-Acc,improve_Val-Acc,Test-Acc,baseline_Test-Acc,improve_Test-Acc,source_csv,log2_size
0,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,17,12,ETHICS_Commonsense,BAS_full_mcq,16.0,0.833333,0.75,0.083333,0.705882,0.692115,0.013767,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_17/max816/results/tables/cross_benchmark_summary.csv,3.584963
1,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,18,12,ETHICS_Commonsense,BAS_full_mcq,0.25,0.916667,0.833333,0.083333,0.708385,0.703379,0.005006,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_18/max816/results/tables/cross_benchmark_summary.csv,3.584963
2,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,19,12,ETHICS_Commonsense,BAS_full_mcq,13.0,0.833333,0.666667,0.166667,0.64831,0.703379,-0.055069,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_19/max816/results/tables/cross_benchmark_summary.csv,3.584963
3,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,20,12,ETHICS_Commonsense,BAS_full_mcq,0.5,0.666667,0.5,0.166667,0.639549,0.687109,-0.047559,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_20/max816/results/tables/cross_benchmark_summary.csv,3.584963
4,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,21,12,ETHICS_Commonsense,BAS_full_mcq,32.0,0.833333,0.666667,0.166667,0.564456,0.707134,-0.142678,/nfs/roberts/project/pi_jss233/zc362/activation_steering/Benchmark_Activation_Steering/output_sample_size_sensitivity/NousResearch-Nous-Hermes-2-Mistral-7B-DPO/residual/seed_21/max816/results/tables/cross_benchmark_summary.csv,3.584963


In [120]:
df['log2_size'].value_counts()

log2_size
7.228819     2400
8.228819     2400
6.228819     2400
5.584963     2388
4.584963     2376
9.228819     2156
3.584963     2136
10.228819    1916
11.228819     568
Name: count, dtype: int64

In [121]:
group_cols = ["model","benchmark","method"]
rows = []

for (m, b, meth), g in df.groupby(group_cols):
    g = g.dropna(subset=["log2_size","improve_Test-Acc"]).copy()
    if g.empty:
        continue

    n_sizes  = g["sample_size"].nunique()
    n_seeds  = g["seed"].nunique()
    n_points = g[["sample_size","seed"]].drop_duplicates().shape[0]

    # Spearman (monotone trend vs size)
    try:
        rho, rho_p = spearmanr(g["sample_size"], g["improve_Test-Acc"])
    except Exception:
        rho, rho_p = np.nan, np.nan

    # Theil–Sen (robust) slope on log2_size
    ts_slope, ts_intercept = np.nan, np.nan
    try:
        ts = TheilSenRegressor(random_state=0)
        X = g[["log2_size"]].values
        y = g["improve_Test-Acc"].values
        ts.fit(X, y)
        ts_slope = float(ts.coef_[0])       # Δ Test-Acc per doubling
        ts_intercept = float(ts.intercept_)
    except Exception:
        pass

    # OLS slope on log2_size (per-doubling effect)
    ols_b, ols_p, ols_ci_lo, ols_ci_hi, r2, aic = [np.nan]*6
    try:
        model_ols = sm.OLS(g["improve_Test-Acc"].values, sm.add_constant(g["log2_size"].values))
        res = model_ols.fit()
        r2  = res.rsquared
        aic = res.aic
        ols_b = float(res.params[1])
        ols_p = float(res.pvalues[1])
        ci_lo, ci_hi = res.conf_int().iloc[1]
        ols_ci_lo, ols_ci_hi = float(ci_lo), float(ci_hi)
    except Exception:
        pass

    # Mixed-effects: random intercepts for seed; fixed slope on log2_size (if possible)
    mx_b, mx_p, mx_se, mx_aic = [np.nan]*4
    if n_seeds >= 2 and n_sizes >= 2:
        try:
            mx = smf.mixedlm("improve_Test-Acc ~ log2_size", g, groups=g["seed"])
            mx_res = mx.fit(method="lbfgs", maxiter=500, reml=True, disp=False)
            if "log2_size" in mx_res.params.index:
                from scipy.stats import norm
                mx_b  = float(mx_res.params["log2_size"])
                mx_se = float(mx_res.bse["log2_size"])
                # Wald z, two-sided p via normal approx
                z = float(mx_res.tvalues["log2_size"])
                mx_p = float(2 * (1 - norm.cdf(abs(z))))
                mx_aic = float(mx_res.aic)
        except Exception:
            pass

    rows.append({
        "model": m, "benchmark": b, "method": meth,
        "n_sample_sizes": n_sizes,
        "n_seeds": n_seeds,
        "n_points": n_points,
        "spearman_rho": rho, "spearman_p": rho_p,
        "theilsen_slope_per_doubling": ts_slope,
        "ols_slope_per_doubling": ols_b,
        "ols_slope_p": ols_p,
        "ols_slope_ci_lo": ols_ci_lo, "ols_slope_ci_hi": ols_ci_hi,
        "ols_r2": r2, "ols_aic": aic,
        "mixed_slope_per_doubling": mx_b,
        "mixed_slope_se": mx_se,
        "mixed_slope_p_approx": mx_p,
        "mixed_aic": mx_aic,
    })

per_group = pd.DataFrame(rows).sort_values(group_cols)
per_group_path = OUT_DIR / "per_group_summary.csv"
per_group.to_csv(per_group_path, index=False)
per_group.head(10)

  rho, rho_p = spearmanr(g["sample_size"], g["improve_Test-Acc"])
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  rho, rho_p = spearmanr(g["sample_size"], g["improve_Test-Acc"])
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  rho, rho_p = spearmanr(g["sample_size"], g["improve_Test-Acc"])
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  rho, rho_p = spearmanr(g["sample_size"], g["improve_Test-Acc"])
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  rho, rho_p = spearmanr(g["sample_size"], g["improve_Test-Acc"])
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  rho, rho_p = spearmanr(g["sample_size"], g["improve_Test-Acc"])
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - n

Unnamed: 0,model,benchmark,method,n_sample_sizes,n_seeds,n_points,spearman_rho,spearman_p,theilsen_slope_per_doubling,ols_slope_per_doubling,ols_slope_p,ols_slope_ci_lo,ols_slope_ci_hi,ols_r2,ols_aic,mixed_slope_per_doubling,mixed_slope_se,mixed_slope_p_approx,mixed_aic
0,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,8,15,118,0.589577,2.154463e-12,0.00654,0.009554,3.806769e-11,,,0.315039,-489.468959,,,,
1,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,Unsteered,8,15,118,,,0.0,0.0,,,,,-inf,,,,
2,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,iBAS_all,8,15,118,0.502922,6.48526e-09,0.004508,0.009966,7.033284e-09,,,0.25191,-442.717012,,,,
3,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,iBAS_wrong_only,8,15,118,0.392143,1.124959e-05,0.003137,0.006112,3.039406e-06,,,0.171929,-501.041057,,,,
4,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Deontology,BAS_full_mcq,8,15,120,0.311843,0.0005259187,0.011437,0.008559,0.0005820008,,,0.095795,-345.229738,,,,
5,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Deontology,Unsteered,8,15,120,,,0.0,0.0,,,,,-inf,,,,
6,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Deontology,iBAS_all,8,15,120,0.191303,0.03634413,0.012784,0.006761,0.01838801,,,0.046206,-307.899658,,,,
7,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Deontology,iBAS_wrong_only,8,15,120,0.229729,0.01160063,0.012747,0.008065,0.007743109,,,0.05857,-295.596399,,,,
8,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Justice,BAS_full_mcq,8,15,118,0.482689,3.106167e-08,0.004867,0.008182,1.861849e-07,,,0.20961,-461.064773,,,,
9,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Justice,Unsteered,8,15,118,,,0.0,0.0,,,,,-inf,,,,


In [122]:
agg = (
    df.groupby(group_cols + ["sample_size"], as_index=False)
      .agg(improve_mean=("improve_Test-Acc","mean"),
           improve_std =("improve_Test-Acc","std"),
           n=("improve_Test-Acc","count"))
)
agg["improve_stderr"] = agg["improve_std"] / np.sqrt(agg["n"].clip(lower=1))

agg_path = OUT_DIR / "per_size_agg.csv"
agg.to_csv(agg_path, index=False)
agg.head(10)

Unnamed: 0,model,benchmark,method,sample_size,improve_mean,improve_std,n,improve_stderr
0,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,12,-0.033696,0.053105,13,0.014729
1,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,24,-0.030038,0.025214,15,0.00651
2,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,48,-0.009583,0.029031,15,0.007496
3,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,75,0.0005,0.035474,15,0.009159
4,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,150,0.011667,0.020915,15,0.0054
5,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,300,0.018917,0.026002,15,0.006714
6,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,600,0.022917,0.020829,15,0.005378
7,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,BAS_full_mcq,1200,0.023363,0.02264,15,0.005846
8,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,Unsteered,12,0.0,0.0,13,0.0
9,NousResearch-Nous-Hermes-2-Mistral-7B-DPO,ETHICS_Commonsense,Unsteered,24,0.0,0.0,15,0.0


In [124]:
agg['sample_size'].value_counts()

sample_size
12      160
24      160
48      160
75      160
150     160
300     160
600     160
1200    160
2400     40
Name: count, dtype: int64

In [125]:
# A tractable global spec: baseline slope + modifiers by model/method/benchmark.
# Random intercepts for seed if possible; else OLS fallback.
formula = "Q('improve_Test-Acc')~ log2_size + C(model)*log2_size + C(method)*log2_size + C(benchmark)*log2_size"

def fit_global(formula, data):
    try:
        mx = smf.mixedlm(formula, data, groups=data["seed"])
        res = mx.fit(method="lbfgs", maxiter=1000, reml=True, disp=False)
        return res, "mixedlm"
    except Exception:
        ols = smf.ols(formula, data=data).fit()
        return ols, "ols"

global_fit, kind = fit_global(formula, df)
print("Global model kind:", kind)

coefs = pd.DataFrame({
    "coef": global_fit.params,
    "se": getattr(global_fit, "bse", pd.Series(index=global_fit.params.index, dtype=float)),
    "t_or_z": getattr(global_fit, "tvalues", pd.Series(index=global_fit.params.index, dtype=float)),
    "p_value": getattr(global_fit, "pvalues", pd.Series(index=global_fit.params.index, dtype=float)),
})
try:
    coefs.attrs["AIC"] = float(global_fit.aic)
except Exception:
    pass

coefs_path = OUT_DIR / "global_mixed_model_coefs.csv"
coefs.to_csv(coefs_path)
coefs.head(20)

Global model kind: mixedlm




Unnamed: 0,coef,se,t_or_z,p_value
Intercept,-5.564527e-14,,,
C(model)[T.TinyLlama-TinyLlama-1.1B-Chat-v1.0],0.01673569,0.005777,2.896873,0.003769022
C(model)[T.deepseek-ai-DeepSeek-R1-Distill-Llama-8B],0.01217753,0.005073,2.400365,0.01637873
C(model)[T.meta-llama-Llama-3.1-8B-Instruct],0.002116286,0.005415,0.390788,0.6959539
C(method)[T.Unsteered],-0.02470571,0.005275,-4.683837,2.815534e-06
C(method)[T.iBAS_all],0.001866255,0.005275,0.353814,0.723478
C(method)[T.iBAS_wrong_only],0.01415688,0.005275,2.683935,0.007276134
C(benchmark)[T.ETHICS_Deontology],0.03904769,0.008352,4.674999,2.939557e-06
C(benchmark)[T.ETHICS_Justice],-0.001362708,0.008254,-0.1651,0.868865
C(benchmark)[T.GenderIdentity],0.00852551,0.008313,1.025515,0.3051201


In [126]:
def plot_panel(a, model, benchmark, method, save_dir=FIG_DIR):
    sub = a.query("model == @model and benchmark == @benchmark and method == @method").copy()
    if sub.empty: 
        return None
    sub = sub.sort_values("sample_size")
    plt.figure()
    plt.errorbar(
        sub["sample_size"], 
        sub["improve_mean"],                # mean improvement
        yerr=sub["improve_stderr"],         # standard error of improvement
        fmt="o-", capsize=3
    )
    plt.xscale("log", base=2)
    plt.xlabel("sample_size (log₂ scale)")
    plt.ylabel("Improve-Test-Acc (mean ± s.e.)")
    plt.title(f"{model} | {benchmark} | {method}")
    plt.grid(True, alpha=0.3)
    fname = f"{model}__{benchmark}__{method}.png".replace("/", "_")
    plt.tight_layout()
    path = save_dir / fname
    plt.savefig(path, dpi=150)
    plt.close()
    return path

top = (
    per_group.dropna(subset=["ols_slope_per_doubling"])
             .assign(abs_slope=lambda x: x["ols_slope_per_doubling"].abs())
             .sort_values("abs_slope", ascending=False)
)
made = []
for _, r in top.head(12).iterrows():
    p = plot_panel(agg, r["model"], r["benchmark"], r["method"])
    if p: made.append(p)

made[:5]

[PosixPath('sample_size_analysis/figures/meta-llama-Llama-3.1-8B-Instruct__RaceXGender__BAS_full_mcq.png'),
 PosixPath('sample_size_analysis/figures/NousResearch-Nous-Hermes-2-Mistral-7B-DPO__RaceXGender__iBAS_all.png'),
 PosixPath('sample_size_analysis/figures/NousResearch-Nous-Hermes-2-Mistral-7B-DPO__RaceEthnicity__BAS_full_mcq.png'),
 PosixPath('sample_size_analysis/figures/TinyLlama-TinyLlama-1.1B-Chat-v1.0__Sycophancy__iBAS_wrong_only.png'),
 PosixPath('sample_size_analysis/figures/NousResearch-Nous-Hermes-2-Mistral-7B-DPO__RaceXGender__iBAS_wrong_only.png')]

In [127]:
plt.figure()
valid = per_group["ols_slope_per_doubling"].dropna()
plt.hist(valid, bins=30)
plt.axvline(valid.mean(), linestyle="--")
plt.title("Δ Test-Acc per doubling of sample_size (OLS slopes)")
plt.xlabel("per-doubling slope")
plt.ylabel("count")
path = FIG_DIR / "slope_distribution.png"
plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()
path


PosixPath('sample_size_analysis/figures/slope_distribution.png')

In [128]:
from IPython.display import display, HTML

display(HTML(f"""
<h3>Artifacts</h3>
<ul>
  <li><code>{per_group_path}</code> — one row per (model × benchmark × method)
    <ul>
      <li><b>ols_slope_per_doubling</b> (+ CI & p): Δ Test-Acc per doubling of sample size</li>
      <li><b>spearman_rho</b>/<b>spearman_p</b>: monotone trend evidence</li>
      <li><b>theilsen_slope_per_doubling</b>: robust median slope</li>
      <li><b>mixed_slope_per_doubling</b>: slope w/ random intercepts for seed (if fit)</li>
    </ul>
  </li>
  <li><code>{agg_path}</code> — per sample_size mean/SD/SE for plotting</li>
  <li><code>{coefs_path}</code> — global model fixed effects (baseline + modifiers)</li>
  <li>Figures in <code>{FIG_DIR}</code></li>
</ul>
<p><i>Interpretation tip:</i> coefficient on <code>log2_size</code> ≈ change in Test-Acc for a 2× increase in sample_size.</p>
"""))
