# ICA decoding — Part 2 (Modeling + validation)
This mirrors your PCA Part 2, using the ICA feature files created in Part 1.

It prints results for:
- **L1-only**
- **Ln-only**
- **Delta-only (Ln−L1)**
- **Full (L1+Ln+Delta)**


In [None]:

# =========================
# PART 2 — ICA MODELING + VALIDATION
# Mirrors your PCA Part 2 exactly, but uses ICA feature files (IC-based names).
# =========================
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    LeaveOneOut,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_predict,
    permutation_test_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

# --- Input locations ---------------------------------------------------------
# ROOT must point at the folder that Part 1 wrote its feature CSVs into.
ROOT = Path(r"/Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA")  # <-- SAME AS PART 1
LABELS_CSV = ROOT / "proficiency_labels.csv"
# K only appears in the feature file names below
# (presumably the number of ICs used in Part 1 -- TODO confirm).
K = 20

FEAT_STATIC = ROOT / f"features_static_nonZ_K{K}.csv"
FEAT_LN_CONN = ROOT / f"features_ln_conn_pearsonZ_K{K}.csv"   # used in Part 3
FEAT_DELTA_CONN = ROOT / f"features_delta_conn_pearsonZ_K{K}.csv"

# --- Labels ------------------------------------------------------------------
# The labels CSV must provide "subject" and "group" columns.
labels = pd.read_csv(LABELS_CSV).set_index("subject")
labels["group"] = labels["group"].str.lower().str.strip()
# Binary target: 1 for "advanced", 0 for every other group value.
# NOTE(review): a misspelled or missing group is silently coded as 0.
labels["y"] = (labels["group"] == "advanced").astype(int)

# --- Features ----------------------------------------------------------------
Xstatic = pd.read_csv(FEAT_STATIC).set_index("subject")

# The inner join keeps only subjects present in BOTH the features and labels.
df = Xstatic.join(labels[["y"]], how="inner")
y = df["y"].to_numpy()

# Feature subsets are selected purely by column-name prefix.
X_L1    = df.filter(regex=r"^L1_").copy()
X_Ln    = df.filter(regex=r"^Ln_").copy()
X_Delta = df.filter(regex=r"^DELTA_").copy()
X_Full  = df.drop(columns=["y"]).copy()  # every feature column, whatever its prefix

# Report class counts for the ALIGNED subjects only. Counting from `labels`
# directly would include subjects that the inner join dropped, so the counts
# could disagree with "N aligned". The same join is repeated on "group" so the
# counted rows match the rows of `df` exactly.
aligned_groups = Xstatic.join(labels[["group"]], how="inner")["group"]
print("N aligned:", df.shape[0], "class counts:", aligned_groups.value_counts().to_dict())
print("Shapes:", "L1", X_L1.shape, "Ln", X_Ln.shape, "Delta", X_Delta.shape, "Full", X_Full.shape)

def fixed_pipeline(C=1.0, penalty="l2"):
    """Return a fresh, unfitted standardize-then-classify pipeline.

    Parameters
    ----------
    C : float
        Forwarded to ``LogisticRegression(C=...)``.
    penalty : str
        Forwarded to ``LogisticRegression(penalty=...)``.

    Returns
    -------
    Pipeline
        Two steps: ``"scaler"`` (``StandardScaler``) followed by ``"clf"``
        (``LogisticRegression`` with the liblinear solver, ``max_iter=5000``).
    """
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        solver="liblinear",
        max_iter=5000,
    )
    steps = [
        ("scaler", StandardScaler()),
        ("clf", classifier),
    ]
    return Pipeline(steps)

def loocv_auc_with_bootstrap_ci(Xdf, y, pipe, n_boot=5000, seed=0):
    """Leave-one-out AUC with a percentile-bootstrap 95% interval.

    Out-of-fold probabilities are produced once with leave-one-out
    cross-validation; the AUC is computed on those pooled predictions. The
    interval comes from resampling (label, prediction) pairs with
    replacement -- the model is NOT refit for each resample.

    Parameters
    ----------
    Xdf : feature table handed to ``cross_val_predict``.
    y : array-like of class labels; column 1 of ``predict_proba`` is used as
        the score, so a binary target is assumed.
    pipe : estimator exposing ``predict_proba``.
    n_boot : int
        Number of bootstrap resamples attempted.
    seed : int
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    (auc, (ci_lo, ci_hi)) as plain floats. The interval is ``(nan, nan)``
    when no resample contained both classes (or ``n_boot`` is 0).
    """
    # Force positional indexing: a pandas Series would otherwise be indexed by
    # label in ``y[samp]`` below.
    y = np.asarray(y)

    prob = cross_val_predict(pipe, Xdf, y, cv=LeaveOneOut(), method="predict_proba")[:, 1]
    auc = roc_auc_score(y, prob)

    rng = np.random.default_rng(seed)
    idx = np.arange(len(y))
    boots = []
    for _ in range(n_boot):
        samp = rng.choice(idx, size=len(idx), replace=True)
        y_boot = y[samp]
        # AUC is undefined when a resample holds a single class; skip it.
        if len(np.unique(y_boot)) < 2:
            continue
        boots.append(roc_auc_score(y_boot, prob[samp]))

    # Guard the degenerate case instead of calling np.quantile on an empty array.
    if not boots:
        return float(auc), (float("nan"), float("nan"))

    ci_lo, ci_hi = np.quantile(np.asarray(boots), [0.025, 0.975])
    return float(auc), (float(ci_lo), float(ci_hi))

def repeated_cv_auc_dist(Xdf, y, pipe, n_splits=5, n_repeats=200, seed=42):
    """Collect per-fold AUC and balanced accuracy over repeated stratified K-fold.

    ``pipe`` is refit in place on every training fold. Hard predictions for
    balanced accuracy threshold the class-1 probability at 0.5.

    Returns
    -------
    (aucs, bals) : two 1-D arrays with one entry per fold
        (``n_splits * n_repeats`` entries each).
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=seed
    )
    fold_auc = []
    fold_bal = []
    for train_idx, test_idx in splitter.split(Xdf, y):
        pipe.fit(Xdf.iloc[train_idx], y[train_idx])
        scores = pipe.predict_proba(Xdf.iloc[test_idx])[:, 1]
        hard_labels = (scores >= 0.5).astype(int)
        y_test = y[test_idx]
        fold_auc.append(roc_auc_score(y_test, scores))
        fold_bal.append(balanced_accuracy_score(y_test, hard_labels))
    return np.array(fold_auc), np.array(fold_bal)

def minimal_nested_tuning(Xdf, y,
                         C_grid=(0.001, 0.01, 0.1, 1, 10, 100, 1000),
                         penalty_grid=("l2", "l1"),
                         outer_splits=5, outer_repeats=100, inner_splits=5, seed=42):
    """Nested cross-validation that tunes only ``C`` and ``penalty``.

    For each outer training fold, every (penalty, C) pair is scored by its
    mean inner-CV AUC. The best pair (the first one wins on ties) is refit on
    the whole outer training fold and scored on the outer test fold.

    Returns
    -------
    (mean_outer_auc, sd_outer_auc, counts) where ``counts`` is a ``Counter``
    of how often each (penalty, C) pair was selected.
    """
    from collections import Counter

    outer_cv = RepeatedStratifiedKFold(
        n_splits=outer_splits, n_repeats=outer_repeats, random_state=seed
    )
    inner_cv = StratifiedKFold(n_splits=inner_splits, shuffle=True, random_state=seed)

    def inner_mean_auc(model, X_fit, y_fit):
        # Mean AUC of `model` across the inner folds of one outer training set.
        fold_scores = []
        for fit_idx, val_idx in inner_cv.split(X_fit, y_fit):
            model.fit(X_fit.iloc[fit_idx], y_fit[fit_idx])
            val_prob = model.predict_proba(X_fit.iloc[val_idx])[:, 1]
            fold_scores.append(roc_auc_score(y_fit[val_idx], val_prob))
        return float(np.mean(fold_scores))

    selected = []
    test_aucs = []

    for train_idx, test_idx in outer_cv.split(Xdf, y):
        X_train, X_test = Xdf.iloc[train_idx], Xdf.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        top_score = -np.inf
        top_hp = None
        for pen in penalty_grid:
            for C in C_grid:
                candidate = fixed_pipeline(C=C, penalty=pen)
                score = inner_mean_auc(candidate, X_train, y_train)
                # Strict ">" keeps the earliest grid entry when scores tie.
                if score > top_score:
                    top_score, top_hp = score, (pen, C)

        # Refit the winning configuration on the full outer training fold.
        pen, C = top_hp
        final_model = fixed_pipeline(C=C, penalty=pen)
        final_model.fit(X_train, y_train)
        test_prob = final_model.predict_proba(X_test)[:, 1]
        test_aucs.append(float(roc_auc_score(y_test, test_prob)))
        selected.append(top_hp)

    return float(np.mean(test_aucs)), float(np.std(test_aucs)), Counter(selected)

def fast_permutation_pvalue(Xdf, y, pipe, n_perm=2000, seed=0):
    """Label-permutation test of the 5-fold ROC-AUC.

    Wraps ``permutation_test_score`` with a shuffled ``StratifiedKFold``; the
    same ``seed`` drives both the fold assignment and the permutations.

    Returns
    -------
    (score, pvalue) as plain floats; the per-permutation scores are discarded.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    settings = dict(
        scoring="roc_auc",
        cv=folds,
        n_permutations=n_perm,
        n_jobs=-1,
        random_state=seed,
    )
    observed, _, p_value = permutation_test_score(pipe, Xdf, y, **settings)
    return float(observed), float(p_value)

def run_bundle(name, Xdf):
    """Run and print the full validation battery for one feature set.

    Uses the module-level target vector ``y``. In order: LOOCV AUC with a
    bootstrap CI, repeated 5-fold CV (AUC and balanced accuracy), nested
    tuning of C/penalty, and a permutation test. Returns nothing; results are
    printed.
    """
    print("\n======", name, "======")
    model = fixed_pipeline(C=1.0, penalty="l2")

    loo_auc, loo_ci = loocv_auc_with_bootstrap_ci(Xdf, y, model, n_boot=5000, seed=0)
    print("LOOCV AUC:", loo_auc, "Bootstrap 95% CI:", loo_ci)

    rep_auc, rep_bal = repeated_cv_auc_dist(Xdf, y, model, n_splits=5, n_repeats=200, seed=42)
    for tag, values in (("RepCV AUC mean±sd:", rep_auc), ("RepCV BAL mean±sd:", rep_bal)):
        print(tag, float(values.mean()), float(values.std()))

    nested_mean, nested_sd, hp_counts = minimal_nested_tuning(Xdf, y, outer_repeats=50)
    print("Nested (tune C/penalty only) outer AUC mean±sd:", nested_mean, nested_sd)
    print("Top hyperparams:", hp_counts.most_common(5))

    perm_score, perm_p = fast_permutation_pvalue(Xdf, y, model, n_perm=2000, seed=0)
    print("Permutation (5-fold) AUC score:", perm_score, "p-value:", perm_p)

# Run models: the same validation battery on each feature set, in this order.
for _bundle_name, _bundle_X in (
    ("L1-only (negative control)", X_L1),
    ("Ln-only", X_Ln),
    ("Delta-only", X_Delta),
    ("FULL (L1+Ln+Delta)", X_Full),
):
    run_bundle(_bundle_name, _bundle_X)


In [None]:

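# =========================
# PART 2 — ICA MODELING + VALIDATION
# Mirrors your PCA Part 2 exactly, but uses ICA feature files (IC-based names).
# NOTE(review): this cell appears to repeat the first Part 2 cell above; re-running it
# recomputes the same results, so consider keeping a single copy.
# =========================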
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    LeaveOneOut,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_predict,
    permutation_test_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

# --- Input locations ---------------------------------------------------------
# ROOT must point at the folder that Part 1 wrote its feature CSVs into.
ROOT = Path(r"/Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA")  # <-- SAME AS PART 1
LABELS_CSV = ROOT / "proficiency_labels.csv"
# K only appears in the feature file names below
# (presumably the number of ICs used in Part 1 -- TODO confirm).
K = 20

FEAT_STATIC = ROOT / f"features_static_nonZ_K{K}.csv"
FEAT_LN_CONN = ROOT / f"features_ln_conn_pearsonZ_K{K}.csv"   # used in Part 3
FEAT_DELTA_CONN = ROOT / f"features_delta_conn_pearsonZ_K{K}.csv"

# --- Labels ------------------------------------------------------------------
# The labels CSV must provide "subject" and "group" columns.
labels = pd.read_csv(LABELS_CSV).set_index("subject")
labels["group"] = labels["group"].str.lower().str.strip()
# Binary target: 1 for "advanced", 0 for every other group value.
# NOTE(review): a misspelled or missing group is silently coded as 0.
labels["y"] = (labels["group"] == "advanced").astype(int)

# --- Features ----------------------------------------------------------------
Xstatic = pd.read_csv(FEAT_STATIC).set_index("subject")

# The inner join keeps only subjects present in BOTH the features and labels.
df = Xstatic.join(labels[["y"]], how="inner")
y = df["y"].to_numpy()

# Feature subsets are selected purely by column-name prefix.
X_L1    = df.filter(regex=r"^L1_").copy()
X_Ln    = df.filter(regex=r"^Ln_").copy()
X_Delta = df.filter(regex=r"^DELTA_").copy()
X_Full  = df.drop(columns=["y"]).copy()  # every feature column, whatever its prefix

# Report class counts for the ALIGNED subjects only. Counting from `labels`
# directly would include subjects that the inner join dropped, so the counts
# could disagree with "N aligned". The same join is repeated on "group" so the
# counted rows match the rows of `df` exactly.
aligned_groups = Xstatic.join(labels[["group"]], how="inner")["group"]
print("N aligned:", df.shape[0], "class counts:", aligned_groups.value_counts().to_dict())
print("Shapes:", "L1", X_L1.shape, "Ln", X_Ln.shape, "Delta", X_Delta.shape, "Full", X_Full.shape)

def fixed_pipeline(C=1.0, penalty="l2"):
    """Return a fresh, unfitted standardize-then-classify pipeline.

    Parameters
    ----------
    C : float
        Forwarded to ``LogisticRegression(C=...)``.
    penalty : str
        Forwarded to ``LogisticRegression(penalty=...)``.

    Returns
    -------
    Pipeline
        Two steps: ``"scaler"`` (``StandardScaler``) followed by ``"clf"``
        (``LogisticRegression`` with the liblinear solver, ``max_iter=5000``).
    """
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        solver="liblinear",
        max_iter=5000,
    )
    steps = [
        ("scaler", StandardScaler()),
        ("clf", classifier),
    ]
    return Pipeline(steps)

def loocv_auc_with_bootstrap_ci(Xdf, y, pipe, n_boot=5000, seed=0):
    """Leave-one-out AUC with a percentile-bootstrap 95% interval.

    Out-of-fold probabilities are produced once with leave-one-out
    cross-validation; the AUC is computed on those pooled predictions. The
    interval comes from resampling (label, prediction) pairs with
    replacement -- the model is NOT refit for each resample.

    Parameters
    ----------
    Xdf : feature table handed to ``cross_val_predict``.
    y : array-like of class labels; column 1 of ``predict_proba`` is used as
        the score, so a binary target is assumed.
    pipe : estimator exposing ``predict_proba``.
    n_boot : int
        Number of bootstrap resamples attempted.
    seed : int
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    (auc, (ci_lo, ci_hi)) as plain floats. The interval is ``(nan, nan)``
    when no resample contained both classes (or ``n_boot`` is 0).
    """
    # Force positional indexing: a pandas Series would otherwise be indexed by
    # label in ``y[samp]`` below.
    y = np.asarray(y)

    prob = cross_val_predict(pipe, Xdf, y, cv=LeaveOneOut(), method="predict_proba")[:, 1]
    auc = roc_auc_score(y, prob)

    rng = np.random.default_rng(seed)
    idx = np.arange(len(y))
    boots = []
    for _ in range(n_boot):
        samp = rng.choice(idx, size=len(idx), replace=True)
        y_boot = y[samp]
        # AUC is undefined when a resample holds a single class; skip it.
        if len(np.unique(y_boot)) < 2:
            continue
        boots.append(roc_auc_score(y_boot, prob[samp]))

    # Guard the degenerate case instead of calling np.quantile on an empty array.
    if not boots:
        return float(auc), (float("nan"), float("nan"))

    ci_lo, ci_hi = np.quantile(np.asarray(boots), [0.025, 0.975])
    return float(auc), (float(ci_lo), float(ci_hi))

def repeated_cv_auc_dist(Xdf, y, pipe, n_splits=5, n_repeats=200, seed=42):
    """Collect per-fold AUC and balanced accuracy over repeated stratified K-fold.

    ``pipe`` is refit in place on every training fold. Hard predictions for
    balanced accuracy threshold the class-1 probability at 0.5.

    Returns
    -------
    (aucs, bals) : two 1-D arrays with one entry per fold
        (``n_splits * n_repeats`` entries each).
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=seed
    )
    fold_auc = []
    fold_bal = []
    for train_idx, test_idx in splitter.split(Xdf, y):
        pipe.fit(Xdf.iloc[train_idx], y[train_idx])
        scores = pipe.predict_proba(Xdf.iloc[test_idx])[:, 1]
        hard_labels = (scores >= 0.5).astype(int)
        y_test = y[test_idx]
        fold_auc.append(roc_auc_score(y_test, scores))
        fold_bal.append(balanced_accuracy_score(y_test, hard_labels))
    return np.array(fold_auc), np.array(fold_bal)

def minimal_nested_tuning(Xdf, y,
                         C_grid=(0.001, 0.01, 0.1, 1, 10, 100, 1000),
                         penalty_grid=("l2", "l1"),
                         outer_splits=5, outer_repeats=100, inner_splits=5, seed=42):
    """Nested cross-validation that tunes only ``C`` and ``penalty``.

    For each outer training fold, every (penalty, C) pair is scored by its
    mean inner-CV AUC. The best pair (the first one wins on ties) is refit on
    the whole outer training fold and scored on the outer test fold.

    Returns
    -------
    (mean_outer_auc, sd_outer_auc, counts) where ``counts`` is a ``Counter``
    of how often each (penalty, C) pair was selected.
    """
    from collections import Counter

    outer_cv = RepeatedStratifiedKFold(
        n_splits=outer_splits, n_repeats=outer_repeats, random_state=seed
    )
    inner_cv = StratifiedKFold(n_splits=inner_splits, shuffle=True, random_state=seed)

    def inner_mean_auc(model, X_fit, y_fit):
        # Mean AUC of `model` across the inner folds of one outer training set.
        fold_scores = []
        for fit_idx, val_idx in inner_cv.split(X_fit, y_fit):
            model.fit(X_fit.iloc[fit_idx], y_fit[fit_idx])
            val_prob = model.predict_proba(X_fit.iloc[val_idx])[:, 1]
            fold_scores.append(roc_auc_score(y_fit[val_idx], val_prob))
        return float(np.mean(fold_scores))

    selected = []
    test_aucs = []

    for train_idx, test_idx in outer_cv.split(Xdf, y):
        X_train, X_test = Xdf.iloc[train_idx], Xdf.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        top_score = -np.inf
        top_hp = None
        for pen in penalty_grid:
            for C in C_grid:
                candidate = fixed_pipeline(C=C, penalty=pen)
                score = inner_mean_auc(candidate, X_train, y_train)
                # Strict ">" keeps the earliest grid entry when scores tie.
                if score > top_score:
                    top_score, top_hp = score, (pen, C)

        # Refit the winning configuration on the full outer training fold.
        pen, C = top_hp
        final_model = fixed_pipeline(C=C, penalty=pen)
        final_model.fit(X_train, y_train)
        test_prob = final_model.predict_proba(X_test)[:, 1]
        test_aucs.append(float(roc_auc_score(y_test, test_prob)))
        selected.append(top_hp)

    return float(np.mean(test_aucs)), float(np.std(test_aucs)), Counter(selected)

def fast_permutation_pvalue(Xdf, y, pipe, n_perm=2000, seed=0):
    """Label-permutation test of the 5-fold ROC-AUC.

    Wraps ``permutation_test_score`` with a shuffled ``StratifiedKFold``; the
    same ``seed`` drives both the fold assignment and the permutations.

    Returns
    -------
    (score, pvalue) as plain floats; the per-permutation scores are discarded.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    settings = dict(
        scoring="roc_auc",
        cv=folds,
        n_permutations=n_perm,
        n_jobs=-1,
        random_state=seed,
    )
    observed, _, p_value = permutation_test_score(pipe, Xdf, y, **settings)
    return float(observed), float(p_value)

def run_bundle(name, Xdf):
    """Run and print the full validation battery for one feature set.

    Uses the module-level target vector ``y``. In order: LOOCV AUC with a
    bootstrap CI, repeated 5-fold CV (AUC and balanced accuracy), nested
    tuning of C/penalty, and a permutation test. Returns nothing; results are
    printed.
    """
    print("\n======", name, "======")
    model = fixed_pipeline(C=1.0, penalty="l2")

    loo_auc, loo_ci = loocv_auc_with_bootstrap_ci(Xdf, y, model, n_boot=5000, seed=0)
    print("LOOCV AUC:", loo_auc, "Bootstrap 95% CI:", loo_ci)

    rep_auc, rep_bal = repeated_cv_auc_dist(Xdf, y, model, n_splits=5, n_repeats=200, seed=42)
    for tag, values in (("RepCV AUC mean±sd:", rep_auc), ("RepCV BAL mean±sd:", rep_bal)):
        print(tag, float(values.mean()), float(values.std()))

    nested_mean, nested_sd, hp_counts = minimal_nested_tuning(Xdf, y, outer_repeats=50)
    print("Nested (tune C/penalty only) outer AUC mean±sd:", nested_mean, nested_sd)
    print("Top hyperparams:", hp_counts.most_common(5))

    perm_score, perm_p = fast_permutation_pvalue(Xdf, y, model, n_perm=2000, seed=0)
    print("Permutation (5-fold) AUC score:", perm_score, "p-value:", perm_p)

# Run models: the same validation battery on each feature set, in this order.
for _bundle_name, _bundle_X in (
    ("L1-only (negative control)", X_L1),
    ("Ln-only", X_Ln),
    ("Delta-only", X_Delta),
    ("FULL (L1+Ln+Delta)", X_Full),
):
    run_bundle(_bundle_name, _bundle_X)


In [None]:

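# =========================
# PART 2 — ICA MODELING + VALIDATION
# Mirrors your PCA Part 2 exactly, but uses ICA feature files (IC-based names).
# NOTE(review): this cell appears to repeat the first Part 2 cell above; re-running it
# recomputes the same results, so consider keeping a single copy.
# =========================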
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    LeaveOneOut,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_predict,
    permutation_test_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

# --- Input locations ---------------------------------------------------------
# ROOT must point at the folder that Part 1 wrote its feature CSVs into.
ROOT = Path(r"/Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA")  # <-- SAME AS PART 1
LABELS_CSV = ROOT / "proficiency_labels.csv"
# K only appears in the feature file names below
# (presumably the number of ICs used in Part 1 -- TODO confirm).
K = 20

FEAT_STATIC = ROOT / f"features_static_nonZ_K{K}.csv"
FEAT_LN_CONN = ROOT / f"features_ln_conn_pearsonZ_K{K}.csv"   # used in Part 3
FEAT_DELTA_CONN = ROOT / f"features_delta_conn_pearsonZ_K{K}.csv"

# --- Labels ------------------------------------------------------------------
# The labels CSV must provide "subject" and "group" columns.
labels = pd.read_csv(LABELS_CSV).set_index("subject")
labels["group"] = labels["group"].str.lower().str.strip()
# Binary target: 1 for "advanced", 0 for every other group value.
# NOTE(review): a misspelled or missing group is silently coded as 0.
labels["y"] = (labels["group"] == "advanced").astype(int)

# --- Features ----------------------------------------------------------------
Xstatic = pd.read_csv(FEAT_STATIC).set_index("subject")

# The inner join keeps only subjects present in BOTH the features and labels.
df = Xstatic.join(labels[["y"]], how="inner")
y = df["y"].to_numpy()

# Feature subsets are selected purely by column-name prefix.
X_L1    = df.filter(regex=r"^L1_").copy()
X_Ln    = df.filter(regex=r"^Ln_").copy()
X_Delta = df.filter(regex=r"^DELTA_").copy()
X_Full  = df.drop(columns=["y"]).copy()  # every feature column, whatever its prefix

# Report class counts for the ALIGNED subjects only. Counting from `labels`
# directly would include subjects that the inner join dropped, so the counts
# could disagree with "N aligned". The same join is repeated on "group" so the
# counted rows match the rows of `df` exactly.
aligned_groups = Xstatic.join(labels[["group"]], how="inner")["group"]
print("N aligned:", df.shape[0], "class counts:", aligned_groups.value_counts().to_dict())
print("Shapes:", "L1", X_L1.shape, "Ln", X_Ln.shape, "Delta", X_Delta.shape, "Full", X_Full.shape)

def fixed_pipeline(C=1.0, penalty="l2"):
    """Return a fresh, unfitted standardize-then-classify pipeline.

    Parameters
    ----------
    C : float
        Forwarded to ``LogisticRegression(C=...)``.
    penalty : str
        Forwarded to ``LogisticRegression(penalty=...)``.

    Returns
    -------
    Pipeline
        Two steps: ``"scaler"`` (``StandardScaler``) followed by ``"clf"``
        (``LogisticRegression`` with the liblinear solver, ``max_iter=5000``).
    """
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        solver="liblinear",
        max_iter=5000,
    )
    steps = [
        ("scaler", StandardScaler()),
        ("clf", classifier),
    ]
    return Pipeline(steps)

def loocv_auc_with_bootstrap_ci(Xdf, y, pipe, n_boot=5000, seed=0):
    """Leave-one-out AUC with a percentile-bootstrap 95% interval.

    Out-of-fold probabilities are produced once with leave-one-out
    cross-validation; the AUC is computed on those pooled predictions. The
    interval comes from resampling (label, prediction) pairs with
    replacement -- the model is NOT refit for each resample.

    Parameters
    ----------
    Xdf : feature table handed to ``cross_val_predict``.
    y : array-like of class labels; column 1 of ``predict_proba`` is used as
        the score, so a binary target is assumed.
    pipe : estimator exposing ``predict_proba``.
    n_boot : int
        Number of bootstrap resamples attempted.
    seed : int
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    (auc, (ci_lo, ci_hi)) as plain floats. The interval is ``(nan, nan)``
    when no resample contained both classes (or ``n_boot`` is 0).
    """
    # Force positional indexing: a pandas Series would otherwise be indexed by
    # label in ``y[samp]`` below.
    y = np.asarray(y)

    prob = cross_val_predict(pipe, Xdf, y, cv=LeaveOneOut(), method="predict_proba")[:, 1]
    auc = roc_auc_score(y, prob)

    rng = np.random.default_rng(seed)
    idx = np.arange(len(y))
    boots = []
    for _ in range(n_boot):
        samp = rng.choice(idx, size=len(idx), replace=True)
        y_boot = y[samp]
        # AUC is undefined when a resample holds a single class; skip it.
        if len(np.unique(y_boot)) < 2:
            continue
        boots.append(roc_auc_score(y_boot, prob[samp]))

    # Guard the degenerate case instead of calling np.quantile on an empty array.
    if not boots:
        return float(auc), (float("nan"), float("nan"))

    ci_lo, ci_hi = np.quantile(np.asarray(boots), [0.025, 0.975])
    return float(auc), (float(ci_lo), float(ci_hi))

def repeated_cv_auc_dist(Xdf, y, pipe, n_splits=5, n_repeats=200, seed=42):
    """Collect per-fold AUC and balanced accuracy over repeated stratified K-fold.

    ``pipe`` is refit in place on every training fold. Hard predictions for
    balanced accuracy threshold the class-1 probability at 0.5.

    Returns
    -------
    (aucs, bals) : two 1-D arrays with one entry per fold
        (``n_splits * n_repeats`` entries each).
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=seed
    )
    fold_auc = []
    fold_bal = []
    for train_idx, test_idx in splitter.split(Xdf, y):
        pipe.fit(Xdf.iloc[train_idx], y[train_idx])
        scores = pipe.predict_proba(Xdf.iloc[test_idx])[:, 1]
        hard_labels = (scores >= 0.5).astype(int)
        y_test = y[test_idx]
        fold_auc.append(roc_auc_score(y_test, scores))
        fold_bal.append(balanced_accuracy_score(y_test, hard_labels))
    return np.array(fold_auc), np.array(fold_bal)

def minimal_nested_tuning(Xdf, y,
                         C_grid=(0.001, 0.01, 0.1, 1, 10, 100, 1000),
                         penalty_grid=("l2", "l1"),
                         outer_splits=5, outer_repeats=100, inner_splits=5, seed=42):
    """Nested cross-validation that tunes only ``C`` and ``penalty``.

    For each outer training fold, every (penalty, C) pair is scored by its
    mean inner-CV AUC. The best pair (the first one wins on ties) is refit on
    the whole outer training fold and scored on the outer test fold.

    Returns
    -------
    (mean_outer_auc, sd_outer_auc, counts) where ``counts`` is a ``Counter``
    of how often each (penalty, C) pair was selected.
    """
    from collections import Counter

    outer_cv = RepeatedStratifiedKFold(
        n_splits=outer_splits, n_repeats=outer_repeats, random_state=seed
    )
    inner_cv = StratifiedKFold(n_splits=inner_splits, shuffle=True, random_state=seed)

    def inner_mean_auc(model, X_fit, y_fit):
        # Mean AUC of `model` across the inner folds of one outer training set.
        fold_scores = []
        for fit_idx, val_idx in inner_cv.split(X_fit, y_fit):
            model.fit(X_fit.iloc[fit_idx], y_fit[fit_idx])
            val_prob = model.predict_proba(X_fit.iloc[val_idx])[:, 1]
            fold_scores.append(roc_auc_score(y_fit[val_idx], val_prob))
        return float(np.mean(fold_scores))

    selected = []
    test_aucs = []

    for train_idx, test_idx in outer_cv.split(Xdf, y):
        X_train, X_test = Xdf.iloc[train_idx], Xdf.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        top_score = -np.inf
        top_hp = None
        for pen in penalty_grid:
            for C in C_grid:
                candidate = fixed_pipeline(C=C, penalty=pen)
                score = inner_mean_auc(candidate, X_train, y_train)
                # Strict ">" keeps the earliest grid entry when scores tie.
                if score > top_score:
                    top_score, top_hp = score, (pen, C)

        # Refit the winning configuration on the full outer training fold.
        pen, C = top_hp
        final_model = fixed_pipeline(C=C, penalty=pen)
        final_model.fit(X_train, y_train)
        test_prob = final_model.predict_proba(X_test)[:, 1]
        test_aucs.append(float(roc_auc_score(y_test, test_prob)))
        selected.append(top_hp)

    return float(np.mean(test_aucs)), float(np.std(test_aucs)), Counter(selected)

def fast_permutation_pvalue(Xdf, y, pipe, n_perm=2000, seed=0):
    """Label-permutation test of the 5-fold ROC-AUC.

    Wraps ``permutation_test_score`` with a shuffled ``StratifiedKFold``; the
    same ``seed`` drives both the fold assignment and the permutations.

    Returns
    -------
    (score, pvalue) as plain floats; the per-permutation scores are discarded.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    settings = dict(
        scoring="roc_auc",
        cv=folds,
        n_permutations=n_perm,
        n_jobs=-1,
        random_state=seed,
    )
    observed, _, p_value = permutation_test_score(pipe, Xdf, y, **settings)
    return float(observed), float(p_value)

def run_bundle(name, Xdf):
    """Run and print the full validation battery for one feature set.

    Uses the module-level target vector ``y``. In order: LOOCV AUC with a
    bootstrap CI, repeated 5-fold CV (AUC and balanced accuracy), nested
    tuning of C/penalty, and a permutation test. Returns nothing; results are
    printed.
    """
    print("\n======", name, "======")
    model = fixed_pipeline(C=1.0, penalty="l2")

    loo_auc, loo_ci = loocv_auc_with_bootstrap_ci(Xdf, y, model, n_boot=5000, seed=0)
    print("LOOCV AUC:", loo_auc, "Bootstrap 95% CI:", loo_ci)

    rep_auc, rep_bal = repeated_cv_auc_dist(Xdf, y, model, n_splits=5, n_repeats=200, seed=42)
    for tag, values in (("RepCV AUC mean±sd:", rep_auc), ("RepCV BAL mean±sd:", rep_bal)):
        print(tag, float(values.mean()), float(values.std()))

    nested_mean, nested_sd, hp_counts = minimal_nested_tuning(Xdf, y, outer_repeats=50)
    print("Nested (tune C/penalty only) outer AUC mean±sd:", nested_mean, nested_sd)
    print("Top hyperparams:", hp_counts.most_common(5))

    perm_score, perm_p = fast_permutation_pvalue(Xdf, y, model, n_perm=2000, seed=0)
    print("Permutation (5-fold) AUC score:", perm_score, "p-value:", perm_p)

# Run models: the same validation battery on each feature set, in this order.
for _bundle_name, _bundle_X in (
    ("L1-only (negative control)", X_L1),
    ("Ln-only", X_Ln),
    ("Delta-only", X_Delta),
    ("FULL (L1+Ln+Delta)", X_Full),
):
    run_bundle(_bundle_name, _bundle_X)


In [None]:

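# =========================
# PART 2 — ICA MODELING + VALIDATION
# Mirrors your PCA Part 2 exactly, but uses ICA feature files (IC-based names).
# NOTE(review): this cell appears to repeat the first Part 2 cell above; re-running it
# recomputes the same results, so consider keeping a single copy.
# =========================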
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import (
    LeaveOneOut,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_predict,
    permutation_test_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

# --- Input locations ---------------------------------------------------------
# ROOT must point at the folder that Part 1 wrote its feature CSVs into.
ROOT = Path(r"/Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA")  # <-- SAME AS PART 1
LABELS_CSV = ROOT / "proficiency_labels.csv"
# K only appears in the feature file names below
# (presumably the number of ICs used in Part 1 -- TODO confirm).
K = 20

FEAT_STATIC = ROOT / f"features_static_nonZ_K{K}.csv"
FEAT_LN_CONN = ROOT / f"features_ln_conn_pearsonZ_K{K}.csv"   # used in Part 3
FEAT_DELTA_CONN = ROOT / f"features_delta_conn_pearsonZ_K{K}.csv"

# --- Labels ------------------------------------------------------------------
# The labels CSV must provide "subject" and "group" columns.
labels = pd.read_csv(LABELS_CSV).set_index("subject")
labels["group"] = labels["group"].str.lower().str.strip()
# Binary target: 1 for "advanced", 0 for every other group value.
# NOTE(review): a misspelled or missing group is silently coded as 0.
labels["y"] = (labels["group"] == "advanced").astype(int)

# --- Features ----------------------------------------------------------------
Xstatic = pd.read_csv(FEAT_STATIC).set_index("subject")

# The inner join keeps only subjects present in BOTH the features and labels.
df = Xstatic.join(labels[["y"]], how="inner")
y = df["y"].to_numpy()

# Feature subsets are selected purely by column-name prefix.
X_L1    = df.filter(regex=r"^L1_").copy()
X_Ln    = df.filter(regex=r"^Ln_").copy()
X_Delta = df.filter(regex=r"^DELTA_").copy()
X_Full  = df.drop(columns=["y"]).copy()  # every feature column, whatever its prefix

# Report class counts for the ALIGNED subjects only. Counting from `labels`
# directly would include subjects that the inner join dropped, so the counts
# could disagree with "N aligned". The same join is repeated on "group" so the
# counted rows match the rows of `df` exactly.
aligned_groups = Xstatic.join(labels[["group"]], how="inner")["group"]
print("N aligned:", df.shape[0], "class counts:", aligned_groups.value_counts().to_dict())
print("Shapes:", "L1", X_L1.shape, "Ln", X_Ln.shape, "Delta", X_Delta.shape, "Full", X_Full.shape)

def fixed_pipeline(C=1.0, penalty="l2"):
    """Return a fresh, unfitted standardize-then-classify pipeline.

    Parameters
    ----------
    C : float
        Forwarded to ``LogisticRegression(C=...)``.
    penalty : str
        Forwarded to ``LogisticRegression(penalty=...)``.

    Returns
    -------
    Pipeline
        Two steps: ``"scaler"`` (``StandardScaler``) followed by ``"clf"``
        (``LogisticRegression`` with the liblinear solver, ``max_iter=5000``).
    """
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        solver="liblinear",
        max_iter=5000,
    )
    steps = [
        ("scaler", StandardScaler()),
        ("clf", classifier),
    ]
    return Pipeline(steps)

def loocv_auc_with_bootstrap_ci(Xdf, y, pipe, n_boot=5000, seed=0):
    """Leave-one-out AUC with a percentile-bootstrap 95% interval.

    Out-of-fold probabilities are produced once with leave-one-out
    cross-validation; the AUC is computed on those pooled predictions. The
    interval comes from resampling (label, prediction) pairs with
    replacement -- the model is NOT refit for each resample.

    Parameters
    ----------
    Xdf : feature table handed to ``cross_val_predict``.
    y : array-like of class labels; column 1 of ``predict_proba`` is used as
        the score, so a binary target is assumed.
    pipe : estimator exposing ``predict_proba``.
    n_boot : int
        Number of bootstrap resamples attempted.
    seed : int
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    (auc, (ci_lo, ci_hi)) as plain floats. The interval is ``(nan, nan)``
    when no resample contained both classes (or ``n_boot`` is 0).
    """
    # Force positional indexing: a pandas Series would otherwise be indexed by
    # label in ``y[samp]`` below.
    y = np.asarray(y)

    prob = cross_val_predict(pipe, Xdf, y, cv=LeaveOneOut(), method="predict_proba")[:, 1]
    auc = roc_auc_score(y, prob)

    rng = np.random.default_rng(seed)
    idx = np.arange(len(y))
    boots = []
    for _ in range(n_boot):
        samp = rng.choice(idx, size=len(idx), replace=True)
        y_boot = y[samp]
        # AUC is undefined when a resample holds a single class; skip it.
        if len(np.unique(y_boot)) < 2:
            continue
        boots.append(roc_auc_score(y_boot, prob[samp]))

    # Guard the degenerate case instead of calling np.quantile on an empty array.
    if not boots:
        return float(auc), (float("nan"), float("nan"))

    ci_lo, ci_hi = np.quantile(np.asarray(boots), [0.025, 0.975])
    return float(auc), (float(ci_lo), float(ci_hi))

def repeated_cv_auc_dist(Xdf, y, pipe, n_splits=5, n_repeats=200, seed=42):
    """Collect per-fold AUC and balanced accuracy over repeated stratified K-fold.

    ``pipe`` is refit in place on every training fold. Hard predictions for
    balanced accuracy threshold the class-1 probability at 0.5.

    Returns
    -------
    (aucs, bals) : two 1-D arrays with one entry per fold
        (``n_splits * n_repeats`` entries each).
    """
    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=seed
    )
    fold_auc = []
    fold_bal = []
    for train_idx, test_idx in splitter.split(Xdf, y):
        pipe.fit(Xdf.iloc[train_idx], y[train_idx])
        scores = pipe.predict_proba(Xdf.iloc[test_idx])[:, 1]
        hard_labels = (scores >= 0.5).astype(int)
        y_test = y[test_idx]
        fold_auc.append(roc_auc_score(y_test, scores))
        fold_bal.append(balanced_accuracy_score(y_test, hard_labels))
    return np.array(fold_auc), np.array(fold_bal)

def minimal_nested_tuning(Xdf, y,
                         C_grid=(0.001, 0.01, 0.1, 1, 10, 100, 1000),
                         penalty_grid=("l2", "l1"),
                         outer_splits=5, outer_repeats=100, inner_splits=5, seed=42):
    """Nested cross-validation that tunes only ``C`` and ``penalty``.

    For each outer training fold, every (penalty, C) pair is scored by its
    mean inner-CV AUC. The best pair (the first one wins on ties) is refit on
    the whole outer training fold and scored on the outer test fold.

    Returns
    -------
    (mean_outer_auc, sd_outer_auc, counts) where ``counts`` is a ``Counter``
    of how often each (penalty, C) pair was selected.
    """
    from collections import Counter

    outer_cv = RepeatedStratifiedKFold(
        n_splits=outer_splits, n_repeats=outer_repeats, random_state=seed
    )
    inner_cv = StratifiedKFold(n_splits=inner_splits, shuffle=True, random_state=seed)

    def inner_mean_auc(model, X_fit, y_fit):
        # Mean AUC of `model` across the inner folds of one outer training set.
        fold_scores = []
        for fit_idx, val_idx in inner_cv.split(X_fit, y_fit):
            model.fit(X_fit.iloc[fit_idx], y_fit[fit_idx])
            val_prob = model.predict_proba(X_fit.iloc[val_idx])[:, 1]
            fold_scores.append(roc_auc_score(y_fit[val_idx], val_prob))
        return float(np.mean(fold_scores))

    selected = []
    test_aucs = []

    for train_idx, test_idx in outer_cv.split(Xdf, y):
        X_train, X_test = Xdf.iloc[train_idx], Xdf.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        top_score = -np.inf
        top_hp = None
        for pen in penalty_grid:
            for C in C_grid:
                candidate = fixed_pipeline(C=C, penalty=pen)
                score = inner_mean_auc(candidate, X_train, y_train)
                # Strict ">" keeps the earliest grid entry when scores tie.
                if score > top_score:
                    top_score, top_hp = score, (pen, C)

        # Refit the winning configuration on the full outer training fold.
        pen, C = top_hp
        final_model = fixed_pipeline(C=C, penalty=pen)
        final_model.fit(X_train, y_train)
        test_prob = final_model.predict_proba(X_test)[:, 1]
        test_aucs.append(float(roc_auc_score(y_test, test_prob)))
        selected.append(top_hp)

    return float(np.mean(test_aucs)), float(np.std(test_aucs)), Counter(selected)

def fast_permutation_pvalue(Xdf, y, pipe, n_perm=2000, seed=0):
    """Label-permutation test of the 5-fold ROC-AUC.

    Wraps ``permutation_test_score`` with a shuffled ``StratifiedKFold``; the
    same ``seed`` drives both the fold assignment and the permutations.

    Returns
    -------
    (score, pvalue) as plain floats; the per-permutation scores are discarded.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    settings = dict(
        scoring="roc_auc",
        cv=folds,
        n_permutations=n_perm,
        n_jobs=-1,
        random_state=seed,
    )
    observed, _, p_value = permutation_test_score(pipe, Xdf, y, **settings)
    return float(observed), float(p_value)

def run_bundle(name, Xdf):
    """Run and print the full validation battery for one feature set.

    Uses the module-level target vector ``y``. In order: LOOCV AUC with a
    bootstrap CI, repeated 5-fold CV (AUC and balanced accuracy), nested
    tuning of C/penalty, and a permutation test. Returns nothing; results are
    printed.
    """
    print("\n======", name, "======")
    model = fixed_pipeline(C=1.0, penalty="l2")

    loo_auc, loo_ci = loocv_auc_with_bootstrap_ci(Xdf, y, model, n_boot=5000, seed=0)
    print("LOOCV AUC:", loo_auc, "Bootstrap 95% CI:", loo_ci)

    rep_auc, rep_bal = repeated_cv_auc_dist(Xdf, y, model, n_splits=5, n_repeats=200, seed=42)
    for tag, values in (("RepCV AUC mean±sd:", rep_auc), ("RepCV BAL mean±sd:", rep_bal)):
        print(tag, float(values.mean()), float(values.std()))

    nested_mean, nested_sd, hp_counts = minimal_nested_tuning(Xdf, y, outer_repeats=50)
    print("Nested (tune C/penalty only) outer AUC mean±sd:", nested_mean, nested_sd)
    print("Top hyperparams:", hp_counts.most_common(5))

    perm_score, perm_p = fast_permutation_pvalue(Xdf, y, model, n_perm=2000, seed=0)
    print("Permutation (5-fold) AUC score:", perm_score, "p-value:", perm_p)

# Run models: the same validation battery on each feature set, in this order.
for _bundle_name, _bundle_X in (
    ("L1-only (negative control)", X_L1),
    ("Ln-only", X_Ln),
    ("Delta-only", X_Delta),
    ("FULL (L1+Ln+Delta)", X_Full),
):
    run_bundle(_bundle_name, _bundle_X)
