# ICA decoding — Part 3 (Ln-only + top-IC Ln connectivity)
This mirrors your PCA Part 3 exactly:
1) Train Ln-only model on TRAIN folds
2) Pick **top 5 ICs** from TRAIN-only weights
3) Add Ln connectivity edges among those 5 ICs
4) Evaluate with repeated CV + **manual permutation test** (leakage-safe)


In [None]:

# =========================
# PART 3 — LN-ONLY + TOP-IC LN CONNECTIVITY (LEAKAGE-SAFE)
# + Manual permutation test (publication-friendly; avoids sklearn tag issues)
# Mirrors your PCA Part 3, but uses IC naming.
# =========================

from pathlib import Path
import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


# -------------------------
# Basic model builder
# -------------------------
def fixed_pipe(C=1.0, penalty="l2"):
    """Build an unfitted scale-then-classify pipeline.

    The pipeline has two named steps: ``"scaler"`` (a ``StandardScaler``)
    followed by ``"clf"`` (a ``LogisticRegression`` using the liblinear
    solver with ``max_iter=5000``).

    Parameters
    ----------
    C : value forwarded as ``C`` to ``LogisticRegression``.
    penalty : value forwarded as ``penalty`` to ``LogisticRegression``.

    Returns
    -------
    A new ``Pipeline`` instance; nothing is fitted here.
    """
    scaler = StandardScaler()
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        max_iter=5000,
        solver="liblinear",
    )
    steps = [("scaler", scaler), ("clf", classifier)]
    return Pipeline(steps)


# -------------------------
# Utilities
# -------------------------
def ic_group_importance_from_coef(feature_names, coef):
    """Aggregate absolute coefficient mass per IC index.

    Each feature name is searched for the first occurrence of ``IC`` followed
    by two digits (e.g. ``Ln_IC07_...`` -> 7). Names without such a tag are
    ignored.

    Returns a dict mapping IC index (int) to the sum of ``|coef|`` over the
    features carrying that tag, in first-seen order.
    """
    pattern = re.compile(r"IC(\d{2})")
    totals = {}
    tagged = ((pattern.search(feat), weight) for feat, weight in zip(feature_names, coef))
    for hit, weight in tagged:
        if hit:
            idx = int(hit.group(1))
            totals.setdefault(idx, 0.0)
            totals[idx] = totals[idx] + abs(float(weight))
    return totals

def connectivity_cols_for_ics(ics, prefix="Ln_zcorr"):
    """Name every pairwise edge among the given IC indices.

    The indices are sorted ascending first, so each returned name has the
    form ``{prefix}_IC{a:02d}_IC{b:02d}`` with ``a`` preceding ``b`` in that
    sorted order. Fewer than two indices yields an empty list.
    """
    ordered = sorted(ics)
    return [
        f"{prefix}_IC{lo:02d}_IC{hi:02d}"
        for pos, lo in enumerate(ordered)
        for hi in ordered[pos + 1:]
    ]

def sanity_check_inputs(Xbase: pd.DataFrame, conn: pd.DataFrame, y: np.ndarray, conn_prefix: str):
    """Validate the inputs shared by the CV and permutation routines.

    Checks, in order: both tables are DataFrames; ``Xbase`` and ``y`` have the
    same length; ``Xbase`` and ``conn`` have equal indices; ``y`` only holds
    values from {0, 1}. These four checks are ``assert`` statements, so they
    are not executed when Python runs with ``-O``.

    Raises
    ------
    AssertionError : if one of the four checks above fails.
    ValueError : if no column of ``conn`` starts with ``conn_prefix + "_IC"``.

    On success, prints how many connectivity columns matched the prefix.
    """
    frames_ok = isinstance(Xbase, pd.DataFrame) and isinstance(conn, pd.DataFrame)
    assert frames_ok
    assert len(Xbase) == len(y), "Xbase and y length mismatch"
    assert Xbase.index.equals(conn.index), "Xbase and conn indices must match exactly"
    assert set(np.unique(y)) <= {0, 1}, "y must be binary 0/1"

    wanted_start = conn_prefix + "_IC"
    matching = [col for col in conn.columns if col.startswith(wanted_start)]
    if not matching:
        raise ValueError(f"No connectivity columns found with prefix='{conn_prefix}'.")
    print(f"[OK] Found {len(matching)} connectivity columns with prefix '{conn_prefix}'.")


# -------------------------
# Core: single CV run (supports either real y or permuted y)
# -------------------------
def cv_auc_foldwise_augmented(
    Xbase: pd.DataFrame,
    conn: pd.DataFrame,
    y: np.ndarray,
    splits,
    conn_prefix: str = "Ln_zcorr",
    top_m: int = 5,
    C_base: float = 1.0,
    C_final: float = 1.0,
    penalty: str = "l2",
    return_debug: bool = False
):
    """Run one pass of the connectivity-augmented model over given CV folds.

    For every ``(train, test)`` pair of positional indices in ``splits``:

    1. fit a base pipeline on the TRAIN rows of ``Xbase`` and rank ICs by the
       summed absolute coefficient of their features;
    2. build the connectivity column names linking the ``top_m`` best ICs and
       keep the ones that exist in ``conn``;
    3. fit a second pipeline on TRAIN rows of ``Xbase`` plus those columns and
       score ROC AUC on the matching TEST rows.

    Feature selection (steps 1-2) only ever sees training rows of each fold.
    Folds whose test labels contain a single class cannot be scored (AUC is
    undefined) and are skipped before the final model is fitted.

    Parameters
    ----------
    Xbase : base feature table, indexed positionally by the fold indices.
    conn : connectivity feature table, row-aligned with ``Xbase``.
    y : binary labels, row-aligned with ``Xbase``.
    splits : iterable of ``(train_idx, test_idx)`` positional index arrays.
    conn_prefix : prefix of the connectivity column names to look up.
    top_m : number of top-ranked ICs whose pairwise edges are added.
    C_base, C_final : ``C`` for the base and final pipelines respectively.
    penalty : penalty passed to both pipelines.
    return_debug : if True, return a summary dict instead of the mean AUC.

    Returns
    -------
    float
        Mean AUC over scored folds (NaN if no fold could be scored), when
        ``return_debug`` is False.
    dict
        When ``return_debug`` is True, with keys ``auc_mean``, ``auc_sd``,
        ``n_folds`` (number of scored folds), ``most_common_ic_sets``,
        ``most_used_edges`` and ``avg_missing_edges_per_fold`` (averaged over
        every fold processed, scored or not).
    """
    # Force a plain array so y[tr] is positional even if a Series is passed.
    y = np.asarray(y)

    aucs = []
    picked_sets = []
    edge_use_counter = Counter()
    missing_edges_total = 0
    n_folds_seen = 0

    for tr, te in splits:
        n_folds_seen += 1
        Xtr, Xte = Xbase.iloc[tr], Xbase.iloc[te]
        ytr, yte = y[tr], y[te]

        # 1) pick top ICs from TRAIN only (base features)
        base_model = fixed_pipe(C=C_base, penalty=penalty)
        base_model.fit(Xtr, ytr)
        w = base_model.named_steps["clf"].coef_.ravel()
        scores = ic_group_importance_from_coef(Xtr.columns, w)
        top = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_m]
        top_ics = [ic for ic, _ in top]
        picked_sets.append(tuple(top_ics))

        # 2) connectivity edges among those ICs; absent columns are only counted
        wanted_cols = connectivity_cols_for_ics(top_ics, prefix=conn_prefix)
        cols = [c for c in wanted_cols if c in conn.columns]
        missing_edges_total += (len(wanted_cols) - len(cols))
        edge_use_counter.update(cols)

        # AUC is undefined for a single-class test fold: skip it here, before
        # paying for a final fit whose predictions would be thrown away.
        if len(np.unique(yte)) < 2:
            continue

        Xtr_aug = pd.concat([Xtr, conn.iloc[tr][cols]], axis=1)
        Xte_aug = pd.concat([Xte, conn.iloc[te][cols]], axis=1)

        # 3) fit final model and score AUC
        final_model = fixed_pipe(C=C_final, penalty=penalty)
        final_model.fit(Xtr_aug, ytr)
        p = final_model.predict_proba(Xte_aug)[:, 1]
        aucs.append(roc_auc_score(yte, p))

    n_folds = len(aucs)
    if n_folds:
        auc_mean = float(np.mean(aucs))
        auc_sd = float(np.std(aucs))
    else:
        # No scorable fold: report NaN explicitly instead of letting
        # np.mean/np.std warn on an empty list.
        auc_mean = float("nan")
        auc_sd = float("nan")

    if not return_debug:
        return auc_mean

    # missing_edges_total is accumulated over every processed fold, so it is
    # averaged over that same count rather than over scored folds only.
    if n_folds_seen:
        avg_missing = float(missing_edges_total / n_folds_seen)
    else:
        avg_missing = float("nan")

    return {
        "auc_mean": auc_mean,
        "auc_sd": auc_sd,
        "n_folds": int(n_folds),
        "most_common_ic_sets": Counter(picked_sets).most_common(10),
        "most_used_edges": edge_use_counter.most_common(15),
        "avg_missing_edges_per_fold": avg_missing
    }


# -------------------------
# Repeated CV summary
# -------------------------
def repeated_cv_summary_augmented(
    Xbase: pd.DataFrame,
    conn: pd.DataFrame,
    y: np.ndarray,
    conn_prefix: str = "Ln_zcorr",
    top_m: int = 5,
    C_base: float = 1.0,
    C_final: float = 1.0,
    penalty: str = "l2",
    n_splits: int = 5,
    n_repeats: int = 200,
    seed: int = 42
):
    """Summarize the augmented model over repeated stratified K-fold CV.

    Validates the inputs, generates ``n_splits`` x ``n_repeats`` stratified
    folds seeded with ``seed``, runs ``cv_auc_foldwise_augmented`` over all of
    them in debug mode, and returns its summary dict with two extra keys:
    ``top_m`` and ``expected_edges_per_fold`` (= top_m choose 2).

    Prints the expected edge count and the average number of requested
    connectivity columns that were absent from ``conn``.
    """
    sanity_check_inputs(Xbase, conn, y, conn_prefix=conn_prefix)

    n_pairs = top_m * (top_m - 1) // 2
    print(f"[INFO] top_m={top_m} -> expected edges per fold = C({top_m},2) = {n_pairs}")

    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=seed,
    )
    fold_indices = list(splitter.split(Xbase, y))

    summary = cv_auc_foldwise_augmented(
        Xbase,
        conn,
        y,
        fold_indices,
        conn_prefix=conn_prefix,
        top_m=top_m,
        C_base=C_base,
        C_final=C_final,
        penalty=penalty,
        return_debug=True,
    )

    avg_missing = summary["avg_missing_edges_per_fold"]
    print(f"[INFO] Avg missing edges per fold: {avg_missing:.3f} (should be ~0)")

    summary.update(top_m=int(top_m), expected_edges_per_fold=int(n_pairs))
    return summary


# -------------------------
# Manual permutation test (publication-friendly)
# -------------------------
def permutation_test_augmented_auc(
    Xbase: pd.DataFrame,
    conn: pd.DataFrame,
    y: np.ndarray,
    conn_prefix: str = "Ln_zcorr",
    top_m: int = 5,
    C_base: float = 1.0,
    C_final: float = 1.0,
    penalty: str = "l2",
    n_splits: int = 5,
    n_perm: int = 2000,
    seed: int = 0,
    verbose_every: int = 200
):
    """Label-permutation test for the mean CV AUC of the augmented model.

    One set of shuffled stratified folds (seeded with ``seed``) is built from
    the true labels and reused for the observed run and for every permuted
    run. Each permutation shuffles ``y`` with a generator seeded by ``seed``
    and repeats the full fold-wise procedure, including IC selection.

    Returns
    -------
    (observed_auc, p_value, perm_aucs)
        ``p_value`` is ``(1 + #{perm_aucs >= observed_auc}) / (1 + n_perm)``;
        ``perm_aucs`` is the array of ``n_perm`` permuted scores.

    A progress line is printed every ``verbose_every`` permutations; a falsy
    ``verbose_every`` silences it.
    """
    sanity_check_inputs(Xbase, conn, y, conn_prefix=conn_prefix)

    # Same model settings for the observed run and every permuted run.
    run_kwargs = dict(
        conn_prefix=conn_prefix,
        top_m=top_m,
        C_base=C_base,
        C_final=C_final,
        penalty=penalty,
        return_debug=False,
    )

    # Folds are generated once so all runs are scored on identical partitions.
    folder = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    folds = list(folder.split(Xbase, y))

    obs = cv_auc_foldwise_augmented(Xbase, conn, y, folds, **run_kwargs)

    rng = np.random.default_rng(seed)
    perm_scores = np.empty(n_perm, dtype=float)

    for done in range(1, n_perm + 1):
        shuffled = rng.permutation(y)  # a shuffle keeps the class counts unchanged
        perm_scores[done - 1] = cv_auc_foldwise_augmented(
            Xbase, conn, shuffled, folds, **run_kwargs
        )
        if verbose_every and done % verbose_every == 0:
            print(f"[perm] {done}/{n_perm} done...")

    n_at_least_obs = np.sum(perm_scores >= obs)
    p = (1.0 + n_at_least_obs) / (1.0 + n_perm)
    return float(obs), float(p), perm_scores


# -------------------------
# RUN (Ln-only + Ln connectivity)
# -------------------------
# Input locations: all three CSVs are read from ROOT; K is only used to build
# the two feature file names.
ROOT = Path(r"/Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA")
K = 20

LABELS_CSV = ROOT / "proficiency_labels.csv"
FEAT_STATIC = ROOT / f"features_static_nonZ_K{K}.csv"
FEAT_LN_CONN = ROOT / f"features_ln_conn_pearsonZ_K{K}.csv"

# Labels: y = 1 where the normalized (lowercased, stripped) group equals
# "advanced", 0 for every other value.
# NOTE(review): a missing or unexpected group value also ends up as 0 - confirm
# the label file only contains the two intended groups.
labels = pd.read_csv(LABELS_CSV).set_index("subject")
labels["group"] = labels["group"].str.lower().str.strip()
labels["y"] = (labels["group"] == "advanced").astype(int)

Xstatic = pd.read_csv(FEAT_STATIC).set_index("subject")
connLn = pd.read_csv(FEAT_LN_CONN).set_index("subject")

# Inner join keeps only subjects present in both the static features and the
# labels; the base feature set is then the columns whose names start with "Ln_".
df = Xstatic.join(labels[["y"]], how="inner")
X_Ln = df.filter(regex=r"^Ln_").copy()

# Align (exact index match): restrict features, labels and connectivity to the
# subjects they share, all selected via the same `common` index.
common = X_Ln.index.intersection(connLn.index)
X_base = X_Ln.loc[common].copy()
y_base = labels.loc[common, "y"].to_numpy()
connLn = connLn.loc[common].copy()
# Re-select by X_base's index so the exact-index check in sanity_check_inputs holds.
# NOTE(review): looks redundant with the line above when subject IDs are unique - confirm.
connLn = connLn.loc[X_base.index]

# Performance estimate: 5 folds x 200 repeats, returned as a summary dict.
print("\n=== Repeated CV summary (sanity-checked): Ln-only base + top-IC Ln connectivity ===")
aug_ln = repeated_cv_summary_augmented(
    Xbase=X_base,
    conn=connLn,
    y=y_base,
    conn_prefix="Ln_zcorr",
    top_m=5,
    C_base=1.0,
    C_final=1.0,
    penalty="l2",
    n_splits=5,
    n_repeats=200,
    seed=42
)
print(aug_ln)

# Significance: the permutation test uses a single 5-fold split (seed=0), so its
# observed AUC is computed on different folds than the repeated-CV summary above.
print("\n=== Manual permutation test (leakage-safe) for augmented Ln model ===")
obs_auc, pval, perm_aucs = permutation_test_augmented_auc(
    Xbase=X_base,
    conn=connLn,
    y=y_base,
    conn_prefix="Ln_zcorr",
    top_m=5,
    C_base=1.0,
    C_final=1.0,
    penalty="l2",
    n_splits=5,
    n_perm=2000,     # start 500 if you want quick; 2000 for final
    seed=0,
    verbose_every=200
)
print("Observed 5-fold AUC:", obs_auc)
print("Permutation p-value:", pval)


In [None]:

# =========================
# PART 3 — LN-ONLY + TOP-IC LN CONNECTIVITY (LEAKAGE-SAFE)
# + Manual permutation test (publication-friendly; avoids sklearn tag issues)
# Mirrors your PCA Part 3, but uses IC naming.
# =========================

from pathlib import Path
import numpy as np
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


# -------------------------
# Basic model builder
# -------------------------
def fixed_pipe(C=1.0, penalty="l2"):
    """Build an unfitted scale-then-classify pipeline.

    The pipeline has two named steps: ``"scaler"`` (a ``StandardScaler``)
    followed by ``"clf"`` (a ``LogisticRegression`` using the liblinear
    solver with ``max_iter=5000``).

    Parameters
    ----------
    C : value forwarded as ``C`` to ``LogisticRegression``.
    penalty : value forwarded as ``penalty`` to ``LogisticRegression``.

    Returns
    -------
    A new ``Pipeline`` instance; nothing is fitted here.
    """
    scaler = StandardScaler()
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        max_iter=5000,
        solver="liblinear",
    )
    steps = [("scaler", scaler), ("clf", classifier)]
    return Pipeline(steps)


# -------------------------
# Utilities
# -------------------------
def ic_group_importance_from_coef(feature_names, coef):
    """Aggregate absolute coefficient mass per IC index.

    Each feature name is searched for the first occurrence of ``IC`` followed
    by two digits (e.g. ``Ln_IC07_...`` -> 7). Names without such a tag are
    ignored.

    Returns a dict mapping IC index (int) to the sum of ``|coef|`` over the
    features carrying that tag, in first-seen order.
    """
    pattern = re.compile(r"IC(\d{2})")
    totals = {}
    tagged = ((pattern.search(feat), weight) for feat, weight in zip(feature_names, coef))
    for hit, weight in tagged:
        if hit:
            idx = int(hit.group(1))
            totals.setdefault(idx, 0.0)
            totals[idx] = totals[idx] + abs(float(weight))
    return totals

def connectivity_cols_for_ics(ics, prefix="Ln_zcorr"):
    """Name every pairwise edge among the given IC indices.

    The indices are sorted ascending first, so each returned name has the
    form ``{prefix}_IC{a:02d}_IC{b:02d}`` with ``a`` preceding ``b`` in that
    sorted order. Fewer than two indices yields an empty list.
    """
    ordered = sorted(ics)
    return [
        f"{prefix}_IC{lo:02d}_IC{hi:02d}"
        for pos, lo in enumerate(ordered)
        for hi in ordered[pos + 1:]
    ]

def sanity_check_inputs(Xbase: pd.DataFrame, conn: pd.DataFrame, y: np.ndarray, conn_prefix: str):
    """Validate the inputs shared by the CV and permutation routines.

    Checks, in order: both tables are DataFrames; ``Xbase`` and ``y`` have the
    same length; ``Xbase`` and ``conn`` have equal indices; ``y`` only holds
    values from {0, 1}. These four checks are ``assert`` statements, so they
    are not executed when Python runs with ``-O``.

    Raises
    ------
    AssertionError : if one of the four checks above fails.
    ValueError : if no column of ``conn`` starts with ``conn_prefix + "_IC"``.

    On success, prints how many connectivity columns matched the prefix.
    """
    frames_ok = isinstance(Xbase, pd.DataFrame) and isinstance(conn, pd.DataFrame)
    assert frames_ok
    assert len(Xbase) == len(y), "Xbase and y length mismatch"
    assert Xbase.index.equals(conn.index), "Xbase and conn indices must match exactly"
    assert set(np.unique(y)) <= {0, 1}, "y must be binary 0/1"

    wanted_start = conn_prefix + "_IC"
    matching = [col for col in conn.columns if col.startswith(wanted_start)]
    if not matching:
        raise ValueError(f"No connectivity columns found with prefix='{conn_prefix}'.")
    print(f"[OK] Found {len(matching)} connectivity columns with prefix '{conn_prefix}'.")


# -------------------------
# Core: single CV run (supports either real y or permuted y)
# -------------------------
def cv_auc_foldwise_augmented(
    Xbase: pd.DataFrame,
    conn: pd.DataFrame,
    y: np.ndarray,
    splits,
    conn_prefix: str = "Ln_zcorr",
    top_m: int = 5,
    C_base: float = 1.0,
    C_final: float = 1.0,
    penalty: str = "l2",
    return_debug: bool = False
):
    """Run one pass of the connectivity-augmented model over given CV folds.

    For every ``(train, test)`` pair of positional indices in ``splits``:

    1. fit a base pipeline on the TRAIN rows of ``Xbase`` and rank ICs by the
       summed absolute coefficient of their features;
    2. build the connectivity column names linking the ``top_m`` best ICs and
       keep the ones that exist in ``conn``;
    3. fit a second pipeline on TRAIN rows of ``Xbase`` plus those columns and
       score ROC AUC on the matching TEST rows.

    Feature selection (steps 1-2) only ever sees training rows of each fold.
    Folds whose test labels contain a single class cannot be scored (AUC is
    undefined) and are skipped before the final model is fitted.

    Parameters
    ----------
    Xbase : base feature table, indexed positionally by the fold indices.
    conn : connectivity feature table, row-aligned with ``Xbase``.
    y : binary labels, row-aligned with ``Xbase``.
    splits : iterable of ``(train_idx, test_idx)`` positional index arrays.
    conn_prefix : prefix of the connectivity column names to look up.
    top_m : number of top-ranked ICs whose pairwise edges are added.
    C_base, C_final : ``C`` for the base and final pipelines respectively.
    penalty : penalty passed to both pipelines.
    return_debug : if True, return a summary dict instead of the mean AUC.

    Returns
    -------
    float
        Mean AUC over scored folds (NaN if no fold could be scored), when
        ``return_debug`` is False.
    dict
        When ``return_debug`` is True, with keys ``auc_mean``, ``auc_sd``,
        ``n_folds`` (number of scored folds), ``most_common_ic_sets``,
        ``most_used_edges`` and ``avg_missing_edges_per_fold`` (averaged over
        every fold processed, scored or not).
    """
    # Force a plain array so y[tr] is positional even if a Series is passed.
    y = np.asarray(y)

    aucs = []
    picked_sets = []
    edge_use_counter = Counter()
    missing_edges_total = 0
    n_folds_seen = 0

    for tr, te in splits:
        n_folds_seen += 1
        Xtr, Xte = Xbase.iloc[tr], Xbase.iloc[te]
        ytr, yte = y[tr], y[te]

        # 1) pick top ICs from TRAIN only (base features)
        base_model = fixed_pipe(C=C_base, penalty=penalty)
        base_model.fit(Xtr, ytr)
        w = base_model.named_steps["clf"].coef_.ravel()
        scores = ic_group_importance_from_coef(Xtr.columns, w)
        top = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_m]
        top_ics = [ic for ic, _ in top]
        picked_sets.append(tuple(top_ics))

        # 2) connectivity edges among those ICs; absent columns are only counted
        wanted_cols = connectivity_cols_for_ics(top_ics, prefix=conn_prefix)
        cols = [c for c in wanted_cols if c in conn.columns]
        missing_edges_total += (len(wanted_cols) - len(cols))
        edge_use_counter.update(cols)

        # AUC is undefined for a single-class test fold: skip it here, before
        # paying for a final fit whose predictions would be thrown away.
        if len(np.unique(yte)) < 2:
            continue

        Xtr_aug = pd.concat([Xtr, conn.iloc[tr][cols]], axis=1)
        Xte_aug = pd.concat([Xte, conn.iloc[te][cols]], axis=1)

        # 3) fit final model and score AUC
        final_model = fixed_pipe(C=C_final, penalty=penalty)
        final_model.fit(Xtr_aug, ytr)
        p = final_model.predict_proba(Xte_aug)[:, 1]
        aucs.append(roc_auc_score(yte, p))

    n_folds = len(aucs)
    if n_folds:
        auc_mean = float(np.mean(aucs))
        auc_sd = float(np.std(aucs))
    else:
        # No scorable fold: report NaN explicitly instead of letting
        # np.mean/np.std warn on an empty list.
        auc_mean = float("nan")
        auc_sd = float("nan")

    if not return_debug:
        return auc_mean

    # missing_edges_total is accumulated over every processed fold, so it is
    # averaged over that same count rather than over scored folds only.
    if n_folds_seen:
        avg_missing = float(missing_edges_total / n_folds_seen)
    else:
        avg_missing = float("nan")

    return {
        "auc_mean": auc_mean,
        "auc_sd": auc_sd,
        "n_folds": int(n_folds),
        "most_common_ic_sets": Counter(picked_sets).most_common(10),
        "most_used_edges": edge_use_counter.most_common(15),
        "avg_missing_edges_per_fold": avg_missing
    }


# -------------------------
# Repeated CV summary
# -------------------------
def repeated_cv_summary_augmented(
    Xbase: pd.DataFrame,
    conn: pd.DataFrame,
    y: np.ndarray,
    conn_prefix: str = "Ln_zcorr",
    top_m: int = 5,
    C_base: float = 1.0,
    C_final: float = 1.0,
    penalty: str = "l2",
    n_splits: int = 5,
    n_repeats: int = 200,
    seed: int = 42
):
    """Summarize the augmented model over repeated stratified K-fold CV.

    Validates the inputs, generates ``n_splits`` x ``n_repeats`` stratified
    folds seeded with ``seed``, runs ``cv_auc_foldwise_augmented`` over all of
    them in debug mode, and returns its summary dict with two extra keys:
    ``top_m`` and ``expected_edges_per_fold`` (= top_m choose 2).

    Prints the expected edge count and the average number of requested
    connectivity columns that were absent from ``conn``.
    """
    sanity_check_inputs(Xbase, conn, y, conn_prefix=conn_prefix)

    n_pairs = top_m * (top_m - 1) // 2
    print(f"[INFO] top_m={top_m} -> expected edges per fold = C({top_m},2) = {n_pairs}")

    splitter = RepeatedStratifiedKFold(
        n_splits=n_splits,
        n_repeats=n_repeats,
        random_state=seed,
    )
    fold_indices = list(splitter.split(Xbase, y))

    summary = cv_auc_foldwise_augmented(
        Xbase,
        conn,
        y,
        fold_indices,
        conn_prefix=conn_prefix,
        top_m=top_m,
        C_base=C_base,
        C_final=C_final,
        penalty=penalty,
        return_debug=True,
    )

    avg_missing = summary["avg_missing_edges_per_fold"]
    print(f"[INFO] Avg missing edges per fold: {avg_missing:.3f} (should be ~0)")

    summary.update(top_m=int(top_m), expected_edges_per_fold=int(n_pairs))
    return summary


# -------------------------
# Manual permutation test (publication-friendly)
# -------------------------
def permutation_test_augmented_auc(
    Xbase: pd.DataFrame,
    conn: pd.DataFrame,
    y: np.ndarray,
    conn_prefix: str = "Ln_zcorr",
    top_m: int = 5,
    C_base: float = 1.0,
    C_final: float = 1.0,
    penalty: str = "l2",
    n_splits: int = 5,
    n_perm: int = 2000,
    seed: int = 0,
    verbose_every: int = 200
):
    """Label-permutation test for the mean CV AUC of the augmented model.

    One set of shuffled stratified folds (seeded with ``seed``) is built from
    the true labels and reused for the observed run and for every permuted
    run. Each permutation shuffles ``y`` with a generator seeded by ``seed``
    and repeats the full fold-wise procedure, including IC selection.

    Returns
    -------
    (observed_auc, p_value, perm_aucs)
        ``p_value`` is ``(1 + #{perm_aucs >= observed_auc}) / (1 + n_perm)``;
        ``perm_aucs`` is the array of ``n_perm`` permuted scores.

    A progress line is printed every ``verbose_every`` permutations; a falsy
    ``verbose_every`` silences it.
    """
    sanity_check_inputs(Xbase, conn, y, conn_prefix=conn_prefix)

    # Same model settings for the observed run and every permuted run.
    run_kwargs = dict(
        conn_prefix=conn_prefix,
        top_m=top_m,
        C_base=C_base,
        C_final=C_final,
        penalty=penalty,
        return_debug=False,
    )

    # Folds are generated once so all runs are scored on identical partitions.
    folder = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    folds = list(folder.split(Xbase, y))

    obs = cv_auc_foldwise_augmented(Xbase, conn, y, folds, **run_kwargs)

    rng = np.random.default_rng(seed)
    perm_scores = np.empty(n_perm, dtype=float)

    for done in range(1, n_perm + 1):
        shuffled = rng.permutation(y)  # a shuffle keeps the class counts unchanged
        perm_scores[done - 1] = cv_auc_foldwise_augmented(
            Xbase, conn, shuffled, folds, **run_kwargs
        )
        if verbose_every and done % verbose_every == 0:
            print(f"[perm] {done}/{n_perm} done...")

    n_at_least_obs = np.sum(perm_scores >= obs)
    p = (1.0 + n_at_least_obs) / (1.0 + n_perm)
    return float(obs), float(p), perm_scores


# -------------------------
# RUN (Ln-only + Ln connectivity)
# -------------------------
# Input locations: all three CSVs are read from ROOT; K is only used to build
# the two feature file names.
ROOT = Path(r"/Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA")
K = 20

LABELS_CSV = ROOT / "proficiency_labels.csv"
FEAT_STATIC = ROOT / f"features_static_nonZ_K{K}.csv"
FEAT_LN_CONN = ROOT / f"features_ln_conn_pearsonZ_K{K}.csv"

# Labels: y = 1 where the normalized (lowercased, stripped) group equals
# "advanced", 0 for every other value.
# NOTE(review): a missing or unexpected group value also ends up as 0 - confirm
# the label file only contains the two intended groups.
labels = pd.read_csv(LABELS_CSV).set_index("subject")
labels["group"] = labels["group"].str.lower().str.strip()
labels["y"] = (labels["group"] == "advanced").astype(int)

Xstatic = pd.read_csv(FEAT_STATIC).set_index("subject")
connLn = pd.read_csv(FEAT_LN_CONN).set_index("subject")

# Inner join keeps only subjects present in both the static features and the
# labels; the base feature set is then the columns whose names start with "Ln_".
df = Xstatic.join(labels[["y"]], how="inner")
X_Ln = df.filter(regex=r"^Ln_").copy()

# Align (exact index match): restrict features, labels and connectivity to the
# subjects they share, all selected via the same `common` index.
common = X_Ln.index.intersection(connLn.index)
X_base = X_Ln.loc[common].copy()
y_base = labels.loc[common, "y"].to_numpy()
connLn = connLn.loc[common].copy()
# Re-select by X_base's index so the exact-index check in sanity_check_inputs holds.
# NOTE(review): looks redundant with the line above when subject IDs are unique - confirm.
connLn = connLn.loc[X_base.index]

# Performance estimate: 5 folds x 200 repeats, returned as a summary dict.
print("\n=== Repeated CV summary (sanity-checked): Ln-only base + top-IC Ln connectivity ===")
aug_ln = repeated_cv_summary_augmented(
    Xbase=X_base,
    conn=connLn,
    y=y_base,
    conn_prefix="Ln_zcorr",
    top_m=5,
    C_base=1.0,
    C_final=1.0,
    penalty="l2",
    n_splits=5,
    n_repeats=200,
    seed=42
)
print(aug_ln)

# Significance: the permutation test uses a single 5-fold split (seed=0), so its
# observed AUC is computed on different folds than the repeated-CV summary above.
print("\n=== Manual permutation test (leakage-safe) for augmented Ln model ===")
obs_auc, pval, perm_aucs = permutation_test_augmented_auc(
    Xbase=X_base,
    conn=connLn,
    y=y_base,
    conn_prefix="Ln_zcorr",
    top_m=5,
    C_base=1.0,
    C_final=1.0,
    penalty="l2",
    n_splits=5,
    n_perm=2000,     # start 500 if you want quick; 2000 for final
    seed=0,
    verbose_every=200
)
print("Observed 5-fold AUC:", obs_auc)
print("Permutation p-value:", pval)
