# ICA decoding — Part 1 (Feature building)
This notebook mirrors your PCA Part 1, but uses ICA timecourses (`*_ICsK{K}.*`).

**Run this first** to generate:
- `features_static_nonZ_K20.csv`
- `features_ln_conn_pearsonZ_K20.csv`
- `features_delta_conn_pearsonZ_K20.csv`
- `feature_build_manifest_K20.csv`


## 0) Imports + settings

In [1]:

# =========================
# PART 1 — ICA FEATURE BUILDING (UPDATED: saves Ln connectivity too)
# Mirrors your PCA Part 1 exactly, but uses IC naming + file pattern *_ICsK{K}.*
# =========================

from pathlib import Path
import re
import numpy as np
import pandas as pd
from typing import Optional

# ---- USER SETTINGS ----
# Edit these values per machine / per run; all paths below are derived from ROOT.
# Base folder of the analysis (inputs are read from it, outputs are written into it).
ROOT = Path(r"/Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA")  # <-- CHANGE THIS IF NEEDED
# Folder that is scanned for the ICA timecourse files.
TC_DIR = ROOT / "timecourses"
# NOTE(review): not read anywhere in this cell - presumably consumed by a later part; confirm.
LABELS_CSV = ROOT / "proficiency_labels.csv"  # must exist in ROOT
# Task labels as they appear in the timecourse file names (the part after "task-").
# A subject is only processed when files for BOTH tasks are found.
TASK_L1 = "compL1"
TASK_LN = "compLn"
# Number of ICA components; used to select the input files and embedded in every output file name.
K = 20                                   # <-- ICA K (change later for 5,10,15,25,30...)
# Repetition time in seconds. While this is None the spectral "lowRatio" features are not computed.
TR: Optional[float] = None               # set to TR (seconds) or keep None

# Output tables (all CSV, written into ROOT).
OUT_STATIC      = ROOT / f"features_static_nonZ_K{K}.csv"            # per-IC features: L1, Ln and DELTA
OUT_DELTA_CONN  = ROOT / f"features_delta_conn_pearsonZ_K{K}.csv"    # Ln - L1 Fisher-z connectivity
OUT_LN_CONN     = ROOT / f"features_ln_conn_pearsonZ_K{K}.csv"       # Ln Fisher-z connectivity
OUT_MANIFEST    = ROOT / f"feature_build_manifest_K{K}.csv"          # per-subject timecourse lengths


# ---- FILE DISCOVERY ----
def discover_timecourses(tc_dir: Path, K: int):
    """
    Index the ICA timecourse files found directly inside ``tc_dir``.

    Expected file names:
      sub-01_task-compL1_ICsK20.npy   (or the same stem with .csv)
    Entries whose name does not follow this pattern, or whose component
    count differs from ``K``, are ignored.

    Returns a nested dict:
      index[sub][task] = {"npy": path or None, "csv": path or None}
    """
    name_re = re.compile(r"(sub-\d+)_task-([A-Za-z0-9]+)_ICsK(\d+)\.(npy|csv)$")
    index = {}
    for path in tc_dir.iterdir():
        hit = name_re.match(path.name)
        # skip unrelated names and files built with a different K
        if hit is None or int(hit.group(3)) != K:
            continue
        subject, task, _, suffix = hit.groups()
        per_task = index.setdefault(subject, {})
        slots = per_task.setdefault(task, {"npy": None, "csv": None})
        slots[suffix] = path
    return index


def load_tc(entry: dict) -> np.ndarray:
    """
    Load one timecourse matrix as float64.

    ``entry`` maps "npy" / "csv" to a path (or None). The .npy file wins
    when it exists on disk; otherwise the .csv file is read with pandas.

    Raises FileNotFoundError when neither file is available and ValueError
    when the loaded array is not 2-D.
    """
    npy_path = entry.get("npy")
    csv_path = entry.get("csv")

    if npy_path is not None and npy_path.exists():
        raw = np.load(npy_path)
    elif csv_path is not None and csv_path.exists():
        raw = pd.read_csv(csv_path).values
    else:
        raise FileNotFoundError("No npy/csv found for this entry.")

    X = raw.astype(np.float64)
    if X.ndim == 2:
        return X
    raise ValueError(f"Bad timecourse shape {X.shape}")


# ---- FEATURE FUNCTIONS ----
def lag1_autocorr(x: np.ndarray) -> float:
    """
    Lag-1 autocorrelation of a 1-D series.

    Computed as the Pearson correlation between the series without its last
    sample and the series without its first sample.

    Returns 0.0 for degenerate input: fewer than 2 samples, or when either
    shifted copy has zero variance (the correlation is undefined there).
    NaN values in ``x`` are not filtered and propagate to the result.
    """
    x = np.asarray(x, dtype=np.float64)
    # With fewer than 2 samples the shifted copies are empty; std()/corrcoef
    # would emit RuntimeWarnings and return NaN, so short-circuit instead.
    if x.size < 2:
        return 0.0
    x0 = x[:-1]
    x1 = x[1:]
    if x0.std() == 0 or x1.std() == 0:
        return 0.0
    return float(np.corrcoef(x0, x1)[0, 1])


def spectral_low_ratio(x: np.ndarray, tr: float, f_low=(0.01, 0.1)) -> float:
    """
    Share of the non-DC spectral power that lies inside the band ``f_low``
    (default 0.01-0.1 Hz, both edges inclusive).

    ``tr`` is the sample spacing in seconds and is required here; a caller
    without a TR has to skip this feature itself.
    Returns 0.0 when the signal has no power outside DC.
    """
    centered = x - x.mean()
    hz = np.fft.rfftfreq(len(centered), d=tr)
    power = np.abs(np.fft.rfft(centered)) ** 2

    # keep strictly positive frequencies only (drops the 0 Hz bin)
    positive = hz > 0
    hz, power = hz[positive], power[positive]

    total = power.sum()
    if total == 0:
        return 0.0

    lo = f_low[0]
    hi = f_low[1]
    in_band = (hz >= lo) & (hz <= hi)
    return float(power[in_band].sum() / total)


def per_ic_features(X: np.ndarray, prefix: str, tr: Optional[float]) -> dict:
    """
    Build the per-component feature dict for one timecourse matrix.

    X: (T, K), one column per IC.
    For every column the following keys are written, in this order:
      - {prefix}_ICxx_logstd    log of the population std (ddof=0)
      - {prefix}_ICxx_ac1       lag-1 autocorrelation
      - {prefix}_ICxx_lowRatio  low-band power ratio (only when tr is given)
    ICs are numbered from 01.
    """
    tiny = 1e-12  # keeps log() finite when a column has zero variance
    out = {}
    n_ics = X.shape[1]
    for number in range(1, n_ics + 1):
        series = X[:, number - 1]
        stem = f"{prefix}_IC{number:02d}"
        out[f"{stem}_logstd"] = float(np.log(series.std(ddof=0) + tiny))
        out[f"{stem}_ac1"] = lag1_autocorr(series)
        if tr is not None:
            out[f"{stem}_lowRatio"] = spectral_low_ratio(series, tr=tr)
    return out


def fisher_z_corr(X: np.ndarray) -> np.ndarray:
    """
    Column-wise Pearson correlation matrix of X, Fisher z-transformed.

    Correlations are clipped to +/-0.999999 before arctanh so the transform
    stays finite; the diagonal of the returned matrix is set to 0.
    """
    limit = 0.999999
    r = np.corrcoef(X, rowvar=False)
    z = np.arctanh(np.clip(r, -limit, limit))
    # self-correlations carry no information: zero them after the transform
    np.fill_diagonal(z, 0.0)
    return z


def upper_triangle_features(M: np.ndarray, prefix: str) -> dict:
    """
    Turn the strict upper triangle (row < column) of M into named features.

    Keys look like "{prefix}_IC01_IC02" (1-based, zero-padded) and are
    emitted row by row; values are plain Python floats.
    """
    size = M.shape[0]
    # k=1 excludes the diagonal; indices come back in row-major order
    rows, cols = np.triu_indices(size, k=1)
    return {
        f"{prefix}_IC{r + 1:02d}_IC{c + 1:02d}": float(M[r, c])
        for r, c in zip(rows.tolist(), cols.tolist())
    }


# ---- BUILD FEATURE TABLES ----
# Index every timecourse file for the configured K, keyed by subject and task.
tc_index = discover_timecourses(TC_DIR, K=K)
subjects = sorted(tc_index.keys())

rows_static = []           # per-IC features: L1, Ln and DELTA (Ln - L1)
rows_conn_delta = []       # DELTA connectivity (Ln - L1, Fisher z)
rows_conn_ln = []          # Ln connectivity
manifest_rows = []         # per-subject timecourse lengths
missing_pairs = 0          # subjects lacking at least one of the two tasks

for sub in subjects:
    # Only subjects with BOTH tasks can contribute (delta features need the pair).
    if TASK_L1 not in tc_index[sub] or TASK_LN not in tc_index[sub]:
        missing_pairs += 1
        continue

    X1 = load_tc(tc_index[sub][TASK_L1])
    X2 = load_tc(tc_index[sub][TASK_LN])

    # Guard against files whose name says K but whose content disagrees.
    if X1.shape[1] != K or X2.shape[1] != K:
        raise ValueError(f"{sub} wrong K: {X1.shape}, {X2.shape}")

    # static features per task
    f1 = per_ic_features(X1, "L1", tr=TR)
    f2 = per_ic_features(X2, "Ln", tr=TR)

    # delta features (Ln - L1) for each feature
    delta = {}
    for key, v in f1.items():
        ln_key = key.replace("L1_", "Ln_")
        delta[key.replace("L1_", "DELTA_")] = f2[ln_key] - v

    row = {"subject": sub}
    row.update(f1)
    row.update(f2)
    row.update(delta)
    rows_static.append(row)

    # connectivity: Ln, L1, and DELTA
    Z1 = fisher_z_corr(X1)
    Z2 = fisher_z_corr(X2)
    ZD = Z2 - Z1

    row_ln = {"subject": sub}
    row_ln.update(upper_triangle_features(Z2, prefix="Ln_zcorr"))
    rows_conn_ln.append(row_ln)

    rowd = {"subject": sub}
    rowd.update(upper_triangle_features(ZD, prefix="DELTA_zcorr"))
    rows_conn_delta.append(rowd)

    manifest_rows.append({
        "subject": sub,
        "T_L1": X1.shape[0],
        "T_Ln": X2.shape[0],
        "has_TR_features": TR is not None
    })

# Without at least one complete pair the tables would be empty and
# set_index("subject") below would fail with an opaque KeyError; stop here
# with a message that points at the likely cause (wrong folder, K or task names).
if not rows_static:
    raise FileNotFoundError(
        f"No subject in {TC_DIR} has both '{TASK_L1}' and '{TASK_LN}' "
        f"timecourses for K={K} "
        f"({len(subjects)} subject(s) indexed, {missing_pairs} skipped)."
    )

# One row per subject, indexed and sorted by subject ID.
static_df = pd.DataFrame(rows_static).set_index("subject").sort_index()
ln_conn_df = pd.DataFrame(rows_conn_ln).set_index("subject").sort_index()
delta_conn_df = pd.DataFrame(rows_conn_delta).set_index("subject").sort_index()
manifest_df = pd.DataFrame(manifest_rows)

print("Built static feature table:", static_df.shape)
print("Built Ln connectivity table:", ln_conn_df.shape)
print("Built DELTA connectivity table:", delta_conn_df.shape)
print("Missing L1/Ln pairs:", missing_pairs)

static_df.to_csv(OUT_STATIC)
ln_conn_df.to_csv(OUT_LN_CONN)
delta_conn_df.to_csv(OUT_DELTA_CONN)
manifest_df.to_csv(OUT_MANIFEST, index=False)

print("Saved:", OUT_STATIC)
print("Saved:", OUT_LN_CONN)
print("Saved:", OUT_DELTA_CONN)
print("Saved:", OUT_MANIFEST)


Built static feature table: (26, 120)
Built Ln connectivity table: (26, 190)
Built DELTA connectivity table: (26, 190)
Missing L1/Ln pairs: 0
Saved: /Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA/features_static_nonZ_K20.csv
Saved: /Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA/features_ln_conn_pearsonZ_K20.csv
Saved: /Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA/features_delta_conn_pearsonZ_K20.csv
Saved: /Users/onilarasanjala/Desktop/TSeme/CogNeuSci/CodeData/NewICA/feature_build_manifest_K20.csv
