# Healthy vs. PDs — Group Reproducibility & Similarity Analyses

**What it does:**
- Loads session-wise **KSG binary adjacency matrices** (per-edge permutation test; no across-edge FDR).
- Builds **group edge-presence matrices** 
- Defines **robust edges** at 70% and 90% presence
- Computes **Healthy–PD difference maps** 
- Computes **Jaccard overlaps** between the Healthy robust-edge set and that of each PD condition (PD-off, PD-on)
- Generates **overlay figures** of robust backbones

**Inputs required:**
- Precomputed session-level KSG adjacency binaries (`*_combined_ksg_binary.npy`) saved under `Results/ksg_results/`.
- `subject_session_metadata.csv` for mapping sessions to groups.

**Outputs:**
- CSVs with presence matrices and overlap metrics.
- Figures for group difference maps and overlays.
- Values that appear in **Table 4.X** and **Figure 4.X** of the thesis.

**Notes:**
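- No across-edge FDR is applied to the session-level binaries themselves; the Healthy-vs-PD edge-wise permutation test in this notebook does apply BH–FDR across its candidate edges.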
- Presence thresholds are 70% and 90% (50% is computed but not emphasized).


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import re

# Filesystem layout: inputs live under BASE, figures are written to OUTDIR.
BASE = Path("/lustre/majlepy2/myproject")
META = BASE.joinpath("subject_session_metadata.csv")
KSGDIR = BASE.joinpath("Results/ksg_results")
OUTDIR = Path("/home/majlepy2/myproject/Step-wise/figs")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Session metadata, with a combined "<subject>_<session>" key column.
meta = pd.read_csv(META)
meta["sub_ses"] = meta["subject"] + "_" + meta["session"]

# Map "<subject>_<session>" (file name minus the fixed suffix) -> file path.
bin_files = sorted(KSGDIR.glob("*_combined_ksg_binary.npy"))
sessions = {
    path.name.replace("_combined_ksg_binary.npy", ""): path
    for path in bin_files
}
print(f"Found {len(sessions)} binary adjacency matrices.")


Found 36 binary adjacency matrices.


In [2]:
# Bucket each session's adjacency file under its group label; rows whose
# group is not one of the three keys below are ignored.
group_bins = {label: [] for label in ("healthy", "PD-off", "PD-on")}

for _, row in meta.iterrows():
    path = sessions.get(f"{row['subject']}_{row['session']}")
    if not path:
        # No binary matrix was found on disk for this metadata row.
        continue
    if row["group"] not in group_bins:
        continue
    group_bins[row["group"]].append(path)

for label in group_bins:
    print(f"{label}: {len(group_bins[label])} sessions")

# Load into arrays
def load_stack(files):
    """
    Load every ``.npy`` file in *files* as int8 and stack them along a new
    leading axis. Returns ``None`` when *files* yields nothing.
    """
    loaded = []
    for path in files:
        loaded.append(np.load(path).astype(np.int8))
    if not loaded:
        return None
    return np.stack(loaded, axis=0)

# Stack the session matrices of each group into one array per group.
mats_H   = load_stack(group_bins["healthy"])
mats_OFF = load_stack(group_bins["PD-off"])
mats_ON  = load_stack(group_bins["PD-on"])

# load_stack returns None for an empty file list; fail here with a clear
# message instead of an AttributeError on `.shape` further down.
_missing_groups = [
    name
    for name, stack in (("healthy", mats_H), ("PD-off", mats_OFF), ("PD-on", mats_ON))
    if stack is None
]
if _missing_groups:
    raise ValueError(
        "No session matrices found for group(s): " + ", ".join(_missing_groups)
    )

S_H, N, _       = mats_H.shape
S_OFF, N_OFF, _ = mats_OFF.shape
S_ON, N_ON, _   = mats_ON.shape

# All later comparisons are element-wise across groups, so the node count
# must agree between the three stacks.
if not (N == N_OFF == N_ON):
    raise ValueError(
        f"Node count differs between groups: healthy={N}, PD-off={N_OFF}, PD-on={N_ON}"
    )
print(f"S_H={S_H}, S_OFF={S_OFF}, S_ON={S_ON}, N={N}")


healthy: 12 sessions
PD-off: 12 sessions
PD-on: 12 sessions
S_H=12, S_OFF=12, S_ON=12, N=23


In [3]:
import matplotlib.pyplot as plt

def independent_edge_test(mats_A, mats_B, label_A="GroupA", label_B="GroupB",
                          n_perm=20000, alpha=0.05, outstem="test", seed=20250829):
    """
    Compare per-edge presence between two independent groups.

    For every candidate edge (off-diagonal and present in at least one
    session of either group) the observed statistic is the difference in
    mean presence, ``mean_A - mean_B``. Its null distribution is built by
    randomly re-assigning sessions to the two groups; two-tailed p-values
    are ``(hits + 1) / (n_perm + 1)``. Benjamini–Hochberg FDR at level
    ``alpha`` is then applied across the candidate edges. A difference map
    is saved to ``OUTDIR / f"{outstem}_KSG_diffmap.png"`` with
    non-significant cells drawn faded.

    Parameters
    ----------
    mats_A, mats_B : array-like, shape (S, N, N)
        Session stacks of the two groups; N must match and be square.
    label_A, label_B : str
        Names used in printed output, figure labels and result keys.
    n_perm : int
        Number of label permutations.
    alpha : float
        BH–FDR level.
    outstem : str
        File-name stem of the saved figure.
    seed : int
        Seed of the permutation RNG (default reproduces earlier runs'
        permutation sequence).

    Returns
    -------
    dict
        ``"total"`` and the two directional counts of significant edges,
        plus ``"pvals"`` and ``"sig"`` (one entry per candidate edge, in
        ``mask`` order), ``"mask"`` (N×N bool) and ``"obs"`` (N×N
        difference matrix).

    Raises
    ------
    ValueError
        If a stack is ``None`` or the stacks are not matching (S, N, N).
    """
    if mats_A is None or mats_B is None:
        raise ValueError("mats_A and mats_B must be session stacks, got None")
    mats_A = np.asarray(mats_A)
    mats_B = np.asarray(mats_B)
    if (mats_A.ndim != 3 or mats_B.ndim != 3
            or mats_A.shape[1] != mats_A.shape[2]
            or mats_A.shape[1:] != mats_B.shape[1:]):
        raise ValueError(
            f"Expected stacks of shape (S, N, N) with equal N, got "
            f"{mats_A.shape} and {mats_B.shape}"
        )

    SA, N, _ = mats_A.shape
    SB = mats_B.shape[0]

    # Presence fractions per edge
    mean_A = mats_A.mean(axis=0)
    mean_B = mats_B.mean(axis=0)
    obs = mean_A - mean_B   # observed A−B

    # Candidate edges = off-diagonal, present at least once
    diag = np.eye(N, dtype=bool)
    ever = (mats_A.sum(axis=0) + mats_B.sum(axis=0)) > 0
    mask = (~diag) & ever
    E = int(mask.sum())
    print(f"Testing {E} candidate edges")

    # Restrict to candidate edges once, outside the permutation loop, and
    # stay in float64 so permuted and observed statistics are comparable.
    obs_vec = obs[mask]
    all_edges = np.concatenate([mats_A, mats_B], axis=0)[:, mask].astype(np.float64)
    labels = np.array([0]*SA + [1]*SB)

    # Two-tailed permutation test. Presence fractions are discrete, so exact
    # ties with the observed value are common; a small tolerance makes sure
    # they count as "at least as extreme" regardless of rounding.
    abs_obs = np.abs(obs_vec)
    tol = 1e-12 * np.maximum(1.0, abs_obs)
    hits = np.zeros(E, dtype=np.int64)
    rng = np.random.default_rng(seed)
    for _ in range(n_perm):
        in_A = rng.permutation(labels) == 0
        diff = all_edges[in_A].mean(axis=0) - all_edges[~in_A].mean(axis=0)
        hits += np.abs(diff) >= abs_obs - tol
    pvals = (hits + 1.0) / (n_perm + 1.0)

    # BH–FDR: find the largest rank k with p_(k) <= alpha*k/E and reject
    # every edge whose p-value is at or below that threshold.
    sig_vec = np.zeros(E, dtype=bool)
    if E:
        order = np.argsort(pvals)
        thresh = alpha * np.arange(1, E + 1) / E
        passed = pvals[order] <= thresh
        if passed.any():
            kmax = np.flatnonzero(passed).max()
            sig_vec = pvals <= thresh[kmax]

    # Counts
    sig_mat = np.zeros((N, N), dtype=bool)
    sig_mat[mask] = sig_vec
    vals = obs[sig_mat]
    total = vals.size
    n_A_gt_B = int((vals > 0).sum())
    n_B_gt_A = int((vals < 0).sum())
    print(f"Significant edges: total={total}, {label_A}>{label_B}={n_A_gt_B}, {label_B}>{label_A}={n_B_gt_A}")

    # Figure: symmetric colour scale around zero; significant cells opaque.
    fig, ax = plt.subplots(figsize=(7,6), dpi=200)
    try:
        # Fall back to 1.0 so an all-zero map still gets a valid colour range.
        vlim = float(np.abs(obs).max()) or 1.0
        im = ax.imshow(obs, vmin=-vlim, vmax=+vlim, cmap="bwr", interpolation="nearest")
        alpha_mask = np.where(sig_mat, 1.0, 0.15)
        im.set_alpha(alpha_mask)
        plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label=f"Mean presence {label_A}−{label_B}")
        ax.set_title(f"{label_A} vs {label_B} (KSG presence)\nBH–FDR α={alpha}")
        ax.set_xlabel("Target"); ax.set_ylabel("Source")
        out = OUTDIR / f"{outstem}_KSG_diffmap.png"
        plt.tight_layout()
        plt.savefig(out, bbox_inches="tight")
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)
    print(f"Saved: {out}")

    # Return summary
    return {"total": total, f"{label_A}>{label_B}": n_A_gt_B, f"{label_B}>{label_A}": n_B_gt_A,
            "pvals": pvals, "mask": mask, "sig": sig_vec, "obs": obs}


In [4]:
# Healthy against each PD group; one difference map is saved per comparison.
res_HvsOFF = independent_edge_test(
    mats_A=mats_H,
    mats_B=mats_OFF,
    label_A="Healthy",
    label_B="PD-off",
    outstem="Healthy_vs_PDoff",
)

res_HvsON = independent_edge_test(
    mats_A=mats_H,
    mats_B=mats_ON,
    label_A="Healthy",
    label_B="PD-on",
    outstem="Healthy_vs_PDon",
)


Testing 506 candidate edges
Significant edges: total=3, Healthy>PD-off=2, PD-off>Healthy=1
Saved: /home/majlepy2/myproject/Step-wise/figs/Healthy_vs_PDoff_KSG_diffmap.png
Testing 506 candidate edges
Significant edges: total=2, Healthy>PD-on=0, PD-on>Healthy=2
Saved: /home/majlepy2/myproject/Step-wise/figs/Healthy_vs_PDon_KSG_diffmap.png


In [5]:
def jaccard_overlap(mats_A, mats_B, thr, label_A, label_B):
    """
    Jaccard overlap of the robust-edge sets of two groups.

    An edge is robust in a group when its mean presence across that group's
    sessions is at least ``thr``. Diagonal entries (self-connections) are
    excluded, matching the candidate-edge definition used by
    ``independent_edge_test``; otherwise any non-zero diagonal would inflate
    both the intersection and the union.

    Parameters
    ----------
    mats_A, mats_B : ndarray, shape (S, N, N)
        Session stacks of the two groups.
    thr : float
        Minimum presence fraction for an edge to count as robust.
    label_A, label_B : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    float
        ``|A ∩ B| / |A ∪ B|``, or ``nan`` when neither group has a robust edge.
    """
    set_A = mats_A.mean(axis=0) >= thr
    set_B = mats_B.mean(axis=0) >= thr

    # Drop self-connections when the per-group map is a square matrix.
    if set_A.ndim == 2 and set_A.shape[0] == set_A.shape[1]:
        off_diag = ~np.eye(set_A.shape[0], dtype=bool)
        set_A = set_A & off_diag
        set_B = set_B & off_diag

    inter = (set_A & set_B).sum()
    union = (set_A | set_B).sum()
    return inter / union if union > 0 else np.nan

# Report the Healthy-vs-PD overlap of robust edges at each presence threshold.
for thrname, thr in (("70%", 0.7), ("90%", 0.9)):
    j_off, j_on = (
        jaccard_overlap(mats_H, stack, thr, "H", tag)
        for stack, tag in ((mats_OFF, "OFF"), (mats_ON, "ON"))
    )
    print(f"Jaccard {thrname}: H vs OFF={j_off:.3f}, H vs ON={j_on:.3f}")


Jaccard 70%: H vs OFF=0.560, H vs ON=0.643
Jaccard 90%: H vs OFF=0.129, H vs ON=0.201
