In [9]:
import re
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import entropy

# === Purpose ===============================================================
# This script computes session-level quality metrics. It runs after source
# estimation and before the segment selection that precedes PCA/ICA
# dimensionality reduction. The metrics are later used to decide which
# sessions to include in the final analysis set.
#
# According to the manuscript:
# - 12 PD (ON+OFF) and 12 CTL were retained.
# - Retention required high composite entropy and no low composite_all values.
# - For PD, BOTH ON and OFF sessions per subject must pass QC to ensure comparability.
# ===========================================================================

# === Config: paths to PCA/ICA-reduced source arrays ========================
base = Path("/lustre/majlepy2/myproject/subsets_pca_ica")
paths = sorted(base.rglob("*_pca_ica.npy"))
results = []
SCALE = 1e12  # optional rescaling to make metric magnitudes comparable


def compute_session_metrics(arr, scale=SCALE):
    """Compute the session-level quality metrics for one source array.

    Parameters
    ----------
    arr : array-like
        3-D array laid out as (labels, time, epochs).
    scale : float, optional
        Factor the data are multiplied by before the metrics are computed.
        Defaults to the module-level ``SCALE``.

    Returns
    -------
    dict
        Keys, in order: ``total_variance``, ``mean_peak_to_peak``,
        ``temporal_entropy``, ``spatial_std``.

    Raises
    ------
    ValueError
        If ``arr`` is not 3-dimensional.
    """
    arr = np.asarray(arr)
    if arr.ndim != 3:
        raise ValueError(
            f"expected a 3-D array (labels, time, epochs), got shape {arr.shape}"
        )
    n_times = arr.shape[1]

    # (labels, time, epochs) -> (time, labels*epochs): every column is the
    # time series of one source-epoch combination.
    data = arr.transpose(1, 0, 2).reshape(n_times, -1) * scale

    # Total variance: variance over every value in the matrix.
    total_var = np.var(data)

    # Mean peak-to-peak: dynamic range of each column, averaged over columns.
    mean_p2p = np.mean(np.ptp(data, axis=0))

    # "Entropy": Shannon entropy of the absolute values of each column,
    # averaged over columns (feeds 'composite_entropy'). scipy normalises each
    # column to sum to 1 before computing the entropy. Calling it once with
    # axis=0 replaces a Python-level loop over all columns.
    label_entropy = np.mean(entropy(np.abs(data), axis=0))

    # Spatial standard deviation: std across columns at each timepoint,
    # averaged over time.
    spatial_std_mean = np.mean(np.std(data, axis=1))

    # Mean pairwise correlation across columns was tested but is not used in
    # the final selection. For reference it was computed as:
    #   corr_matrix = np.corrcoef(data.T)
    #   np.mean(np.abs(corr_matrix[np.triu_indices_from(corr_matrix, k=1)]))

    return {
        "total_variance": total_var,
        "mean_peak_to_peak": mean_p2p,
        "temporal_entropy": label_entropy,
        "spatial_std": spatial_std_mean,
    }


for p in paths:
    # Deliberate best-effort boundary: a file that cannot be loaded or has an
    # unexpected shape is reported and skipped so the remaining sessions are
    # still processed.
    try:
        metrics = compute_session_metrics(np.load(p))
    except Exception as e:
        print(f"Failed on {p.name}: {e}")
        continue

    # Collect all metrics for this session.
    results.append({"file": p.name, "path": str(p), **metrics})

    print(
        f"{p.name}: var={metrics['total_variance']:.6f}, "
        f"p2p={metrics['mean_peak_to_peak']:.6f}, "
        f"ent={metrics['temporal_entropy']:.3f}, "
        f"std={metrics['spatial_std']:.6f}"
    )

# === Collect results into DataFrame ========================================
# Fail early with a clear message: an empty table has no 'file' column and
# would otherwise surface as an opaque KeyError on the next statement.
if not results:
    raise RuntimeError("No session files were processed; there is nothing to rank.")
df = pd.DataFrame(results)

# Extract subject numbers from filenames (assumes sub-### convention).
# expand=False returns a Series; names that do not match come back as NaN.
subject_ids = df['file'].str.extract(r'sub-(\d{3})', expand=False)
unmatched_files = df.loc[subject_ids.isna(), 'file'].tolist()
if unmatched_files:
    # Without this check the integer cast below fails on NaN with an error
    # that does not say which file is at fault.
    raise ValueError(
        f"Cannot extract a subject number (expected 'sub-###') from: {unmatched_files}"
    )
df['subject_num'] = subject_ids.astype(int)

# === Normalize metrics (0–1 per column) ====================================
# Min-max scale each raw metric across all sessions so the metrics are
# comparable before being combined into composites.
for col in ['total_variance', 'temporal_entropy', 'spatial_std', 'mean_peak_to_peak']:  # , 'mean_pairwise_corr'
    col_min = df[col].min()
    col_range = df[col].max() - col_min
    centered = df[col] - col_min
    if col_range == 0:
        # Single session or constant metric: min-max scaling would be 0/0 and
        # fill the column (and every composite built from it) with NaN.
        # `centered` is already 0.0 for every valid row, so use it directly.
        df[f'norm_{col}'] = centered
    else:
        df[f'norm_{col}'] = centered / col_range

# === Define composite metrics ==============================================
# The single-metric composites are aliases of the corresponding normalized
# columns (composite name -> normalized source column).
single_metric_composites = {
    'composite_variance': 'norm_total_variance',
    'composite_entropy': 'norm_temporal_entropy',
    'composite_spatial': 'norm_spatial_std',
}
for composite_name, norm_name in single_metric_composites.items():
    df[composite_name] = df[norm_name]

# A correlation composite (not used in the text) is disabled; it was:
# df['composite_corr'] = 1 - df['norm_mean_pairwise_corr']  # Lower correlation = better

# composite_all ("composite all" in the manuscript): variance, entropy and
# spatial_std, each weighted 0.25. The fourth 0.25 term, (1 - normalized mean
# pairwise correlation), is disabled, so the active weights sum to 0.75 and
# the value is a weighted sum with a maximum of 0.75, not a true mean.
equal_weight = 0.25
weighted_sum = equal_weight * df['norm_total_variance']
weighted_sum = weighted_sum + equal_weight * df['norm_temporal_entropy']
weighted_sum = weighted_sum + equal_weight * df['norm_spatial_std']
df['composite_all'] = weighted_sum

# === Group split (example by subject number) ===============================
# Placeholder partition on the numeric subject id: sub-001..031 go to group1,
# sub-032..056 to group2. A real analysis should split by PD vs CTL instead
# and require that both the ON and the OFF session of each PD participant
# pass QC.
subject_numbers = df['subject_num']
group1 = df.loc[subject_numbers.le(31)]
group2 = df.loc[subject_numbers.ge(32)]

# === Print results sorted by each composite metric =========================
# 'composite_corr' (and the 'mean_pairwise_corr' detail column) only exist
# when the optional correlation metric is enabled. Composites without a
# column in df are skipped; sorting by a missing column raises KeyError and
# would abort the loop before 'composite_all' is printed.
composites = [
    'composite_variance',
    'composite_entropy',
    'composite_spatial',
    'composite_corr',
    'composite_all'
]

# Raw metrics shown next to the composite that the table is sorted by.
detail_cols = ['total_variance', 'temporal_entropy', 'spatial_std', 'mean_peak_to_peak']
if 'mean_pairwise_corr' in df.columns:
    detail_cols.append('mean_pairwise_corr')

labelled_groups = [
    ("Group 1 (sub-001 to sub-031)", group1),
    ("Group 2 (sub-032 to sub-056)", group2),
]

for comp in composites:
    if comp not in df.columns:
        continue  # optional composite that was not computed
    print(f"\n===== Sorted by {comp} =====")
    for title, frame in labelled_groups:
        print(f"\n{title}:")
        # Highest composite value first.
        ranked = frame.sort_values(comp, ascending=False)
        print(ranked[['file', comp] + detail_cols].to_string(index=False))


sub-001_ses-01_label_tc3d.npy: var=9.181486, p2p=16.032649, ent=4.562, std=3.025936, corr=0.094
sub-002_ses-01_label_tc3d.npy: var=33.565651, p2p=32.171740, ent=4.557, std=5.776394, corr=0.100
sub-002_ses-02_label_tc3d.npy: var=6.675033, p2p=13.644007, ent=4.564, std=2.579749, corr=0.095
sub-005_ses-01_label_tc3d.npy: var=5.961667, p2p=13.519578, ent=4.556, std=2.432432, corr=0.092
sub-005_ses-02_label_tc3d.npy: var=10.696160, p2p=17.472705, ent=4.566, std=3.260211, corr=0.102
sub-006_ses-01_label_tc3d.npy: var=13.111994, p2p=19.505707, ent=4.563, std=3.614694, corr=0.105
sub-006_ses-02_label_tc3d.npy: var=5.810877, p2p=12.579188, ent=4.566, std=2.408271, corr=0.106
sub-007_ses-01_label_tc3d.npy: var=12.469138, p2p=18.823059, ent=4.562, std=3.527774, corr=0.097
sub-008_ses-01_label_tc3d.npy: var=9.535118, p2p=16.051955, ent=4.564, std=3.082986, corr=0.092
sub-008_ses-02_label_tc3d.npy: var=3.295679, p2p=9.092458, ent=4.574, std=1.812314, corr=0.088
sub-009_ses-01_label_tc3d.npy: var=9.