In [8]:
#!/usr/bin/env python3
"""
Pipeline: EEG → 8-level DWT (MAX_LVL) → per-minute segments → 10 s sub-windows (50% overlap) → dPTE  
Per subject, saves NPZ: sub-{ID}_PTE_{group}.npz  
with:
  pte_data: ndarray of shape (n_minutes, n_subwins, n_bands, n_ch, n_ch)
  subject_id: int
"""

import os
import glob
import numpy as np
import mne
import pywt
from scipy.signal import hilbert
import numba as nb

# ───────────── CONFIGURATION ─────────────────────────────────────────────────
# Decomposition depth handed to pywt.wavedec in reconstruct_band_dwt.
MAX_LVL   = 8
# Wavelet used for both decomposition and reconstruction.
WAVELET   = 'db4'
# Band name → indices into the coefficient list returned by pywt.wavedec,
# which is ordered [cA8, cD8, cD7, ..., cD1].  Index 0 (approximation) and
# index 8 (cD1) are never selected.  The frequency ranges below are the nominal
# dyadic sub-bands for the 256 Hz rate that process_one_subject resamples to.
band2levels = {
    'delta': [1,2,3],   # D8–D6 → 0.5–4 Hz
    'theta': [4],       # D5 → 4–8 Hz
    'alpha': [5],       # D4 → 8–16 Hz
    'beta':  [6],       # D3 → 16–32 Hz
    'gamma': [7]        # D2 → 32–64 Hz
}
# Band order used for the band axis of the saved array.
band_list      = list(band2levels.keys())
MINUTE_LEN     = 60     # seconds per “minute” segment
SUBWIN_LEN     = 10     # seconds per sub-window
OVERLAP        = 0.5    # 50% overlap
# Dataset root (globbed as sub-*/eeg/*.set) and destination folder for the NPZ files.
DATA_DIR  = "/home/s.dharia-ra/Shyamal/EEG_Phase_Project/major-revisions/dataset"
OUT_DIR   = "/home/s.dharia-ra/Shyamal/EEG_Phase_Project/major-revisions/features"

# ───────────── HELPERS ────────────────────────────────────────────────────────
def get_subject_id(fp: str) -> "int | None":
    """Return the integer subject ID encoded in a path, or None if absent.

    The path is split on ``os.sep`` and the first component that begins with
    ``sub-`` followed by digits supplies the ID (``sub-037`` → 37).  Only the
    leading digit run is used, so a component carrying a suffix, such as the
    file name ``sub-037_task-eyesclosed_eeg.set``, is parsed instead of
    raising ``ValueError`` from ``int()``.

    Parameters
    ----------
    fp : str
        File path to inspect.

    Returns
    -------
    int | None
        Subject number, or None when no component matches.
    """
    import re  # function-scope import keeps the helper self-contained

    for part in fp.split(os.sep):
        found = re.match(r'sub-\s*(\d+)', part)
        if found:
            return int(found.group(1))
    return None

@nb.njit(fastmath=True, cache=True)
def _entropy(counts, length):
    """Shannon entropy in bits of a histogram.

    Each bin with a positive count ``c`` contributes ``-p*log2(p)`` with
    ``p = c / length``; bins that are not positive are skipped.
    """
    total = 0.0
    for occupancy in counts:
        if not occupancy > 0:
            continue
        prob = occupancy / length
        total -= prob * np.log2(prob)
    return total

@nb.njit(fastmath=True, cache=True)
def compute_PTE_numba(phase, delay):
    """Pairwise phase transfer entropy between all rows of `phase`.

    `phase` is a 2-D array (rows × samples) of bin indices; it is fed to
    np.bincount, so values are presumably non-negative integers as produced
    by discretize_phase — TODO confirm for other callers.  `delay` is the
    sample lag between a row's "past" and "future" slices.

    Returns an (m, m) float64 array where entry [i, j] is computed with row i
    as x (source) and row j as y (target):
        H(y', y) + H(y, x) - H(y) - H(y', y, x)
    with y' being y shifted forward by `delay` samples.
    """
    m,n = phase.shape
    raw = np.zeros((m,m), np.float64)
    # Number of usable samples once the delayed copy has been aligned.
    L   = n - delay
    for i in range(m):
        x = phase[i,:L]
        for j in range(m):
            y   = phase[j,:L]
            ypr = phase[j,delay:]       # y advanced by `delay` samples
            # Exclusive upper bound on bin indices; used as the radix when
            # packing 2 or 3 bin indices into one flat histogram index.
            vmax = int(max(x.max(), y.max(), ypr.max()) + 1)
            cnt_y     = np.bincount(y, minlength=vmax)
            idx_ypr_y = ypr + vmax*y
            cnt_ypr_y = np.bincount(idx_ypr_y, minlength=vmax*vmax)
            idx_y_x   = y + vmax*x
            cnt_y_x   = np.bincount(idx_y_x, minlength=vmax*vmax)
            idx_3d    = ypr + vmax*(y + vmax*x)
            cnt_3d    = np.bincount(idx_3d, minlength=vmax*vmax*vmax)
            Hy   = _entropy(cnt_y,   L)
            # Note: despite the name, Hypr is the joint entropy of (y', y).
            Hypr = _entropy(cnt_ypr_y,L)
            Hyx  = _entropy(cnt_y_x, L)
            Hyprx= _entropy(cnt_3d,  L)
            raw[i,j] = Hypr + Hyx - Hy - Hyprx
    return raw

@nb.njit(fastmath=True, cache=True)
def dPTE_from_raw(raw):
    """Normalise a raw PTE matrix into directed PTE (dPTE).

    For every off-diagonal pair the result is
        dPTE[i, j] = raw[i, j] / (raw[i, j] + raw[j, i])
    so that dPTE[i, j] + dPTE[j, i] == 1 and values above 0.5 indicate
    preferential flow from row i to row j.  The diagonal is set to 0.

    The previous version transposed the ratio for the lower triangle
    (``(raw/sym).T``), which copied the upper triangle into the lower one and
    produced a symmetric matrix with no directional information.  Because
    ``sym`` is symmetric, the lower triangle is simply ``raw/sym`` itself.

    Pairs whose two raw values sum to zero yield 0/0 exactly as before.
    """
    ratio = raw / (raw + raw.T)
    # k=1 / k=-1 drop the diagonal, where raw[i, i] / (2 * raw[i, i]) is 0/0.
    return np.triu(ratio, 1) + np.tril(ratio, -1)

def reconstruct_band_dwt(data, levels):
    """Rebuild `data` from a subset of its DWT coefficient arrays.

    `data` is decomposed along axis 1 with WAVELET to MAX_LVL levels; every
    coefficient array is zeroed except those whose list positions appear in
    `levels`, and the inverse transform of that masked list is returned.
    """
    decomposition = pywt.wavedec(data, WAVELET, axis=1, level=MAX_LVL)
    selected = list(map(np.zeros_like, decomposition))
    for position in levels:
        selected[position] = decomposition[position]
    return pywt.waverec(selected, WAVELET, axis=1)

def get_delay(phase):
    """Estimate the analysis lag (in samples) from the phase sign-change rate.

    The delay is ``round(m * n / k)`` where ``phase`` has shape (m, n) and
    ``k`` is the number of sign changes between *consecutive* samples, summed
    over all rows.

    Parameters
    ----------
    phase : ndarray, shape (m, n)
        Instantaneous phase, one row per channel.

    Returns
    -------
    int
        Delay in samples.

    Raises
    ------
    ValueError
        If the phase never changes sign, so no delay can be derived
        (previously this surfaced as an opaque overflow from ``round(inf)``).
    """
    m, n = phase.shape
    # Compare each sample with its predecessor only.  np.roll would also pair
    # the first sample with the last one, counting a sign change that does
    # not exist in the signal.
    crossings = int((phase[:, 1:] * phase[:, :-1] < 0).sum())
    if crossings == 0:
        raise ValueError("get_delay: phase has no sign changes; cannot estimate delay")
    return int(round(m * n / crossings))

def get_binsize(phase, c=3.49):
    """Histogram bin width: ``c * mean(row-wise sample std) * n ** (-1/3)``.

    `phase` must be 2-D (rows × n samples); the standard deviation is taken
    along axis 1 with ``ddof=1`` and then averaged over rows.
    """
    _, n_samples = phase.shape
    row_sd = np.std(phase, axis=1, ddof=1)
    mean_sd = np.mean(row_sd)
    return c * mean_sd * n_samples ** (-1 / 3)

def discretize_phase(phase, binsz):
    """Map phase values to int32 bin indices via ``ceil(phase / binsz)``."""
    scaled = phase / binsz
    bin_index = np.ceil(scaled)
    return bin_index.astype(np.int32)

# ───────────── PROCESS ONE SUBJECT ────────────────────────────────────────────
def process_one_subject(fp):
    """Compute sub-window dPTE matrices for one EEGLAB recording.

    The file is loaded with MNE, resampled to 256 Hz and cut into consecutive
    MINUTE_LEN-second segments (a trailing partial segment is dropped).  For
    each segment and each band in `band_list`, the band signal is rebuilt
    with reconstruct_band_dwt, its Hilbert phase is taken, and delay and bin
    size are estimated once on the whole segment.  The discretised phase is
    then sliced into SUBWIN_LEN-second windows advanced by
    ``SUBWIN_LEN * (1 - OVERLAP)`` seconds, and dPTE is computed per window.

    Returns
    -------
    (subj_id, dp_subject)
        Subject ID parsed from `fp`, and a float64 array of shape
        (n_minutes, n_subwins, n_bands, n_ch, n_ch).
    """
    recording = mne.io.read_raw_eeglab(fp, preload=True, verbose='ERROR')
    recording.resample(256)
    fs = recording.info['sfreq']
    data_full = recording.get_data()           # (n_ch, total_samples)
    n_ch, total_samples = data_full.shape

    minute_samples = int(MINUTE_LEN * fs)
    subwin_samples = int(SUBWIN_LEN * fs)
    step = int(subwin_samples * (1 - OVERLAP))

    # Whole segments only; sub-window count per segment.
    n_minutes = total_samples // minute_samples
    n_subwins = (minute_samples - subwin_samples) // step + 1
    window_starts = range(0, minute_samples - subwin_samples + 1, step)

    dp_subject = np.zeros(
        (n_minutes, n_subwins, len(band_list), n_ch, n_ch),
        dtype=np.float64,
    )

    for mi in range(n_minutes):
        seg_begin = mi * minute_samples
        seg60 = data_full[:, seg_begin:seg_begin + minute_samples]

        for bi, band in enumerate(band_list):
            band_data = reconstruct_band_dwt(seg60, band2levels[band])
            analytic = hilbert(band_data, axis=1)
            phase = np.angle(analytic)

            # Lag and bin width come from the full segment, not each window.
            delay = get_delay(phase)
            binsz = get_binsize(phase)
            # Shift by π so the values passed to the binning are non-negative.
            dph = discretize_phase(phase + np.pi, binsz)

            for wi, start in enumerate(window_starts):
                stop = start + subwin_samples
                raw_pte = compute_PTE_numba(dph[:, start:stop], delay)
                dp_subject[mi, wi, bi] = dPTE_from_raw(raw_pte)

    return get_subject_id(fp), dp_subject

# ───────────── MAIN & SAVE ───────────────────────────────────────────────────
if __name__ == '__main__':
    # Entry point: find every recording under DATA_DIR, assign it to a
    # diagnostic group by subject number, and write one NPZ per subject.
    os.makedirs(OUT_DIR, exist_ok=True)

    # glob returns matches in arbitrary order; sort for a reproducible run.
    all_paths = sorted(glob.glob(f"{DATA_DIR}/sub-*/eeg/*.set"))
    groups    = {'alz':[], 'ctrl':[], 'ftd':[]}
    for fp in all_paths:
        sid = get_subject_id(fp)
        if sid is None:
            # No parsable ID: skip instead of failing on `None <= 36`.
            continue
        # Subject-number ranges: ≤36 → alz, 37–65 → ctrl, ≥66 → ftd.
        if sid <= 36: groups['alz'].append(fp)
        elif sid <= 65: groups['ctrl'].append(fp)
        else: groups['ftd'].append(fp)

    for grp, paths in groups.items():
        for fp in paths:
            subj_id, dp = process_one_subject(fp)
            out_f = os.path.join(OUT_DIR, f"sub-{subj_id}_PTE_{grp}.npz")
            print(dp.shape)
            np.savez(out_f, pte_data=dp, subject_id=subj_id)
            print(f"Saved {out_f}  shape={dp.shape}")


joblib supports memapping pool but "MNE_CACHE_DIR" is not set in MNE-Python config. To enable it, use, e.g., mne.set_cache_dir('/tmp/shm'). This will store temporary files under /dev/shm and can result in large memory savings.
(13, 11, 5, 19, 19)
Saved /home/s.dharia-ra/Shyamal/EEG_Phase_Project/major-revisions/features/sub-2_PTE_alz.npz  shape=(13, 11, 5, 19, 19)
joblib supports memapping pool but "MNE_CACHE_DIR" is not set in MNE-Python config. To enable it, use, e.g., mne.set_cache_dir('/tmp/shm'). This will store temporary files under /dev/shm and can result in large memory savings.
(9, 11, 5, 19, 19)
Saved /home/s.dharia-ra/Shyamal/EEG_Phase_Project/major-revisions/features/sub-30_PTE_alz.npz  shape=(9, 11, 5, 19, 19)
joblib supports memapping pool but "MNE_CACHE_DIR" is not set in MNE-Python config. To enable it, use, e.g., mne.set_cache_dir('/tmp/shm'). This will store temporary files under /dev/shm and can result in large memory savings.
(13, 11, 5, 19, 19)
Saved /home/s.dharia

In [9]:
import glob
# All EEGLAB .set recordings under ./dataset/sub-*/eeg/ (relative to the
# current working directory); glob does not guarantee any ordering.
file_paths = glob.glob('./dataset/sub-*/eeg/*.set')

def get_subject_id(filepath):
    """Return the integer subject ID encoded in a path, or None if absent.

    The path is split on ``os.sep`` and the first component that *begins*
    with ``sub-`` followed by digits supplies the ID (``sub-037`` → 37).
    Matching only at the start of a component and reading only the leading
    digit run avoids the ``ValueError`` the previous substring test produced
    for components such as ``my-sub-folder`` or
    ``sub-037_task-eyesclosed_eeg.set``.
    """
    import re  # function-scope import keeps the helper self-contained

    for part in filepath.split(os.sep):
        found = re.match(r'sub-\s*(\d+)', part)
        if found:
            return int(found.group(1))
    return None

# Bucket recordings by subject number: 1–36 → alz, 37–65 → ctrl, ≥66 → ftd.
# Paths without a parsable ID, or with an ID below 1, are left out.
alz_file_paths = []
ctrl_file_paths = []
ftd_file_paths = []

for fpath in file_paths:
    subj_id = get_subject_id(fpath)
    if subj_id is None or subj_id < 1:
        continue
    if subj_id <= 36:
        alz_file_paths.append(fpath)
    elif subj_id <= 65:
        ctrl_file_paths.append(fpath)
    else:
        ftd_file_paths.append(fpath)

In [5]:
#!/usr/bin/env python3
"""
Per‐subject DE extraction via DWT bands over the entire recording, using 60 s windows whose overlap fraction is set by OVERLAP (0.0 below, i.e. non-overlapping).
Saves sub-{ID}_DE_{label}.npz with:
  DE_features: ndarray (n_windows, n_channels, n_bands)
  label:       int

Also prints per file:
  • number of windows (1-minute segments) extracted
  • number of channels
  • number of bands (features)
  • shape of the DE_features array
"""

import os
import numpy as np
import mne
import pywt
import math

# ───────────── CONFIG ─────────────────────────────────────────────────────────
# Decomposition depth handed to pywt.wavedec in reconstruct_band_dwt.
MAX_LVL   = 8
# Wavelet used for both decomposition and reconstruction.
WAVELET   = 'db4'
# Band name → indices into the pywt.wavedec coefficient list
# [cA8, cD8, cD7, ..., cD1].  Each detail level is a dyadic sub-band, so at
# SFREQ = 256 Hz the nominal ranges are the ones given below (they are wider
# than the conventional clinical alpha/beta/gamma limits).
band2levels = {
    'delta': [1,2,3],   # D8–D6 → ~0.5–4 Hz
    'theta': [4],       # D5 → ~4–8 Hz
    'alpha': [5],       # D4 → ~8–16 Hz
    'beta':  [6],       # D3 → ~16–32 Hz
    'gamma': [7]        # D2 → ~32–64 Hz
}
# Band order used for the last axis of DE_features.
band_list = list(band2levels.keys())

WIN_LEN  = 60    # window length in seconds
OVERLAP  = 0.0   # fraction of overlap between consecutive windows (0.0 = none)
SFREQ    = 256   # target sampling rate (Hz)

# ───────────── HELPERS ────────────────────────────────────────────────────────
def get_subject_id(fp: str) -> "int | None":
    """Extract the integer subject ID from a path segment 'sub-##'.

    The path is split on ``os.sep`` and the first component that begins with
    ``sub-`` followed by digits supplies the ID (``sub-037`` → 37).  Only the
    leading digit run is used, so a component with a suffix, such as the file
    name ``sub-037_task-eyesclosed_eeg.set``, is parsed instead of raising
    ``ValueError`` from ``int()``.

    Returns None when no component matches.
    """
    import re  # function-scope import keeps the helper self-contained

    for part in fp.split(os.sep):
        found = re.match(r'sub-\s*(\d+)', part)
        if found:
            return int(found.group(1))
    return None

def reconstruct_band_dwt(data: np.ndarray, levels: list[int]) -> np.ndarray:
    """Rebuild `data` from only the DWT coefficient arrays listed in `levels`.

    `data` is decomposed along axis 1 with WAVELET to MAX_LVL levels; all
    coefficient arrays are zeroed except those at the list positions in
    `levels`, and the inverse transform of the masked list is returned.
    """
    decomposition = pywt.wavedec(data, WAVELET, axis=1, level=MAX_LVL)
    selected = list(map(np.zeros_like, decomposition))
    for position in levels:
        selected[position] = decomposition[position]
    return pywt.waverec(selected, WAVELET, axis=1)

def compute_DE(signal: np.ndarray) -> float:
    """Differential entropy under a Gaussian assumption: 0.5 * ln(2πe σ²).

    σ² is the unbiased sample variance (``ddof=1``) of `signal`.

    A perfectly flat signal has zero variance; its differential entropy is
    the limit -inf, which is returned instead of letting ``math.log(0)``
    abort the run with "math domain error".
    """
    var = np.var(signal, ddof=1)
    if var == 0:
        return -math.inf
    return 0.5 * math.log(2 * math.pi * math.e * var)

# ───────────── MAIN ───────────────────────────────────────────────────────────
def compute_DE_sliding(files_paths, output_dir, label):
    """Extract windowed differential-entropy features and save one NPZ per file.

    Each recording is loaded with MNE, resampled to SFREQ, scaled by 1e6 and
    cut into WIN_LEN-second windows advanced by ``WIN_LEN * (1 - OVERLAP)``
    seconds.  For every window, band and channel the band-limited signal
    from reconstruct_band_dwt is passed to compute_DE.

    Parameters
    ----------
    files_paths : iterable of str
        EEGLAB .set files to process.
    output_dir : str
        Folder for ``sub-{ID}_DE_{label}.npz`` (created if missing).
    label : str
        Group name; stored as 0 for 'ctrl', 1 for 'alz' or 'ftd', and -1
        for anything else.
    """
    # NOTE(review): 'alz' and 'ftd' share label 1 — confirm this binary
    # labelling is intended.
    lbl = {'ctrl':0, 'alz':1, 'ftd':1}.get(label, -1)
    os.makedirs(output_dir, exist_ok=True)
    n_bands = len(band_list)

    for fp in files_paths:
        print(f"\n→ Processing {fp}")
        raw = mne.io.read_raw_eeglab(fp, preload=True, verbose='ERROR')
        raw.resample(SFREQ)

        # Whole recording, scaled by 1e6 (original comment: µV).
        data = raw.get_data() * 1e6
        fs = int(raw.info['sfreq'])
        n_ch, n_samp = data.shape

        win = int(WIN_LEN * fs)
        step = int(win * (1 - OVERLAP))
        n_wins = (n_samp - win) // step + 1

        print(f"Number of 60s windows: {n_wins}")
        print(f"Number of channels:    {n_ch}")
        print(f"Number of bands:       {n_bands}")

        DE_values = np.zeros((n_wins, n_ch, n_bands), dtype=float)

        for wi, start in enumerate(range(0, n_samp - win + 1, step)):
            seg = data[:, start:start + win]  # (n_ch, win)
            for bi, band in enumerate(band_list):
                band_sig = reconstruct_band_dwt(seg, band2levels[band])
                DE_values[wi, :, bi] = [
                    compute_DE(band_sig[ch]) for ch in range(n_ch)
                ]

        subj_id = get_subject_id(fp)
        out_file = os.path.join(output_dir, f"sub-{subj_id}_DE_{label}.npz")
        print("DE_values shape:", DE_values.shape)
        np.savez_compressed(out_file, DE_features=DE_values, label=lbl)

        print(f"✔ Saved: {out_file}")
        print(f"  DE_features shape: {DE_values.shape}")

# ───────────── USAGE EXAMPLE ─────────────────────────────────────────────────
if __name__ == '__main__':
    # Uses the per-group path lists (ctrl_file_paths, alz_file_paths,
    # ftd_file_paths) built by the grouping cell; that cell must run first.

    # All groups write into the same folder; the group name is part of each
    # output file name.
    OUT_DIR = "./DE_results"

    compute_DE_sliding(ctrl_file_paths, OUT_DIR, 'ctrl')
    compute_DE_sliding(alz_file_paths,  OUT_DIR, 'alz')
    compute_DE_sliding(ftd_file_paths,  OUT_DIR, 'ftd')



→ Processing ./dataset/sub-037/eeg/sub-037_task-eyesclosed_eeg.set
joblib supports memapping pool but "MNE_CACHE_DIR" is not set in MNE-Python config. To enable it, use, e.g., mne.set_cache_dir('/tmp/shm'). This will store temporary files under /dev/shm and can result in large memory savings.
Number of 60s windows: 12
Number of channels:    19
Number of bands:       5
DE_values shape: (12, 19, 5)
✔ Saved: ./DE_results/sub-37_DE_ctrl.npz
  DE_features shape: (12, 19, 5)

→ Processing ./dataset/sub-044/eeg/sub-044_task-eyesclosed_eeg.set
joblib supports memapping pool but "MNE_CACHE_DIR" is not set in MNE-Python config. To enable it, use, e.g., mne.set_cache_dir('/tmp/shm'). This will store temporary files under /dev/shm and can result in large memory savings.
Number of 60s windows: 14
Number of channels:    19
Number of bands:       5
DE_values shape: (14, 19, 5)
✔ Saved: ./DE_results/sub-44_DE_ctrl.npz
  DE_features shape: (14, 19, 5)

→ Processing ./dataset/sub-049/eeg/sub-049_task-e