# Discrete Wavelet Transform (DWT)

Discrete Wavelet Transform (DWT) was applied to extract time-frequency features across delta, theta, alpha, and beta bands. Both FFT and DWT features were stored for each subject.

In [None]:
import mne
import numpy as np
import pandas as pd
import os
import pywt

BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"
SUBJ = "sub-001"
SUBJ_DIR = os.path.join(BASE_DIR, SUBJ)
DATA_DIR = os.path.join(SUBJ_DIR, "data")
EPO_FILE = os.path.join(SUBJ_DIR, "epo_001_raw.fif")

epochs = mne.read_epochs(EPO_FILE, preload=True)
sfreq = epochs.info["sfreq"]
ch_names = epochs.ch_names
data = epochs.get_data()

WAVELET = 'db4'
LEVEL = 6 

# Mapping of Detail Coefficients (D) to Physiological Bands (approx for fs=250)
# --- DWT per epoch/channel ---
bands_dwt = {
    "delta": 6,   # A6 approx 0.5–4 Hz
    "theta": 5,   # D5 approx 4–8 Hz
    "alpha": 4,   # D4 approx 8–16 Hz
    "beta": 3     # D3 approx 16–32 Hz
}
# Container for results
rows_dwt = []

for e_idx, epoch in enumerate(data):
    for c_idx, ch_data in enumerate(epoch):
        # Compute wavelet decomposition
        coeffs = pywt.wavedec(ch_data, wavelet=WAVELET, level=LEVEL)
        # coeffs[0] is the approximation at level 6 (lowest freq), coeffs[1] = D6, etc.
        entry = {'epoch': e_idx, 'channel': ch_names[c_idx]}
        for band, level_idx in bands_dwt.items():
            if band == "delta":
                # Approximation coefficients A6
                coeff = coeffs[0]
            else:
                # Detail coefficients D3–D5
                coeff = coeffs[LEVEL - level_idx + 1]
            # Feature: energy of coefficients
            energy = np.sum(np.square(coeff))
            entry[f"{band}_energy"] = energy
        rows_dwt.append(entry)

# Save to CSV
os.makedirs(DATA_DIR, exist_ok=True)
df_dwt = pd.DataFrame(rows_dwt)
df_dwt.to_csv(os.path.join(DATA_DIR, f"{SUBJ}_DWT_band_energy.csv"), index=False)

print(f"DWT features saved: ‼️{os.path.join(DATA_DIR, f'{SUBJ}_DWT_band_energy.csv')}")




In [None]:
import mne
import numpy as np
import pandas as pd
import os
import pywt

BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"

WAVELET = 'db4'
LEVEL = 6  # decomposition level

# Explicit mapping of EEG bands to DWT coefficients
# A6 = coeffs[0], D6 = coeffs[1], D5 = coeffs[2], D4 = coeffs[3], D3 = coeffs[4]
coeff_indices = {
    "delta": [0, 1],  # A6 + D6
    "theta": [2],     # D5
    "alpha": [3],     # D4
    "beta": [4]       # D3
}

for subj_num in range(1, 150):  # subjects 001–045
    SUBJ = f"sub-{subj_num:03d}"
    SUBJ_DIR = os.path.join(BASE_DIR, SUBJ)
    DATA_DIR = os.path.join(SUBJ_DIR, "data")
    EPO_FILE = os.path.join(SUBJ_DIR, f"epo_{subj_num:03d}_raw.fif")

    if not os.path.exists(EPO_FILE):
        print(f"File not found: {EPO_FILE}, skipping.")
        continue

    print(f"Processing {SUBJ}...")

    epochs = mne.read_epochs(EPO_FILE, preload=True)
    ch_names = epochs.ch_names
    data = epochs.get_data()  # shape: (n_epochs, n_channels, n_times)

    rows_dwt = []

    for e_idx, epoch in enumerate(data):
        for c_idx, ch_data in enumerate(epoch):
            coeffs = pywt.wavedec(ch_data, wavelet=WAVELET, level=LEVEL)
            entry = {'epoch': e_idx, 'channel': ch_names[c_idx]}
            for band, inds in coeff_indices.items():
                # Sum energy across all coefficients assigned to the band
                energy = sum(np.sum(np.square(coeffs[i])) for i in inds)
                entry[f"{band}_energy"] = energy
            rows_dwt.append(entry)

    os.makedirs(DATA_DIR, exist_ok=True)
    df_dwt = pd.DataFrame(rows_dwt)
    out_file = os.path.join(DATA_DIR, f"{SUBJ}_DWT_band_energy.csv")
    df_dwt.to_csv(out_file, index=False)
    print(f"DWT features saved: {out_file}")


FT SELCTION

In [2]:
import pandas as pd
import os
import numpy as np

BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"
BANDS = ["delta", "theta", "alpha", "beta"]

all_rows = []

for subj_num in range(1, 36):
    SUBJ = f"sub-{subj_num:03d}"
    DATA_DIR = os.path.join(BASE_DIR, SUBJ, "data")
    dwt_file = os.path.join(DATA_DIR, f"{SUBJ}_DWT_band_energy.csv")

    if not os.path.exists(dwt_file):
        continue

    df = pd.read_csv(dwt_file)

    subj_row = {"subject": SUBJ}

    for band in BANDS:
        vals = df[f"{band}_energy"].values
        subj_row[f"{band}_energy_mean"] = np.mean(vals)
        subj_row[f"{band}_energy_std"] = np.std(vals)
        subj_row[f"{band}_energy_cv"] = np.std(vals) / np.mean(vals)

    # Cognitive ratio features
    subj_row["theta_alpha_ratio"] = (
        subj_row["theta_energy_mean"] / subj_row["alpha_energy_mean"]
    )

    all_rows.append(subj_row)

df_dwt_features = pd.DataFrame(all_rows)
df_dwt_features.to_csv("DWT_subject_features.csv", index=False)
print('yes')

yes


In [7]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression

# -------------------------------
# Paths
# -------------------------------
BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"
DWT_FILE = "DWT_subject_features.csv"
PARTICIPANTS_FILE = f"{BASE_DIR}\\participants.csv"

MOCA_THRESHOLD = 26


In [8]:
# -------------------------------
# Load participants (MOCA lives here)
# -------------------------------
df_part = pd.read_csv(PARTICIPANTS_FILE, dtype={"participant_id": str})
df_part.rename(columns={"participant_id": "subject"}, inplace=True)

# flexible MOCA column
moca_col = "MOCA" if "MOCA" in df_part.columns else "MoCA"


In [9]:
# -------------------------------
# Merge
# -------------------------------
df = df_feat.merge(
    df_part[["subject", moca_col]],
    on="subject",
    how="inner"
)
df["impaired"] = (df[moca_col] < MOCA_THRESHOLD).astype(int)
print(df[[moca_col, "impaired"]].describe())
print(df["impaired"].value_counts())
drop_cols = ["subject", moca_col, "impaired"]
X = df.drop(columns=drop_cols)
y = df["impaired"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

anova = SelectKBest(score_func=f_classif, k="all")
anova.fit(X_scaled, y)

anova_df = pd.DataFrame({
    "feature": X.columns,
    "F": anova.scores_,
    "p": anova.pvalues_
}).sort_values("p")

print(anova_df.head(10))
clf = LogisticRegression(
    solver="liblinear",
    max_iter=1000
)

rfe = RFE(
    estimator=clf,
    n_features_to_select=5
)

rfe.fit(X_scaled, y)

rfe_df = pd.DataFrame({
    "feature": X.columns,
    "selected": rfe.support_,
    "rank": rfe.ranking_
}).sort_values("rank")

print(rfe_df)
anova_keep = set(anova_df[anova_df["p"] < 0.05]["feature"])
rfe_keep = set(rfe_df[rfe_df["rank"] == 1]["feature"])

final_features = list(anova_keep & rfe_keep)
print("FINAL FEATURES:", final_features)



NameError: name 'df_feat' is not defined

In [10]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression

# ============================
# CONFIG
# ============================
BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"
DWT_FEATURE_FILE = "DWT_subject_features.csv"
PARTICIPANTS_FILE = os.path.join(BASE_DIR, "participants.csv")
MOCA_THRESHOLD = 26
K_BEST = 10

# ============================
# LOAD FEATURES
# ============================
df_feat = pd.read_csv(DWT_FEATURE_FILE)

# Expected: subject, delta_energy_mean, theta_energy_mean, ...
assert "subject" in df_feat.columns, "subject column missing in DWT features"

# ============================
# LOAD PARTICIPANTS (MOCA)
# ============================
df_part = pd.read_csv(PARTICIPANTS_FILE, dtype={"participant_id": str})
df_part.rename(columns={"participant_id": "subject"}, inplace=True)

moca_col = "MOCA" if "MOCA" in df_part.columns else "MoCA"
assert moca_col in df_part.columns, "MOCA column missing"

# ============================
# MERGE
# ============================
df = df_feat.merge(
    df_part[["subject", moca_col]],
    on="subject",
    how="inner"
)

# ============================
# LABEL
# ============================
df["impaired"] = (df[moca_col] < MOCA_THRESHOLD).astype(int)

# ============================
# X / y
# ============================
drop_cols = ["subject", moca_col, "impaired"]
X = df.drop(columns=drop_cols)
y = df["impaired"]

# ============================
# STANDARDIZE
# ============================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ============================
# UNIVARIATE ANOVA
# ============================
anova = SelectKBest(score_func=f_classif, k=min(K_BEST, X.shape[1]))
X_anova = anova.fit_transform(X_scaled, y)

anova_features = X.columns[anova.get_support()]
anova_scores = anova.scores_[anova.get_support()]

anova_df = pd.DataFrame({
    "feature": anova_features,
    "f_score": anova_scores
}).sort_values("f_score", ascending=False)

print("\n=== ANOVA SELECTED FEATURES ===")
print(anova_df)

# ============================
# RFE (LOGISTIC REGRESSION)
# ============================
logreg = LogisticRegression(max_iter=5000, solver="liblinear")

rfe = RFE(
    estimator=logreg,
    n_features_to_select=min(K_BEST, X.shape[1])
)
rfe.fit(X_scaled, y)

rfe_features = X.columns[rfe.support_]

print("\n=== RFE SELECTED FEATURES ===")
for f in rfe_features:
    print(f)

# ============================
# QUICK SANITY CHECK
# ============================
print("\nClass balance:")
print(y.value_counts())

print("\nFinal feature matrix shape:", X_scaled.shape)



=== ANOVA SELECTED FEATURES ===
             feature   f_score
9  theta_alpha_ratio  4.149237
3    theta_energy_cv  4.045870
6    alpha_energy_cv  3.513356
1    delta_energy_cv  1.641832
0   delta_energy_std  1.529994
8     beta_energy_cv  1.278908
5   alpha_energy_std  1.144788
7   beta_energy_mean  0.726632
4  alpha_energy_mean  0.634660
2  theta_energy_mean  0.067394

=== RFE SELECTED FEATURES ===
delta_energy_mean
delta_energy_std
delta_energy_cv
theta_energy_std
theta_energy_cv
alpha_energy_cv
beta_energy_mean
beta_energy_std
beta_energy_cv
theta_alpha_ratio

Class balance:
impaired
1    22
0    13
Name: count, dtype: int64

Final feature matrix shape: (35, 13)


In [1]:
import os
import pandas as pd
import numpy as np

BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"
SUBJ_RANGE = range(1, 150)  # adjust to your subjects

# DWT coefficients mapping used previously
coeff_indices = {
    "delta": [0, 1],
    "theta": [2],
    "alpha": [3],
    "beta": [4]
}

rows_features = []

for subj_num in SUBJ_RANGE:
    SUBJ = f"sub-{subj_num:03d}"
    DATA_DIR = os.path.join(BASE_DIR, SUBJ, "data")
    DWT_FILE = os.path.join(DATA_DIR, f"{SUBJ}_DWT_band_energy.csv")

    if not os.path.exists(DWT_FILE):
        print(f"{SUBJ}: DWT file not found, skipping.")
        continue

    df = pd.read_csv(DWT_FILE)

    # -------- Feature 1: Theta Energy Variability (executive dysfunction proxy) --------
    theta_cols = [col for col in df.columns if 'theta_energy' in col]
    theta_energy_vals = df[theta_cols].values.flatten()
    theta_energy_var = np.std(theta_energy_vals) / (np.mean(theta_energy_vals) + 1e-12)

    # -------- Feature 2: Theta/Alpha Temporal Dominance Fraction (cognitive slowing) --------
    theta_abs = theta_energy_vals
    alpha_cols = [col for col in df.columns if 'alpha_energy' in col]
    alpha_abs = df[alpha_cols].values.flatten()
    theta_over_alpha_frac = np.mean(theta_abs > alpha_abs)

    rows_features.append({
        "subject": SUBJ,
        "theta_energy_var": theta_energy_var,
        "theta_over_alpha_frac": theta_over_alpha_frac
    })

df_features = pd.DataFrame(rows_features)
OUT_FILE = os.path.join(BASE_DIR, "PD_DWT_test_features.csv")
df_features.to_csv(OUT_FILE, index=False)
print(f"Features computed and saved: {OUT_FILE}")


Features computed and saved: C:\Users\User\Documents\EEG_Project\rEEG\PD_DWT_test_features.csv


In [1]:
import pandas as pd
from scipy.stats import ttest_ind, spearmanr

BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"
FEATURE_FILE = f"{BASE_DIR}/PD_DWT_test_features.csv"
META_FILE = f"{BASE_DIR}/participants.csv"

# ------------------------
# Load feature and metadata
# ------------------------
df_feat = pd.read_csv(FEATURE_FILE)
meta = pd.read_csv(META_FILE, dtype={"participant_id": str})
meta = meta[meta["GROUP"].str.upper() == "PD"]

# Merge features with MoCA
moca_col = "MOCA" if "MOCA" in meta.columns else "MoCA"
df = df_feat.merge(meta[["participant_id", moca_col]], left_on="subject", right_on="participant_id", how="inner")

# Create impaired vs unimpaired
df["impaired"] = df[moca_col] < 26

# ------------------------
# Define features to test
# ------------------------
features = ["theta_energy_var", "theta_over_alpha_frac"]

for feat in features:
    # Group comparison
    impaired_vals = df.loc[df["impaired"], feat]
    unimpaired_vals = df.loc[~df["impaired"], feat]
    t_stat, p_group = ttest_ind(impaired_vals, unimpaired_vals, nan_policy='omit')

    # Correlation with MoCA
    rho, p_corr = spearmanr(df[feat], df[moca_col], nan_policy='omit')

    print(f"\n=== Feature: {feat} ===")
    print(f"Group comparison (Impaired vs Unimpaired): t={t_stat:.3f}, p={p_group:.4g}")
    print(f"Spearman correlation with MoCA: rho={rho:.3f}, p={p_corr:.4g}")
    from statsmodels.stats.multitest import fdrcorrection

    # Apply FDR per feature
    p_vals = [p_group, p_corr]  # t-test p and Spearman p for this feature
    rej, p_fdr = fdrcorrection(p_vals, alpha=0.05, method='indep')
    p_group_fdr, p_corr_fdr = p_fdr

    print(f"FDR-corrected p-values for feature '{feat}':")
    print(f"  Group comparison FDR p = {p_group_fdr:.6g}")
    print(f"  Spearman correlation FDR p = {p_corr_fdr:.6g}")


=== Feature: theta_energy_var ===
Group comparison (Impaired vs Unimpaired): t=-2.093, p=0.03889
Spearman correlation with MoCA: rho=0.172, p=0.08798
FDR-corrected p-values for feature 'theta_energy_var':
  Group comparison FDR p = 0.077778
  Spearman correlation FDR p = 0.0879761

=== Feature: theta_over_alpha_frac ===
Group comparison (Impaired vs Unimpaired): t=3.124, p=0.002345
Spearman correlation with MoCA: rho=-0.302, p=0.002248
FDR-corrected p-values for feature 'theta_over_alpha_frac':
  Group comparison FDR p = 0.00234491
  Spearman correlation FDR p = 0.00234491


In [1]:
import os
import pandas as pd

# -------------------------------
# Configuration
# -------------------------------
BASE_DIR = r"C:\Users\User\Documents\EEG_Project\rEEG"
INPUT_MATRIX = "ML_Feature_Matrix.csv"
DWT_FILE = os.path.join(BASE_DIR, "PD_DWT_test_features.csv")
DWT_FEATURES = ["theta_energy_var", "theta_over_alpha_frac"]

# -------------------------------
# Update ML Matrix
# -------------------------------
if not os.path.exists(INPUT_MATRIX) or not os.path.exists(DWT_FILE):
    print("Error: Required files missing.")
else:
    master_df = pd.read_csv(INPUT_MATRIX)
    dwt_df = pd.read_csv(DWT_FILE)

    # Standardize IDs for merging
    master_df['participant_id'] = master_df['participant_id'].astype(str).str.strip()
    
    # Auto-detect ID column in DWT file (checking 'subject' or 'participant_id')
    dwt_id_col = 'subject' if 'subject' in dwt_df.columns else 'participant_id'
    dwt_df['participant_id'] = dwt_df[dwt_id_col].astype(str).str.strip()

    # Select only necessary columns
    cols_to_merge = ['participant_id'] + DWT_FEATURES
    dwt_subset = dwt_df[cols_to_merge]

    # Merge
    master_df = master_df.merge(dwt_subset, on='participant_id', how='left')

    # Rename for Feature convention
    rename_dict = {f: f"Feature_DWT_{f.capitalize()}" for f in DWT_FEATURES}
    master_df.rename(columns=rename_dict, inplace=True)

    # Save
    master_df.to_csv(INPUT_MATRIX, index=False)
    print(f"Successfully added DWT features: {list(rename_dict.values())}")

Successfully added DWT features: ['Feature_DWT_Theta_energy_var', 'Feature_DWT_Theta_over_alpha_frac']
