In [2]:
import pandas as pd
import numpy as np
from scipy.stats import kruskal
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------------------
# 1) Input / output locations
#    (edit these to match the local machine)
# -----------------------------------------
EEG_PATH = r"C:\Users\LENOVO\Desktop\sen_proj\eeg_MERGED.csv"
STATS_OUT = r"C:\Users\LENOVO\Desktop\sen_proj\eeg_FEATURE_STATS.csv"
CLEAN_OUT = r"C:\Users\LENOVO\Desktop\sen_proj\eeg_features_SELECTED.csv"
CORR_FIG_OUT = r"C:\Users\LENOVO\Desktop\sen_proj\eeg_features_corr_heatmap.png"

# -----------------------------------------
# 2) Read the merged EEG table into memory
# -----------------------------------------
df = pd.read_csv(EEG_PATH)

print("Shape:", df.shape)
print("First columns:", df.columns[:15].tolist())

# -----------------------------------------
# 3) Separate bookkeeping columns from features
# -----------------------------------------
# "session" is optional; only names actually present in the file are kept,
# in this fixed order.
meta_cols = [
    name
    for name in ("subject", "run", "window_idx", "label", "session")
    if name in df.columns
]

# Every numeric column that is not bookkeeping is treated as a feature.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
FEATURE_COLS = [name for name in numeric_cols if name not in meta_cols]

print(f"\nNumber of feature columns: {len(FEATURE_COLS)}")
print("Example feature names:", FEATURE_COLS[:10])

# Class labels are compared as integers from here on.
df["label"] = df["label"].astype(int)

# Distinct class labels, ascending.
labels = df["label"].unique().tolist()
labels.sort()
print("\nLabels found in data:", labels)

# -----------------------------------------
# 4) One sub-frame per class label
# -----------------------------------------
groups_df = {}
for lab in labels:
    groups_df[lab] = df[df["label"] == lab]

for lab, d in groups_df.items():
    print(f"Number of samples for label={lab}: {len(d)}")

# =========================================
# 5) Kruskal–Wallis test for each feature
#    (Following the approach in Raveendran et al. 2024:
#     compare distributions across classes
#     and select features with p < 0.05)
# =========================================
ALPHA = 0.05   # Significance level as in the paper

results = []

for col in FEATURE_COLS:
    # Per-class values with missing entries removed. kruskal() defaults to
    # nan_policy="propagate", so a single NaN would otherwise turn the whole
    # statistic into NaN and the feature would be discarded without notice.
    group_values = [groups_df[lab][col].dropna().to_numpy() for lab in labels]

    # NaN marks "test could not be computed" for this feature.
    H, p_kw = np.nan, np.nan

    # The test needs at least two classes, each with at least two valid samples.
    if len(group_values) >= 2 and all(len(g) >= 2 for g in group_values):
        try:
            # Kruskal–Wallis H-test
            H, p_kw = kruskal(*group_values)
        except ValueError as err:
            # SciPy may reject degenerate input (for example, all values
            # identical); report it and keep the NaN result.
            print(f"Kruskal-Wallis skipped for '{col}': {err}")

    results.append({
        "feature": col,
        "KW_H": H,
        "KW_p": p_kw
    })

# Explicit columns keep the table well-formed even when no feature was tested.
stat_df = pd.DataFrame(results, columns=["feature", "KW_H", "KW_p"])

# =========================================
# 6) Select statistically significant features (p < 0.05)
#    Following Raveendran et al. (2024)
# =========================================
def is_important(row):
    """Tell whether the row's Kruskal–Wallis p-value is below ALPHA.

    `row` must support `row["KW_p"]`. A missing p-value (test not computed)
    is never significant.
    """
    p_value = row["KW_p"]
    if pd.isna(p_value):
        return False
    return p_value < ALPHA

# Flag each feature with the row-wise significance rule.
stat_df["is_important"] = stat_df.apply(is_important, axis=1)

# Split the feature names by that flag. A single .loc lookup replaces the
# chained stat_df[mask]["feature"] indexing.
important_features = stat_df.loc[stat_df["is_important"], "feature"].tolist()
dropped_features   = stat_df.loc[~stat_df["is_important"], "feature"].tolist()

# Status markers restored from mis-decoded UTF-8.
print(f"\n🔹 Number of retained features (KW p<{ALPHA}): {len(important_features)}")
print(f"🔸 Number of dropped features: {len(dropped_features)}")

print("\nFirst 10 important features:")
print(important_features[:10])

# =========================================
# 6-b) Correlation matrix of important features
#      Shows relationships after feature selection
# =========================================
# A correlation matrix needs at least two features to be informative.
if len(important_features) > 1:
    print("\n📊 Generating correlation matrix for important features...")
    corr = df[important_features].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        corr,
        cmap="coolwarm",
        center=0,   # centre the diverging colormap on zero correlation
        square=False,
        cbar=True
    )
    # The title reports the threshold actually used instead of a hard-coded 0.05.
    plt.title(f"Correlation Matrix of Important EEG Features (Kruskal p<{ALPHA})")
    plt.tight_layout()
    plt.savefig(CORR_FIG_OUT, dpi=300)
    plt.close()   # release the figure once it is on disk

    print("✅ Saved correlation heatmap to:")
    print(CORR_FIG_OUT)
else:
    print("\n⚠ Less than two important features → correlation matrix not meaningful.")

# =========================================
# 7) Save and display statistics table
# =========================================
# Smallest p-values first; sort_values puts NaN p-values last by default.
stat_df = stat_df.sort_values("KW_p", ascending=True)

print("\n📋 Kruskal–Wallis statistics table (first 20 rows):")
print(stat_df[["feature", "KW_H", "KW_p", "is_important"]].head(20))

# "utf-8-sig" prefixes the file with a byte-order mark.
stat_df.to_csv(STATS_OUT, index=False, encoding="utf-8-sig")
print("\n✅ Saved Kruskal–Wallis statistics to:")
print(STATS_OUT)

# =========================================
# 8) Create cleaned dataset with important features only
# =========================================
# Metadata columns first, then the features that passed the significance test.
KEEP_COLS = meta_cols + important_features
df_clean = df[KEEP_COLS].copy()

df_clean.to_csv(CLEAN_OUT, index=False, encoding="utf-8-sig")
print("\n✅ Saved CLEAN EEG dataset (important features only) to:")
print(CLEAN_OUT)

print("\n📌 Shape of cleaned dataset:", df_clean.shape)
print("📌 First columns:")
print(df_clean.columns[:20].tolist())

# =========================================
# 9) Print dropped features
# =========================================
# dropped_features holds every feature not flagged as important, which also
# covers features whose test produced no p-value (NaN).
# The header reports the threshold actually used instead of a hard-coded 0.05.
print(f"\n🔸 Dropped features (KW_p >= {ALPHA}):")
print(f"Total dropped: {len(dropped_features)}")

# If list is long, display only the first 30
max_show = 30
for f in dropped_features[:max_show]:
    print(" -", f)

if len(dropped_features) > max_show:
    print(f"... (+ {len(dropped_features) - max_show} more dropped features)")

Shape: (5403559, 50)
First columns: ['subject', 'run', 'window_idx', 'label', 'ch1_Delta Bandpower', 'ch1_Theta Bandpower', 'ch1_Alpha Bandpower', 'ch1_Beta Bandpower', 'ch1_Gamma Bandpower', 'ch1_Relative Delta Bandpower', 'ch1_Relative Theta Bandpower', 'ch1_Relative Alpha Bandpower', 'ch1_Relative Beta Bandpower', 'ch1_Relative Gamma Bandpower', 'ch1_Interquartile Range']

Number of feature columns: 46
Example feature names: ['ch1_Delta Bandpower', 'ch1_Theta Bandpower', 'ch1_Alpha Bandpower', 'ch1_Beta Bandpower', 'ch1_Gamma Bandpower', 'ch1_Relative Delta Bandpower', 'ch1_Relative Theta Bandpower', 'ch1_Relative Alpha Bandpower', 'ch1_Relative Beta Bandpower', 'ch1_Relative Gamma Bandpower']

Labels found in data: [0, 1, 2]
Number of samples for label=0: 5061385
Number of samples for label=1: 319023
Number of samples for label=2: 23151

🔹 Number of retained features (KW p<0.05): 44
🔸 Number of dropped features: 2

First 10 important features:
['ch1_Delta Bandpower', 'ch1_The

In [5]:
# =========================================
# High-correlation pruning for TWO-CHANNEL EEG
#   - Only within each channel (ch1, ch2)
#   - Does NOT drop a whole channel just because they are similar
# =========================================
# NOTE(review): the header previously said "EMG"; every path and variable in
# this notebook refers to EEG data, so the wording was aligned — confirm.

CORR_THRESH = 0.90  # correlation threshold

# ---- 1) Split features by channel prefix ----
ch1_prefix = "ch1_"
ch2_prefix = "ch2_"

ch1_features = [f for f in important_features if f.startswith(ch1_prefix)]
ch2_features = [f for f in important_features if f.startswith(ch2_prefix)]

# Features that are not explicitly channel-specific (e.g., global features).
# str.startswith accepts a tuple, so one prefix test replaces two list scans.
shared_features = [
    f for f in important_features
    if not f.startswith((ch1_prefix, ch2_prefix))
]

print("\n🔧 Channel-wise feature groups:")
print("Channel 1 features:", ch1_features)
print("Channel 2 features:", ch2_features)
print("Shared (non-channel) features:", shared_features)

# ---- 2) Function to prune high-correlation features within a given list ----
def prune_high_corr(feature_list, df, thresh=0.90):
    """
    Remove highly correlated features from a given list (within-channel pruning).

    Features are visited in the order given. A feature is dropped only when
    its absolute correlation with a feature that is *already kept* exceeds
    `thresh`. Comparing against kept features only means a feature is never
    discarded merely because it resembles another feature that was itself
    removed (chain A~B, B~C, A not ~C keeps both A and C).

    Parameters
    ----------
    feature_list : list of str
        Column names in `df` to consider; earlier names win ties.
    df : pandas.DataFrame
        Table holding the feature columns.
    thresh : float, default 0.90
        Absolute-correlation value above which a feature counts as redundant.

    Returns
    -------
    (kept, dropped) : tuple of two new lists
        Both preserve the order of `feature_list`; the input list is never
        returned or modified.
    """
    features = list(feature_list)  # copy so callers never share our result
    if len(features) <= 1:
        return features, []

    corr_matrix = df[features].corr().abs()

    kept = []
    to_drop = []
    for col in features:
        # NaN correlations (e.g. constant columns) compare as False, so they
        # never trigger a drop.
        if kept and (corr_matrix.loc[kept, col] > thresh).any():
            to_drop.append(col)
        else:
            kept.append(col)
    return kept, to_drop

# ---- 3) Apply pruning within each channel ----
# Each channel is pruned on its own, so a ch1 feature is never dropped for
# resembling a ch2 feature.
ch1_kept, ch1_dropped = prune_high_corr(ch1_features, df, CORR_THRESH)
ch2_kept, ch2_dropped = prune_high_corr(ch2_features, df, CORR_THRESH)

print("\n📉 Channel 1 (within-channel correlation pruning)")
print("  Original:", len(ch1_features))
print("  Dropped :", len(ch1_dropped))
print("  Kept    :", len(ch1_kept))
print("  Dropped features:", ch1_dropped)

print("\n📉 Channel 2 (within-channel correlation pruning)")
print("  Original:", len(ch2_features))
print("  Dropped :", len(ch2_dropped))
print("  Kept    :", len(ch2_kept))
print("  Dropped features:", ch2_dropped)

# ---- 4) Final reduced feature set (shared + pruned ch1 + pruned ch2) ----
# Shared (non-channel) features are passed through without pruning.
reduced_features_two_channel = shared_features + ch1_kept + ch2_kept

# The message previously said "EMG"; the data in this notebook is EEG.
print("\n✅ FINAL reduced feature set for two-channel EEG:")
print("Total original important features :", len(important_features))
print("Total after within-channel pruning:", len(reduced_features_two_channel))
print("Final features:", reduced_features_two_channel)

# ---- 5) (Optional) Save final CLEAN dataset based on two-channel pruning ----
# NOTE(review): this path is under a different folder than the other outputs
# of this notebook — confirm the directory exists before running.
CLEAN_OUT_TWO_CH = r"C:\Users\LENOVO\Desktop\sen\sen\FIN_DAT\eeg_features_FINAL_two_channel_corr.csv"

# Metadata columns first, then the pruned feature set.
KEEP_COLS_FINAL = meta_cols + reduced_features_two_channel
df_final_two_ch = df[KEEP_COLS_FINAL].copy()
df_final_two_ch.to_csv(CLEAN_OUT_TWO_CH, index=False, encoding="utf-8-sig")

print("\n📂 Saved FINAL two-channel CLEAN dataset to:")
print(CLEAN_OUT_TWO_CH)
print("Shape:", df_final_two_ch.shape)



🔧 Channel-wise feature groups:
Channel 1 features: ['ch1_Delta Bandpower', 'ch1_Theta Bandpower', 'ch1_Alpha Bandpower', 'ch1_Beta Bandpower', 'ch1_Gamma Bandpower', 'ch1_Relative Delta Bandpower', 'ch1_Relative Theta Bandpower', 'ch1_Relative Alpha Bandpower', 'ch1_Relative Beta Bandpower', 'ch1_Relative Gamma Bandpower', 'ch1_Interquartile Range', 'ch1_Median Absolute Deviation', 'ch1_Median', 'ch1_Variance', 'ch1_Entropy', 'ch1_Standard Deviation', 'ch1_Skewness', 'ch1_Kurtosis', 'ch1_Line Length', 'ch1_Hjorth Activity', 'ch1_Hjorth Mobility', 'ch1_Hjorth Complexity']
Channel 2 features: ['ch2_Delta Bandpower', 'ch2_Theta Bandpower', 'ch2_Alpha Bandpower', 'ch2_Beta Bandpower', 'ch2_Gamma Bandpower', 'ch2_Relative Delta Bandpower', 'ch2_Relative Theta Bandpower', 'ch2_Relative Alpha Bandpower', 'ch2_Relative Beta Bandpower', 'ch2_Relative Gamma Bandpower', 'ch2_Interquartile Range', 'ch2_Median Absolute Deviation', 'ch2_Median', 'ch2_Variance', 'ch2_Entropy', 'ch2_Standard Devia