# EEG Machine Learning Data Prep Pipeline

## TO DO - To Review

Projects:
- Pickle: https://www.perplexity.ai/search/in-a-jupyter-notebook-i-have-c-0LbAAH9ITFGfcPYaWlrt6Q

## Dependencies

General dependencies:
- python = 3.11.13
- numpy = 2.0.2
- scipy = 1.15.3
- pandas = 2.2.3
- matplotlib = 3.10.3

ML dependencies:
- scikit-learn = 1.6.1

# Imports & Utilities

In [1]:
# General imports
import os
import gc
from datetime import datetime
from pprint import pprint
import time
import pickle

import math
import numpy as np
import pandas as pd

# Plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay


In [2]:
# Utility function to establish relative paths for a given folder
def get_folder_path(folder_name, data_folder='Data'):
    """Return the path of an existing folder inside the project's data folder.

    The project root is taken to be the parent of the current working
    directory, so the result is ``<parent of cwd>/<data_folder>/<folder_name>``.

    Args:
        folder_name: Name of the folder to locate.
        data_folder: Name of the data folder under the project root.

    Returns:
        The joined folder path.

    Raises:
        FileNotFoundError: If the joined path is not an existing directory.
    """
    parent_of_cwd = os.path.dirname(os.getcwd())
    resolved = os.path.join(parent_of_cwd, data_folder, folder_name)
    if os.path.isdir(resolved):
        return resolved
    raise FileNotFoundError(f'Directory not found: {resolved}')

# Utility function to create a new folder path, if not exists
def make_folder_path(folder_name, data_folder='Data', exists_ok=True):
    """Create (if needed) and return a folder inside the project's data folder.

    The project root is taken to be the parent of the current working
    directory, so the target is ``<parent of cwd>/<data_folder>/<folder_name>``.
    Missing intermediate directories are created as well.

    Args:
        folder_name: Name of the folder to create.
        data_folder: Name of the data folder under the project root.
        exists_ok: When False, an already existing directory is an error.

    Returns:
        The path of the (now existing) directory.

    Raises:
        FileExistsError: If the directory already exists and ``exists_ok`` is
            False, or if the path exists but is not a directory.
    """
    project_root = os.path.dirname(os.getcwd())
    folder_path = os.path.join(project_root, data_folder, folder_name)

    if os.path.isdir(folder_path):
        if not exists_ok:
            raise FileExistsError(f"Directory already exists: {folder_path}")
        return folder_path

    # A regular file (or other non-directory) at this location must not be
    # handed back as if it were a usable folder.
    if os.path.exists(folder_path):
        raise FileExistsError(f"Path exists but is not a directory: {folder_path}")

    # exist_ok mirrors exists_ok so that a directory created by another process
    # between the check above and this call is tolerated only when allowed.
    os.makedirs(folder_path, exist_ok=exists_ok)
    return folder_path

# Utility function to extend an existing folder path with a subfolder
def extend_folder_path(base_folder, subfolder, exists_ok=True):
    """Create (if needed) and return a subfolder of an existing folder.

    Args:
        base_folder: Path of the parent directory; it must already exist.
        subfolder: Name (or relative path) of the subfolder to create.
        exists_ok: When False, an already existing subfolder is an error.

    Returns:
        The path ``<base_folder>/<subfolder>`` of the (now existing) directory.

    Raises:
        FileNotFoundError: If ``base_folder`` is not an existing directory.
        FileExistsError: If the subfolder already exists and ``exists_ok`` is
            False, or if the path exists but is not a directory.
    """
    if not os.path.isdir(base_folder):
        raise FileNotFoundError(f'Parent directory not found: {base_folder}')
    extended_path = os.path.join(base_folder, subfolder)

    if os.path.isdir(extended_path):
        if not exists_ok:
            raise FileExistsError(f"Directory already exists: {extended_path}")
        return extended_path

    # A regular file (or other non-directory) at this location must not be
    # handed back as if it were a usable folder.
    if os.path.exists(extended_path):
        raise FileExistsError(f"Path exists but is not a directory: {extended_path}")

    # exist_ok mirrors exists_ok so that a directory created by another process
    # between the check above and this call is tolerated only when allowed.
    os.makedirs(extended_path, exist_ok=exists_ok)
    return extended_path

# Utility function to check for the existence of a file in a given folder
def get_file_path(folder, file_name):
    """Return the path of an existing file inside ``folder``.

    Args:
        folder: Directory expected to contain the file.
        file_name: Name of the file to locate.

    Returns:
        The joined path ``<folder>/<file_name>``.

    Raises:
        FileNotFoundError: If the joined path is not an existing regular file.
    """
    candidate = os.path.join(folder, file_name)
    if os.path.isfile(candidate):
        return candidate
    raise FileNotFoundError(f'File not found: {candidate}')


# Classes & Functions

# Setup & Features Load

# Run Setup

In [5]:
# ML Data Prep Pipeline: define the run and set up its output folder
#
# Reads the study metadata, locates the EEG processing run to consume and
# creates a fresh, dated ML training run folder holding the run parameters.

# -----------------------------------------------------------------------
# Study Details
study_name = 'IOWA_Rest'
dataset_ref = 'ds004584-1.0.0'
# eeg_run_id = '20250619_no_preprocess'
eeg_run_folder = 'EEG_Processing_ds004584-1.0.0_20250619_no_preprocess'
# study_name = 'UNM_Oddball'
# dataset_ref = 'ds003490-1.1.0'
# eeg_run_id = '20250618'

# Run/Test Mode
test_mode = False

# Execution Parameters
run_summary = 'full_run'
ml_params = {'models': 'none'}
# -----------------------------------------------------------------------

# Load the existing study details (the study folder must already exist)
# NOTE(review): 'study_inf_df.pkl' may be a misspelling of 'study_info_df.pkl' —
# confirm against the file name the study setup step actually writes
study_folder_path = get_folder_path(f'Study_{study_name}')
study_info_df = pd.read_pickle(f'{study_folder_path}/study_inf_df.pkl', compression='zip')
study_subjects_df = pd.read_pickle(f'{study_folder_path}/study_subjects_df.pkl', compression='zip')

# Result locations are read from the first row of study_info_df
eeg_processing_results_path = study_info_df.loc[0, 'eeg_processing_results_path']
ml_training_results_path = study_info_df.loc[0, 'ml_training_results_path']

# The EEG results run folder must already exist
eeg_results_run_path = os.path.join(eeg_processing_results_path, eeg_run_folder)
if not os.path.isdir(eeg_results_run_path):
    raise FileNotFoundError(f'Directory not found: {eeg_results_run_path}')

# Establish a new ML Training Run; exists_ok=False makes a repeat of the same
# dataset/date/summary combination fail instead of reusing the folder
current_date = f'{datetime.now():%Y%m%d}'
ml_run_id = f'ML_Training_{dataset_ref}_{current_date}_{run_summary}'
ml_training_run_path = extend_folder_path(ml_training_results_path, ml_run_id, exists_ok=False)

# Record the run parameters as a single-row DataFrame and save it in the run folder
ml_run_params_df = pd.DataFrame({
    'ml_run_id': [ml_run_id],
    'study_name': [study_name],
    'dataset_ref': [dataset_ref],
    'ml_params': [ml_params],
})
ml_run_params_df.to_pickle(f'{ml_training_run_path}/ml_run_params_df.pkl', compression='zip')

# Progress messages are enabled only in test mode
VERBOSE = bool(test_mode)

# Remove setup-only names
del current_date, eeg_processing_results_path, eeg_run_folder, ml_training_results_path

# Features Load & Cleaning

In [6]:
# Execute the Features Prep Pipeline
#

# Get features superset created after EEG processing.
# The frame is bound to `eeg_results_features_superset_df` because that is the
# name the cleaning and summary cells read; assigning only `features_superset_df`
# left that name undefined on a fresh kernel (NameError).
eeg_results_features_superset_df = pd.read_pickle(
    eeg_results_run_path + '/eeg_results_features_superset_df.pkl', compression='zip'
)

# Keep the earlier name as an alias (same object, no copy) for any code still using it
features_superset_df = eeg_results_features_superset_df


# Features Inspection, Cleaning & Reduction

In [None]:
# Work on a copy so the loaded superset stays intact for the shape comparison below
study_features_cleaned_df = eeg_results_features_superset_df.copy()

# Drop some channels
# TODO: Should this be in the EEG pipeline so that uniform features are produced?
# Drop all columns with channel number greater than 63 .... ?? or delete row 63 as perhaps an error
# TODO: This actually reduces the AUC!?
# NOTE(review): the channel number is read from the second '_'-separated token, which
# presumes names shaped like 'chn_<number>...' — confirm against the EEG pipeline output
cols_to_drop = [
    name
    for name in study_features_cleaned_df.columns
    if 'chn_' in name and int(name.split('_')[1]) > 63
]
study_features_cleaned_df = study_features_cleaned_df.drop(columns=cols_to_drop)

# Drop all columns containing 'error' or 'r_squared'
# TODO: This doesn't make much difference to the predictions
cols_to_drop = [
    name
    for name in study_features_cleaned_df.columns
    if 'error' in name or 'r_squared' in name
]
study_features_cleaned_df = study_features_cleaned_df.drop(columns=cols_to_drop)

# Drop all columns containing 'cf', 'bw', or 'pw'
# TODO: This significantly reduces recall and AUC, false negatives and false positives are increased
# cols_to_drop = [col for col in study_features_cleaned_df.columns if any(x in col for x in ['cf', 'bw', 'pw'])]
# study_features_cleaned_df.drop(columns=cols_to_drop, inplace=True)

# TODO: Dropping all periodic features other than CF doesn't make much difference!!
# cols_to_drop = [col for col in study_features_cleaned_df.columns if any(x in col for x in ['bw', 'pw'])]
# study_features_cleaned_df.drop(columns=cols_to_drop, inplace=True)

# Drop all columns containing 'offset' or 'exponent'
# TODO: This reduces AUC and false positives are increased
# cols_to_drop = [col for col in study_features_cleaned_df.columns if 'offset' in col or 'exponent' in col]
# study_features_cleaned_df.drop(columns=cols_to_drop, inplace=True)

# Drop features
# # TODO: Increases false positives ....
# dropped_variables = ['gender']
# study_features_cleaned_df.drop(dropped_variables, axis=1, inplace=True)

# # TODO: Increases false negatives, reduces recall. ...... is this to be expected given the PD in age, look at control group ages?
# dropped_variables = ['age']
# study_features_cleaned_df.drop(dropped_variables, axis=1, inplace=True)

# Drop the subject identifier column
dropped_variables = ['subject_id']
study_features_cleaned_df = study_features_cleaned_df.drop(columns=dropped_variables)

# Shapes before and after cleaning
print(eeg_results_features_superset_df.shape)
print(study_features_cleaned_df.shape)

# Data Split ...

In [None]:
# Separate features (X) and target variable (y): every column except the target is a feature
targetName = "pd"
is_feature = study_features_cleaned_df.columns != targetName
featureNames = study_features_cleaned_df.columns[is_feature]

y = study_features_cleaned_df[targetName]
X = study_features_cleaned_df[featureNames]

# Split of training and testing data, 80:20, with a fixed seed so the split is repeatable
# NOTE(review): the split is not stratified on y — consider whether class balance
# between the train and test partitions matters for this dataset
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Only the four split partitions are kept
del targetName, featureNames, is_feature, X, y

# Transforms  ....

In [None]:
# Establish a transformation for categorical and numerical features,
# selecting the columns of each kind by dtype from the training split
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object', 'bool']).columns

print(f'Numerics {len(numerical_features)} \n', numerical_features)
print(f'Categoricals {len(categorical_features)} \n', categorical_features)

# NOTE(review): ColumnTransformer's default remainder='drop' discards any column not
# selected above (e.g. other integer/float widths, 'category') — confirm none exist
transformations = [
    (
        'cat',
        OneHotEncoder(drop='first', handle_unknown='infrequent_if_exist'),
        categorical_features,
    ),
    # ('num', RobustScaler(), numerical_features) - more false positives
    # ('num', StandardScaler(), numerical_features) - AUC reduced
    # ('num', MinMaxScaler(), numerical_features) -more false positives & AUC reduced
    ('num', 'passthrough', numerical_features),
]
column_transformer = ColumnTransformer(transformers=transformations)

# Add to pipeline, and later add other actions such as dropping rows, imputing etc etc
pipeline_steps = [
    #('imputer', SimpleImputer(strategy='mean')),
    ('col_transform', column_transformer),
]
data_prep_pipeline = Pipeline(pipeline_steps)

# Fit on the training split only
data_prep_pipeline.fit(X_train)

In [None]:
# Apply the data prep pipeline to the training and test partitions
X_train_transformed, X_test_transformed = (
    data_prep_pipeline.transform(partition) for partition in (X_train, X_test)
)

In [None]:
# Shapes of the loaded superset and the cleaned feature table
display(f'Original: {eeg_results_features_superset_df.shape}')
display(f'Cleaned: {study_features_cleaned_df.shape}')

# Shapes of the transformed train/test partitions
display(f'X_Train: {X_train_transformed.shape}')
display(f'X_Test: {X_test_transformed.shape}')

# Output feature names of the pipeline, kept in feature_names for later use
feature_names = data_prep_pipeline.get_feature_names_out()
display(feature_names)