# Tools

# Imports

In [2]:
# General imports
import os
import sys
import gc
import warnings
from typing import Literal

from datetime import datetime
from pprint import pprint
import time
import pickle
import random
from collections import Counter

# Custom Functions
sys.path.append(os.path.abspath('../Notebooks/Utilities')) 
import cust_utilities as utils

# Maths, Pandas etc
import math
import numpy as np
import pandas as pd
import scipy as sci

# Plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.backends.backend_pdf import PdfPages

# ML Prep
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_selector

# ML Training
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Features DF Tidy Up

In [3]:
# Load the study details and the stored results of one EEG features run

#---- Parameters --------------------------------
# Study to load and the features-extraction run whose results are wanted
study_name = 'IOWA_Simon'
eeg_features_run = '1b_EEG_Features_Results_Run_20250726_full_run'

#----------------------------------------------------

# Study-level details and the subjects table (zip-compressed pickles)
study_folder_path = utils.get_folder_path('Study_' + study_name)
study_info = pd.read_pickle(
    study_folder_path + '/study_inf.pkl', compression='zip')
study_subjects_df = pd.read_pickle(
    study_folder_path + '/study_subjects_df.pkl', compression='zip')

# Features run: its folder, its run details, and the name of the
# preprocessing run it was built from
eeg_features_run_results_path = utils.get_folder_path(
    study_info['eeg_processing_results_path'] + '/' + eeg_features_run)
eeg_features_run_details = pd.read_pickle(
    eeg_features_run_results_path + '/run_details.pkl', compression='zip')
eeg_preprocessing_run = eeg_features_run_details['eeg_preprocessed_data']

# Preprocessing run: its folder, cleaned-files folder and run details
eeg_preprocessing_run_results_path = utils.get_folder_path(
    study_info['eeg_processing_results_path'] + '/' + eeg_preprocessing_run)
eeg_preprocessed_data_path = utils.get_folder_path(
    eeg_preprocessing_run_results_path + '/Cleaned_files')
eeg_preprocessing_run_details = pd.read_pickle(
    eeg_preprocessing_run_results_path + '/run_details.pkl', compression='zip')

# Result tables produced by the features run
eeg_processing_results_df = pd.read_pickle(
    eeg_features_run_results_path + '/eeg_processing_results_df.pkl', compression='zip')
eeg_features_superset_df = pd.read_pickle(
    eeg_features_run_results_path + '/eeg_features_superset_df.pkl', compression='zip')

# Human-readable recap of the parameters behind the loaded data:
# one heading followed by one bullet line per item
summary = '\n'.join([
    'EEG Processing Parameters',
    f"- Study: {study_info['study_name']} {study_info['dataset_ref']}",
    f"- EEG Processing Run: {eeg_preprocessing_run_details['run_name']}",
    f"-   Preprocess Params: {eeg_preprocessing_run_details['preprocess_params']}",
    f"-   ICA Params: {eeg_preprocessing_run_details['artefact_params']}",
    f"- EEG Features Run: {eeg_features_run}",
    f"-   PSD Params: {eeg_features_run_details['psd_params']}",
    f"-   SpecParam Params: {eeg_features_run_details['specparam_params']}",
])
print(summary + '\n')


EEG Processing Parameters
- Study: IOWA_Simon ds004580-1.0.0
- EEG Processing Run: 1a_EEG_Preprocessing_Run_20250724_full_ica
-   Preprocess Params: {'band_pass_lf': 1, 'band_pass_hf': 100, 'band_pass_method': 'iir', 'phase': 'zero', 'linear_detrend': 'linear', 'channel_referencing': 'average'}
-   ICA Params: {'ica_method': 'infomax', 'ICA_rejection_threshold': 0.8}
- EEG Features Run: 1b_EEG_Features_Results_Run_20250726_full_run
-   PSD Params: {'method': 'welch', 'fmin': 1, 'fmax': 100, 'exclude': []}
-   SpecParam Params: {'peak_width_limits': [1, 12], 'max_n_peaks': 6, 'min_peak_height': 0.1, 'peak_threshold': 2.0, 'aperiodic_mode': 'fixed', 'fit_window': [2, 40], 'fit_error_threshold': 0.1, 'fit_r2_threshold': 0.9}



In [4]:
# Temp to Amend the Features Superset df with Regions / Channels separate
#
# The 'channel' column holds either a channel name or one of the region
# names below. This cell moves the region names into their own 'region'
# column (placed straight after 'subject_id') and blanks 'channel' on those
# rows, so every row carries exactly one of the two labels.

# TODO: Amend EEG Extract to do this directly

valid_regions = ['frontal', 'central', 'posterior']

# Rows whose 'channel' value is really a region name
region_rows = eeg_features_superset_df['channel'].isin(valid_regions)
region_labels = eeg_features_superset_df['channel'].where(region_rows)

# Keep this cell safe to re-run: once the split has been applied, 'channel'
# no longer contains any region names, so recomputing 'region' from it alone
# would overwrite every label with NaN. Labels already present in an existing
# 'region' column therefore take precedence.
if 'region' in eeg_features_superset_df.columns:
    region_labels = eeg_features_superset_df['region'].combine_first(region_labels)

eeg_features_superset_df['region'] = region_labels
eeg_features_superset_df.loc[region_rows, 'channel'] = np.nan

# Move 'region' to sit immediately after 'subject_id'
cols = list(eeg_features_superset_df.columns)
cols.insert(cols.index('subject_id') + 1, cols.pop(cols.index('region')))
eeg_features_superset_df = eeg_features_superset_df[cols]

# Save it to temp df alongside original
eeg_features_superset_df.to_pickle(eeg_features_run_results_path + '/temp_eeg_features_superset_df.pkl', compression='zip')


In [5]:
# Temp to generate a flattened combined subjects df and features df
# So one row per subject with several hundred features

# TODO: Put at the end of the EEG extract to generate store initially

def combine_subjects_features(subjects_df, features_df):
    """Flatten per-region / per-channel features into one wide row per subject.

    Each row of ``features_df`` holds the features of one region or one
    channel for one subject. Every feature column of such a row is copied
    into that subject's output row as ``region_<region>_<col>`` when the row
    has a region label, otherwise as ``channel_<channel>_<col>``.

    Parameters
    ----------
    subjects_df : pd.DataFrame
        Subject metadata with a 'subject_id' column. A 'study_name' column,
        if present, is dropped. When a subject appears more than once, only
        its first row is used.
    features_df : pd.DataFrame
        Must contain 'subject_id', 'region' and 'channel' columns; every
        other column is treated as a feature.

    Returns
    -------
    pd.DataFrame
        One row per subject that has at least one row in ``features_df``,
        in order of first appearance in ``subjects_df``. Columns are
        'subject_id', the remaining metadata columns, then the flattened
        features. Features a subject does not have are NaN.

    Notes
    -----
    - Feature rows with neither a region nor a channel label cannot be
      named, so they are skipped and reported in a single warning (they
      were previously written out as ``channel_nan_*`` columns).
    - Subjects whose 'subject_id' is missing are skipped.
    - If a subject has two rows with the same region/channel label, the
      later row overwrites the earlier one.
    """
    label_cols = {'subject_id', 'region', 'channel'}

    if 'study_name' in subjects_df.columns:
        subjects_df = subjects_df.drop(columns=['study_name'])

    # One metadata row per subject (first occurrence wins), in original order.
    subjects_meta = subjects_df.drop_duplicates(subset='subject_id', keep='first')

    # Split the features by subject once, instead of re-filtering the whole
    # frame with a boolean mask for every subject. groupby drops missing keys,
    # so rows without a subject_id never match anyone.
    features_by_subject = {
        subj_id: subj_rows
        for subj_id, subj_rows in features_df.groupby('subject_id', sort=False)
    }
    feature_cols = [col for col in features_df.columns if col not in label_cols]

    unlabelled_rows = 0
    flattened_rows = []
    for _, meta_row in subjects_meta.iterrows():
        subj_id = meta_row['subject_id']
        subj_features_df = features_by_subject.get(subj_id)
        if subj_features_df is None or subj_features_df.empty:
            continue

        # Subject ID first, then the rest of the metadata
        row_dict = {'subject_id': subj_id}
        row_dict.update(meta_row.to_dict())

        # Flattened Region & Channel data
        for _, feature_row in subj_features_df.iterrows():
            region = feature_row['region']
            channel = feature_row['channel']
            if pd.notna(region):
                prefix = f'region_{region}_'
            elif pd.notna(channel):
                prefix = f'channel_{channel}_'
            else:
                # No usable label: would otherwise become 'channel_nan_*'
                unlabelled_rows += 1
                continue
            for col in feature_cols:
                row_dict[f'{prefix}{col}'] = feature_row[col]

        flattened_rows.append(row_dict)

    if unlabelled_rows:
        warnings.warn(
            f'Skipped {unlabelled_rows} feature row(s) with neither a region '
            'nor a channel label',
            stacklevel=2,
        )

    return pd.DataFrame(flattened_rows)

# Build the one-row-per-subject table from the subjects and features frames
eeg_features_flattened_df = combine_subjects_features(
    subjects_df=study_subjects_df,
    features_df=eeg_features_superset_df,
)

# Report the title and shape on separate lines, then preview the first rows
print('Combined Flattened Features DataFrame', eeg_features_flattened_df.shape, sep='\n')
display(eeg_features_flattened_df.head(5))

# Save alongside original
eeg_features_flattened_df.to_pickle(
    eeg_features_run_results_path + '/eeg_features_flattened_df.pkl',
    compression='zip',
)


Combined Flattened Features DataFrame
(147, 1478)


Unnamed: 0,subject_id,pd,age,gender,region_frontal_offset,region_frontal_exponent,region_frontal_cf_0,region_frontal_pw_0,region_frontal_bw_0,region_frontal_cf_1,...,channel_Iz_pw_3,channel_Iz_bw_3,channel_Iz_cf_4,channel_Iz_pw_4,channel_Iz_bw_4,channel_Iz_cf_5,channel_Iz_pw_5,channel_Iz_bw_5,channel_Iz_error,channel_Iz_r_squared
0,sub-001,1,80,M,-11.88452,0.82683,6.358789,0.546284,3.421636,10.805299,...,,,,,,,,,,
1,sub-002,1,81,M,-10.710919,1.458288,6.640354,0.606718,2.0,11.66597,...,,,,,,,,,,
2,sub-003,1,68,F,-11.275927,1.145069,11.154159,0.845334,12.0,,...,,,,,,,,,,
3,sub-004,1,80,M,-11.231566,1.410881,6.126451,0.855957,3.450809,,...,,,,,,,,,,
4,sub-005,1,56,M,-12.182214,0.317367,6.138914,0.308579,2.0,12.773196,...,,,,,,,,,,
