# Imports

In [1]:
# General imports
import os
import sys
import glob
from pathlib import Path
from datetime import datetime

# Custom Functions
sys.path.append(os.path.abspath('../Notebooks/Utilities')) 
import cust_utilities as utils

# Maths, Pandas etc
import math
import numpy as np
import pandas as pd
import scipy as sci

# Plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.backends.backend_pdf import PdfPages

# Specialised
import mne


# Model Comparisons

In [2]:
# Get all the model evaluation run results and combine into a single df
#
# NOTE: `type` shadows the Python builtin of the same name. The name is kept
# because the CSV export cell reads it when building the export file name.
# Set it to 'models' to combine the '**/models_results_df**.pkl' files instead.
type = 'predictions'

data_folder_path = utils.get_folder_path('Model Comparisons')
file_name_pattern = f'**/{type}_results_df**.pkl'

# Sort the matches: glob returns paths in arbitrary order, which would make
# the row order of the combined df differ from run to run.
data_files = sorted(glob.glob(str(Path(data_folder_path) / file_name_pattern), recursive=True))
if not data_files:
    # Without this guard pd.concat([]) fails with "No objects to concatenate",
    # which gives no hint about which folder / pattern was searched.
    raise FileNotFoundError(
        f"No files matching '{file_name_pattern}' found under: {data_folder_path}"
    )

# SECURITY: unpickling can execute arbitrary code - only load result files
# that come from a trusted source.
df_list = [pd.read_pickle(next_file, compression='zip') for next_file in data_files]
models_evaluations_df = pd.concat(df_list, ignore_index=True)

print(models_evaluations_df.shape)
display(models_evaluations_df.head())


(4, 16)


Unnamed: 0,study,training_source_data_run,training_results_run,search_features_detail,search_features_selection,CV_search_time,CV_best_parameters,features_detail,features_selection,model_name,prediction_time,mcc,recall,precision,f1_score,specificity
0,IOWA_Simon,empty,empty,empty,empty,0,"{'classifier__C': 100, 'classifier__class_weig...",region,cf,LogisticRegression_v1,0.013465,0.493238,0.939394,0.788136,0.857143,0.479167
1,UNM_Oddball,empty,empty,empty,empty,0,"{'classifier__C': 100, 'classifier__class_weig...",region,cf,LogisticRegression_v1,0.023279,0.089087,0.76,0.527778,0.622951,0.32
2,UNM_Oddball,empty,empty,empty,empty,0,"{'classifier__criterion': 'entropy', 'classifi...",channel,"[cf, pw, bw]",RandomForest_v1,0.096132,0.257248,0.8,0.588235,0.677966,0.44
3,IOWA_Simon,empty,empty,empty,empty,0,"{'classifier__criterion': 'entropy', 'classifi...",channel,"[cf, pw, bw]",RandomForest_v1,0.063645,0.310927,0.818182,0.764151,0.790244,0.479167


In [3]:
# Write the combined results df to a time-stamped CSV in the data folder

timestamp = f"{datetime.now():%Y%m%d_%H%M}"
csv_filename = "Model_Comparisons_DF_{}_{}.csv".format(type, timestamp)
csv_filepath = os.path.join(data_folder_path, csv_filename)

# Never overwrite an export that already exists under the same name
if Path(csv_filepath).exists():
    raise FileExistsError(f'File Exists: {csv_filepath}')

models_evaluations_df.to_csv(csv_filepath, index=False)

# Summary of what was written and where
print(
    f"Successfully exported to CSV: {csv_filename}",
    f"Total rows exported: {len(models_evaluations_df)}",
    f"File location: {csv_filepath}",
    sep="\n",
)

Successfully exported to CSV: Model_Comparisons_DF_predictions_20250816_1519.csv
Total rows exported: 4
File location: /Users/stuartgow/GitHub/EEG_ML_Pipeline/Data/Model Comparisons/Model_Comparisons_DF_predictions_20250816_1519.csv


# Importance & Topographic Map

In [None]:
# Load SHAP importance CSV file into a df

data_folder_path = utils.get_folder_path('Model Comparisons')
shap_csv_pattern = '**/shap_importance_df**.csv'

# Sort the matches so the same file is picked on every run
# (glob returns paths in arbitrary order).
shap_files = sorted(glob.glob(str(Path(data_folder_path) / shap_csv_pattern), recursive=True))
if not shap_files:
    # Fail with the searched location instead of a bare IndexError on [0]
    raise FileNotFoundError(
        f"No files matching '{shap_csv_pattern}' found under: {data_folder_path}"
    )
if len(shap_files) > 1:
    # Only one file is loaded - make the choice visible to the reader
    print(f"{len(shap_files)} SHAP importance files found - loading the first: {shap_files[0]}")
importance_df = pd.read_csv(shap_files[0])

# Tidy up the feature names by dropping the 'numeric__channel_' text
if 'feature' in importance_df.columns:
    importance_df['feature'] = importance_df['feature'].str.replace('numeric__channel_', '', regex=False)

print(importance_df.shape)
display(importance_df.head())


In [None]:
# Horizontal bar chart of the first `count` rows of the importance df
# (presumably the df is already sorted by importance - verify against the CSV)

count = 25
plot_df = importance_df.iloc[:count]

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(plot_df['feature'], plot_df['importance_%'], color='skyblue')
ax.set_xlabel('Importance (%)')
ax.set_ylabel('Feature')
ax.set_title(f'Feature Contributions - Top {count}')
ax.invert_yaxis()  # first row at the top of the chart
plt.show()

In [None]:
# Get the CF_0 features for a topographic plot

# Keep the rows whose feature name contains '_cf_0', then strip that text so
# the remaining name can be compared with the montage channel names.
is_cf0 = importance_df['feature'].str.contains('_cf_0', na=False)
cf_df = importance_df.loc[is_cf0].copy()
cf_df['feature'] = cf_df['feature'].str.replace('_cf_0', '', regex=False)

# Standard 10-20 montage: report how many of the feature names it contains
montage = mne.channels.make_standard_montage('standard_1020')
available_electrodes = set(montage.ch_names)
target_electrodes = set(cf_df['feature'])
coverage = target_electrodes & available_electrodes
print(f"Coverage: {len(coverage)}/{len(target_electrodes)} electrodes")



In [None]:
# Plot the Importance

# Keep only the rows whose electrode name exists in the montage:
# info.set_montage() raises for channels it cannot place, and the
# coverage check in the previous cell only reports, it does not filter.
topo_df = cf_df[cf_df['feature'].isin(montage.ch_names)]
if topo_df.empty:
    raise ValueError('None of the CF_0 features match an electrode in the montage')
skipped_count = len(cf_df) - len(topo_df)
if skipped_count:
    print(f"Skipping {skipped_count} feature(s) with no position in the montage")

# Build an Info object that carries the electrode positions
electrode_list = topo_df['feature'].tolist()
info = mne.create_info(
    ch_names=electrode_list,
    sfreq=1000,  # Dummy sampling frequency
    ch_types='eeg'
)
info.set_montage(montage)

# Generate topographic map
importance_values = topo_df['importance_%'].values
fig, ax = plt.subplots(figsize=(10, 8))
im, _ = mne.viz.plot_topomap(
    data=importance_values,
    # Pass the Info so the x/y coordinates are derived from the montage.
    # Layout.pos (from mne.find_layout) is in normalised layout units and
    # does not line up with the head outline drawn by plot_topomap.
    pos=info,
    axes=ax,
    cmap='RdYlBu_r',
    contours=6,
    show=False
)
# Enhance visualization
ax.set_title('Feature Contributions - Topographic Map')
fig.colorbar(im, ax=ax, label='Importance (%)')
plt.show()