# 2a Features Extraction

Runs:
- Imports & Functions [Jump To](#run-0-all-imports-etc)
- Study Load & Inspections [Jump To](#run-1-study-load--inspections)
- Feature Selection [Jump To](#run-2-feature-selection)

To Do:
- ?? Rejection of channels, subjects ....
- ?? Use of canonical bands
- Data prep, save data results for ML training / ML Execution .... or pipeline rerun?

# Imports & Functions

## Imports

In [None]:
# General imports
import os
import sys
import gc
import warnings
from typing import Literal

from datetime import datetime
from pprint import pprint
import time
import pickle
import random
from collections import Counter

# Custom Functions
sys.path.append(os.path.abspath('../Notebooks/Utilities')) 
import cust_utilities as utils

# Maths, Pandas etc
import math
import numpy as np
import pandas as pd
import scipy as sci

# Plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.backends.backend_pdf import PdfPages

# ML Prep
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import make_column_selector

# ML Training
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay

# Random Forest
from sklearn.ensemble import RandomForestClassifier

## Results & Features Inspection

In [None]:
# Plots for subject info
#

def subject_info_plot(subjects_df):
    """Plot summary charts describing the study subjects.

    Two figures are shown: bar charts of the 'pd' and 'gender' value counts,
    followed by a histogram and a box plot of 'age'.

    Args:
        subjects_df: DataFrame with 'pd', 'gender' and 'age' columns.
    """

    def _counts_bar(ax, column, title, label_map):
        # value_counts() orders categories by frequency, not by value, so the
        # tick labels are looked up from the actual index values. A fixed
        # label order would mislabel the bars whenever the less common
        # category happened to come first.
        counts = subjects_df[column].value_counts()
        positions = list(range(len(counts)))
        ax.set_title(title)
        ax.bar(positions, counts.values, color='skyblue', edgecolor='black')
        ax.set_xticks(positions)
        # Values without a friendly name fall back to their string form
        ax.set_xticklabels([label_map.get(value, str(value)) for value in counts.index])
        for position, count in enumerate(counts.tolist()):
            ax.text(position, count/2, str(count), ha='center', va='center', fontsize=12)

    # PD & Gender
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
    fig.suptitle('Study Subjects - PD & Gender', fontsize=18)

    # NOTE(review): assumes 'pd' is coded 1/0 (or True/False) and 'gender' as
    # 'M'/'F', as in the sample output shown in this notebook - confirm.
    _counts_bar(axes[0], 'pd', 'PD', {1: 'Yes', 0: 'No'})
    _counts_bar(axes[1], 'gender', 'Gender', {'M': 'Male', 'F': 'Female'})

    plt.tight_layout()
    plt.show()

    # Age Distribution & Box
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))
    fig.suptitle('Study Subjects - Age Distribution', fontsize=18)

    axes[0].hist(subjects_df['age'], bins=15, color='skyblue', edgecolor='black')

    axes[1].set_xticks([0])
    box = axes[1].boxplot(subjects_df['age'].dropna(), patch_artist=True)
    for patch in box['boxes']:
        patch.set(facecolor='skyblue')

    plt.tight_layout()
    plt.show()


In [None]:
# Plots for EEG Preprocessing Results
#

def eeg_preprocess_results_plot(results_df):
    """Show a three-panel figure of EEG preprocessing metrics for all subjects.

    Panels, left to right: bar chart of the quality-warning flag counts,
    box plot of the ICA rejection level, box plot of the epoch rejection level.

    Args:
        results_df: DataFrame with 'EEG_preprocessing_quality_warning',
            'ICA_rejection_level' and 'epoch_rejection_level' columns.
    """
    figure, (warning_ax, ica_ax, epoch_ax) = plt.subplots(nrows=1, ncols=3, figsize=(16, 6))
    figure.suptitle('EEG Preprocessing Metrics - All Subjects', fontsize=18)

    # Quality warning counts, forced into True/False order so the fixed
    # 'Yes'/'No' tick labels always line up with the bars
    warning_counts = (
        results_df['EEG_preprocessing_quality_warning']
        .value_counts()
        .reindex([True, False], fill_value=0)
    )
    warning_ax.set_title('Overall Quality Warning Count')
    warning_ax.bar(
        warning_counts.index.astype(str),
        warning_counts.values,
        color=['salmon', 'lightgreen'],
        edgecolor='black',
    )
    warning_ax.set_xticks(range(len(warning_counts.index)))
    warning_ax.set_xticklabels(['Yes', 'No'])
    for position, total in enumerate(warning_counts.tolist()):
        warning_ax.text(position, total/2, str(total), ha='center', va='center', fontsize=12)

    # Rejection levels: one box plot per metric, NaNs dropped first
    box_panels = (
        (ica_ax, 'ICA - ICs Rejection Level', 'ICA_rejection_level'),
        (epoch_ax, 'Epoch Rejection Level', 'epoch_rejection_level'),
    )
    for panel, heading, column in box_panels:
        panel.set_title(heading)
        drawn = panel.boxplot(results_df[column].dropna(), patch_artist=True)
        for box_patch in drawn['boxes']:
            box_patch.set(facecolor='skyblue')

    plt.tight_layout()
    plt.show()


In [None]:
# Plots for EEG SpecParam Results
#

def eeg_specparam_results_plot(results_df):
    """Show the SpecParam fit-quality figures for all subjects.

    First figure (three panels): quality-warning flag counts, box plot of the
    mean fit error, box plot of the mean R-squared. Second figure: box plot
    of the number of flagged channels per row.

    Args:
        results_df: DataFrame with 'chn_SPM_fit_quality_warning',
            'chn_error_mean', 'chn_r2_mean' and 'chn_flagged_channels' columns.
    """
    overview_fig, (warning_ax, error_ax, r2_ax) = plt.subplots(nrows=1, ncols=3, figsize=(16, 6))
    overview_fig.suptitle('EEG SpecParam Fit Metrics - All Subjects', fontsize=18)

    # Quality warning counts in fixed True/False order to match the tick labels
    warning_counts = (
        results_df['chn_SPM_fit_quality_warning']
        .value_counts()
        .reindex([True, False], fill_value=0)
    )
    warning_ax.set_title('Overall Quality Warning Count')
    warning_ax.bar(
        warning_counts.index.astype(str),
        warning_counts.values,
        color=['salmon', 'lightgreen'],
        edgecolor='black',
    )
    warning_ax.set_xticks(range(len(warning_counts.index)))
    warning_ax.set_xticklabels(['Yes', 'No'])
    for position, total in enumerate(warning_counts.tolist()):
        warning_ax.text(position, total/2, str(total), ha='center', va='center', fontsize=12)

    # Error mean and R-squared mean, NaNs dropped before plotting
    box_panels = (
        (error_ax, 'Error Mean', 'chn_error_mean'),
        (r2_ax, 'R2 Mean', 'chn_r2_mean'),
    )
    for panel, heading, column in box_panels:
        panel.set_title(heading)
        drawn = panel.boxplot(results_df[column].dropna(), patch_artist=True)
        for box_patch in drawn['boxes']:
            box_patch.set(facecolor='skyblue')

    plt.tight_layout()
    plt.show()

    def _flagged_total(entry):
        # Only list entries are counted; anything else contributes zero
        if isinstance(entry, list):
            return len(entry)
        return 0

    # Number of flagged channels
    flagged_fig, flagged_ax = plt.subplots(nrows=1, ncols=1, figsize=(3, 4))
    flagged_counts = results_df['chn_flagged_channels'].apply(_flagged_total)
    flagged_ax.set_title('Flagged Channels Count')
    drawn = flagged_ax.boxplot(flagged_counts, patch_artist=True)
    for box_patch in drawn['boxes']:
        box_patch.set(facecolor='salmon')

    plt.tight_layout()
    plt.show()

# Run: 0. All Imports etc

In [None]:
# Dummy - Run All above

# Run: 1. Study Load & Inspections

In [None]:
# Study and Processing Run Details
#
# Loads the saved study info, subject table and upstream EEG run results,
# then creates the results folder for this extraction run and saves its
# parameters. Statement order matters: later paths are built from values
# read by earlier statements.

#---- Parameters --------------------------------
# Study & Processing Run Details
study_name = 'IOWA_Rest'
# Name of the upstream EEG features run folder whose results are loaded below
eeg_features_run = '1b_EEG_Features_Results_Run_20250724_full_run'

# Free-text suffix appended to this run's folder name
run_description = 'test_extraction'
# Switches the VERBOSE / TEST_SUBJECTS settings at the end of this cell
test_mode = True

# Extraction Parameters
extraction_params = {}
# TODO: Add selection etc parameters here
#----------------------------------------------------

# Get existing study details, if exists
# NOTE(review): pickles are loaded as-is; only point these paths at trusted files.
study_folder_path = utils.get_folder_path('Study_' + study_name)
study_info = pd.read_pickle(study_folder_path + '/study_inf.pkl', compression='zip')
study_subjects_df = pd.read_pickle(study_folder_path + '/study_subjects_df.pkl', compression='zip')

# Processing Results Data
# The features run details name the preprocessing run they were built from
eeg_features_run_results_path = utils.get_folder_path(study_info['eeg_processing_results_path'] + '/' + eeg_features_run)
eeg_features_run_details = pd.read_pickle(eeg_features_run_results_path + '/run_details.pkl', compression='zip')
eeg_preprocessing_run = eeg_features_run_details['eeg_preprocessed_data']

eeg_preprocessing_run_results_path = utils.get_folder_path(study_info['eeg_processing_results_path'] + '/' + eeg_preprocessing_run)
eeg_preprocessed_data_path = utils.get_folder_path(eeg_preprocessing_run_results_path + '/Cleaned_files' )
eeg_preprocessing_run_details = pd.read_pickle(eeg_preprocessing_run_results_path + '/run_details.pkl', compression='zip')
# Both result frames are read from the *features* run folder
eeg_processing_results_df = pd.read_pickle(eeg_features_run_results_path + '/eeg_processing_results_df.pkl', compression='zip')
eeg_features_superset_df = pd.read_pickle(eeg_features_run_results_path + '/eeg_features_superset_df.pkl', compression='zip')

# Setup the extraction run and results folder & save params
# The run name carries the date only (no time of day)
current_date = datetime.now().strftime('%Y%m%d')
run_name = f'2a_Feature_Extraction_Data_Run_{current_date}_{run_description}'
# NOTE(review): exists_ok=False presumably makes a same-day rerun with the same
# description fail if the folder already exists - confirm in cust_utilities.
run_results_path = utils.extend_folder_path(study_info['ml_training_results_path'], run_name, exists_ok=False)

# Persist this run's parameters alongside its results
run_details = pd.Series({
    'study_name': study_name,
    'run_name': run_name,
    'extraction_params': extraction_params,
})
run_details.to_pickle(run_results_path + '/run_details.pkl', compression='zip')

# Set progress messages, testing
if test_mode:
    VERBOSE = True
    TEST_SUBJECTS = [0,5,101]
    # TEST_CHANNELS = ['F5', 'C3', 'P3', 'F6', 'C6', 'P6']
else:
    VERBOSE = False
    TEST_SUBJECTS = []
    # TEST_CHANNELS = []

In [None]:
# Processing Run Details & Data Structures
# The summary is assembled as a list of lines and joined once
summary_lines = [
    'EEG Processing Parameters',
    f"- Study: {study_info['study_name']} {study_info['dataset_ref']}",
    f"- EEG Processing Run: {eeg_preprocessing_run_details['run_name']}",
    f"-   Preprocess Params: {eeg_preprocessing_run_details['preprocess_params']}",
    f"-   ICA Params: {eeg_preprocessing_run_details['artefact_params']}",
    f"- EEG Features Run: {eeg_features_run}",
    f"-   PSD Params: {eeg_features_run_details['psd_params']}",
    f"-   SpecParam Params: {eeg_features_run_details['specparam_params']}",
    f"- Features Extraction Run: {run_name}",
    f"-   Feature Extraction Params: {run_details['extraction_params']}",
]
summary = '\n'.join(summary_lines)
print(f'{summary}\n')

# Processing Metrics: shape, first rows, null-fit total, then the two metric figures
print('EEG Processing Results')
print(eeg_processing_results_df.shape)
display(eeg_processing_results_df.head())

null_fit_total = sum(eeg_processing_results_df["chn_null_fits"])
print(f'Null Fits: {null_fit_total}')
eeg_preprocess_results_plot(eeg_processing_results_df)
eeg_specparam_results_plot(eeg_processing_results_df)


In [None]:
# Study Subjects Summary: heading and shape, first rows, then the subject charts
print('Study Subjects', study_subjects_df.shape, sep='\n')
display(study_subjects_df.head())

subject_info_plot(subjects_df=study_subjects_df)

In [None]:
# Combine eeg_features_superset_df with study_subjects_df so each subject has one row
# and columns are named with channel/region prefixes

def combine_features_with_subjects(features_df, subjects_df, valid_regions):
    """Flatten per-channel EEG features into one wide row per subject.

    Every row of `features_df` contributes one column per feature, named
    '<channel>_<feature>' for ordinary channels and
    '<channel>_region_<feature>' when the 'channel' value is listed in
    `valid_regions`. If a subject has several rows for the same channel, the
    last row wins.

    Args:
        features_df: Long-format DataFrame with 'subject_id' and 'channel'
            columns; every other column is treated as a feature.
        subjects_df: DataFrame with 'subject_id', 'pd', 'age' and 'gender'.
        valid_regions: Collection of 'channel' values that denote regions.

    Returns:
        DataFrame with one row per unique subject in `subjects_df`: the
        subject id, its 'pd'/'age'/'gender' values (taken from the first
        matching row), then the flattened feature columns. Subjects without
        feature rows carry only the id and meta columns.
    """
    # The feature column list is the same for every row, so build it once
    feature_cols = [col for col in features_df.columns if col not in ('subject_id', 'channel')]
    # Split the features by subject in one pass instead of re-filtering the
    # whole frame for every subject
    features_by_subject = dict(tuple(features_df.groupby('subject_id', sort=False)))

    combined_rows = []
    for subj_id in subjects_df['subject_id'].unique():
        # Subject meta info (first row if the subject is listed more than once)
        subj_meta = subjects_df.loc[subjects_df['subject_id'] == subj_id, ['pd', 'age', 'gender']].iloc[0]
        row = {'subject_id': subj_id, **subj_meta.to_dict()}

        subj_features = features_by_subject.get(subj_id)
        if subj_features is not None:
            # For each channel/region, add features with prefix
            for _, feat_row in subj_features.iterrows():
                ch = feat_row['channel']
                prefix = f"{ch}_" if ch not in valid_regions else f"{ch}_region_"
                for col in feature_cols:
                    row[f"{prefix}{col}"] = feat_row[col]
        combined_rows.append(row)

    return pd.DataFrame(combined_rows)

# `valid_regions` is otherwise only assigned in the Run 2 feature selection
# cell further down, so running the notebook top to bottom would raise a
# NameError here. The same region names are defined locally instead.
valid_regions = ['frontal', 'central', 'posterior']

combined_features_df = combine_features_with_subjects(eeg_features_superset_df, study_subjects_df, valid_regions)
print('Combined Features DataFrame')
print(combined_features_df.shape)
display(combined_features_df.head())

In [None]:
# Combine the subject 'meta' df and EEG features df
# With one row per subject and several hundred 'features'

def combine_subjects_features(subjects_df, features_df):
    """Build a one-row-per-subject DataFrame from the subject meta data.

    Work in progress: the EEG features are looked up for each subject but are
    not yet flattened into the row, so the result holds only the columns of
    `subjects_df`, with 'subject_id' first.

    Args:
        subjects_df: DataFrame with a 'subject_id' column plus meta columns.
        features_df: DataFrame with a 'subject_id' column.

    Returns:
        DataFrame with one row per unique 'subject_id', each taken from the
        first matching row of `subjects_df`.
    """
    rows = []
    for subject in subjects_df['subject_id'].unique():
        is_subject = subjects_df['subject_id'] == subject
        meta = subjects_df.loc[is_subject].iloc[0].to_dict()

        # Looked up ready for the flattening step below; not used yet
        subject_features = features_df[features_df['subject_id'] == subject]

        # Subject ID first, then the meta data
        # TODO: add the flattened region & channel data here
        rows.append({'subject_id': subject, **meta})

    return pd.DataFrame(rows)


# Try the work-in-progress combiner on the full study data and inspect the result
test = combine_subjects_features(subjects_df=study_subjects_df, features_df=eeg_features_superset_df)
print('Combined Features DataFrame', test.shape, sep='\n')
display(test.head())

Combined Features DataFrame
(149, 5)


Unnamed: 0,subject_id,study_name,pd,age,gender
0,sub-001,IOWA_Rest,1,80,M
1,sub-002,IOWA_Rest,1,81,M
2,sub-003,IOWA_Rest,1,68,F
3,sub-004,IOWA_Rest,1,80,M
4,sub-005,IOWA_Rest,1,56,M


In [None]:
# Features Superset: heading and shape, then the first rows
print('EEG Features Superset', eeg_features_superset_df.shape, sep='\n')
display(eeg_features_superset_df.head())


In [None]:
# SpecParam Results

# TODO: Summary of features? What detail ............ 

# aperiodic_components_plot(eeg_features_superset_df)
# periodic_components_plot(eeg_features_superset_df)
# Adapt from xx Features Extraction, xx Model Training


# New Feature Selection

In [None]:
# Feature Selection using a Custom Transformer class
#

from sklearn.base import BaseEstimator, TransformerMixin

class FeaturesSelection(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps a subset of the DataFrame's columns.

    Work in progress: `fit` currently retains every column whose name
    contains 'subject_id', 'channel' or 'exponent' (substring match) and
    prints trace output; `features_detail_level` is stored and reported but
    does not yet influence the selection.

    Attributes set by `fit`:
        selected_features_: list of the retained column names.
    """

    def __init__(self, features_detail_level):
        # Parameters for the selection
        self.features_detail_level = features_detail_level

    @staticmethod
    def _require_dataframe(X):
        # Selection is by column name, so only DataFrames are accepted
        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a pandas DataFrame for feature selection.")

    def fit(self, X, y=None):
        """Record which columns of `X` to keep.

        Args:
            X: pandas DataFrame to select columns from.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            self

        Raises:
            ValueError: If `X` is not a pandas DataFrame.
        """
        self._require_dataframe(X)

        # Feature selection training / fit

        # TODO: Temp test
        print(f'Trace: Feature Selection with: {self.features_detail_level}')
        temp_cols_to_retain = ['subject_id', 'channel'] + ['exponent']
        # Column indexing already returns a new frame, so X is not copied first
        selected_df = X[[col for col in X.columns if any(name in col for name in temp_cols_to_retain)]]
        print(f'Trace Selected DF')
        print(selected_df.head())

        self.selected_features_ = list(selected_df)
        print(f'Trace: Selected Features {self.selected_features_}')
        return self

    def transform(self, X, y=None):
        """Return the columns of `X` recorded by `fit`.

        Args:
            X: pandas DataFrame containing the selected columns.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            DataFrame restricted to `selected_features_`.

        Raises:
            ValueError: If `X` is not a pandas DataFrame.
        """
        self._require_dataframe(X)

        # Apply feature selection (the previous full copy of X was never used)
        return X[self.selected_features_]


In [None]:
# Both frames are independent copies of the full features superset (no real split here)
X_train, X_test = eeg_features_superset_df.copy(), eeg_features_superset_df.copy()

In [None]:
# Features to drop
features_to_drop = ['subject_id']


def drop_features(df, features):
    """Return `df` without the columns named in `features`."""
    return df.drop(columns=features)


dropper = FunctionTransformer(func=drop_features, kw_args={'features': features_to_drop})

# Numeric columns: fill missing values with 0, then scale
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('scale_num', RobustScaler()),  # TODO: vs standardscaler?
])

# Categorical columns: one-hot encode, dropping the first level of each
cat_pipeline = Pipeline(steps=[
    ('encode_cat', OneHotEncoder(drop='first', handle_unknown='infrequent_if_exist')),
])

# Columns are routed by dtype when the transformer is fitted
numeric_selector = make_column_selector(dtype_include=['float64', 'int64'])
categorical_selector = make_column_selector(dtype_include='object')
cols_transform = ColumnTransformer(transformers=[
    ('numeric', num_pipeline, numeric_selector),
    ('categorical', cat_pipeline, categorical_selector),
])

# Establish and train/fit the overall preparation pipeline:
# custom selection -> drop id columns -> scale / encode
features_prep_pipeline = Pipeline(steps=[
    ('test_fs', FeaturesSelection(features_detail_level='Regions')),
    ('drop_columns', dropper),
    ('data_preprocess', cols_transform),
])
features_prep_pipeline.fit(X_train)

In [None]:
# Minimal pipeline holding only the custom selection step, fitted on the full superset
test_pipeline = Pipeline(steps=[('test_fs', FeaturesSelection(features_detail_level='Regions'))])

test_pipeline.fit(eeg_features_superset_df)


In [None]:
# Both results come from transforming the same full superset with the selection-only pipeline
X_train_transformed, X_test_transformed = (
    test_pipeline.transform(eeg_features_superset_df),
    test_pipeline.transform(eeg_features_superset_df),
)

# Run: 2. Feature Selection

In [None]:
#-----------
# TODO: Add selection to notebook parameters
FeatureDetailLevel = Literal['regions', 'channels']

feature_detail_level: FeatureDetailLevel = 'channels'  # regions or channels
# features_retained = ['offset', 'exponent']
# features_retained = ['cf', 'pw', 'bw']
features_retained = ['cf', 'pw', 'bw', 'offset', 'exponent']


# TODO: Add to constants or change Superset with two columns, 'Regions' and 'Channels' then select on that
valid_regions = ['frontal', 'central', 'posterior']
id_columns = ['subject_id', 'channel']
#-----------

# TODO: Add more sophisticated selection, eg PCA etc?
# TODO: Put this into a custom classifier class for use in a pipeline? Benefits?
# TODO: Compare the impact of different feature selections, all, periodic, aperiodic, region, channel etc?
# TODO: Even do this in a grid search but with a static model to avoid huge permutations and running time

# Select Specific Features from the EEG Features Superset
# Create selected features, flattened to one row/vector per subject
#

subjects = study_subjects_df['subject_id'].unique()
columns_retained = id_columns + features_retained  # kept in case other cells read it

# Everything below up to the loop is the same for every subject, so it is
# computed once rather than per subject / channel / column.

# Feature columns are those whose name STARTS with a retained feature name.
# The earlier substring pre-filter kept extra columns that the prefix test
# then skipped, so a single prefix rule gives the same result.
feature_columns = [
    col for col in eeg_features_superset_df.columns
    if col.startswith(tuple(features_retained))
]

# Retain defined level of detail: regions or channels
in_regions = eeg_features_superset_df['channel'].isin(valid_regions)
level_rows = in_regions if feature_detail_level == 'regions' else ~in_regions
level_features_df = eeg_features_superset_df.loc[level_rows, id_columns + feature_columns]

# Iterate through all subjects and flatten their channel rows into one row each
subjects_features = []
for next_subj in subjects:
    subj_df = level_features_df[level_features_df['subject_id'] == next_subj]

    # Iterate through subject channels to combine into one row
    row_dict = {'subject_id': next_subj}
    for next_channel in subj_df['channel'].unique():
        ch_row = subj_df[subj_df['channel'] == next_channel]
        for col_name in feature_columns:
            # .item() requires exactly one row per (subject, channel) and
            # raises ValueError otherwise
            row_dict[f'{next_channel}_{col_name}'] = ch_row[col_name].item()

    # Add the subject meta data
    subj_meta = study_subjects_df.loc[study_subjects_df['subject_id'] == next_subj, ['pd', 'age', 'gender']].iloc[0]
    row_dict.update({'pd': subj_meta['pd'], 'age': subj_meta['age'], 'gender': subj_meta['gender']})

    subjects_features.append(row_dict)

# Create selected features dataframe
features_selection_df = pd.DataFrame(subjects_features)
print('Features Selection')
print(features_selection_df.shape)
display(features_selection_df.head())

# TODO: Save features selection df .... for rerunning ... or use a saved scikit pipeline pipeline custom classifier?


# Run: 3. Data Prep

In [None]:
# Separate X features and y target
# TODO: Separate notebook and load selected features df?

target_col_name = 'pd'
all_columns = features_selection_df.columns
feature_names = all_columns[all_columns != target_col_name]
X = features_selection_df.loc[:, feature_names].copy()
y = features_selection_df.loc[:, target_col_name].copy()

# Data Split : Training & Test, 80:20. NB cross-validation will be performed using Training
# NOTE(review): the split is not stratified on y.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# TODO: Different split?
# split by *subjects*, not by raw rows
# train_subj, test_subj = train_test_split(subjects, stratify=labels,
#                                          test_size=.3, random_state=42)
# X_train = eeg_long[eeg_long.subject_id.isin(train_subj)]
# X_test  = eeg_long[eeg_long.subject_id.isin(test_subj)]
# y_train = labels.loc[train_subj].values
# y_test  = labels.loc[test_subj].values

In [None]:
# Features Cleaning - Pipeline Setup
#

# TODO: Compare no scaling at all and model performance

# Features to drop
features_to_drop = ['subject_id']


def drop_features(df, features):
    """Return `df` without the columns named in `features`."""
    return df.drop(columns=features)


dropper = FunctionTransformer(func=drop_features, kw_args={'features': features_to_drop})

# Numeric columns: fill missing values with 0, then scale
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('scale_num', RobustScaler()),  # TODO: vs standardscaler?
])

# Categorical columns: one-hot encode, dropping the first level of each
cat_pipeline = Pipeline(steps=[
    ('encode_cat', OneHotEncoder(drop='first', handle_unknown='infrequent_if_exist')),
])

# Columns are routed by dtype when the transformer is fitted
numeric_selector = make_column_selector(dtype_include=['float64', 'int64'])
categorical_selector = make_column_selector(dtype_include='object')
cols_transform = ColumnTransformer(transformers=[
    ('numeric', num_pipeline, numeric_selector),
    ('categorical', cat_pipeline, categorical_selector),
])

# Establish and train/fit the overall preparation pipeline
# TODO: Feature Selection custom class?
features_prep_pipeline = Pipeline(steps=[
    ('drop_columns', dropper),
    ('data_preprocess', cols_transform),
])
features_prep_pipeline.fit(X_train)

# TODO: Save pipeline for later reuse alongside the trained ML model


In [None]:
# Apply the fitted preparation pipeline to the training and the test features
X_train_transformed, X_test_transformed = (
    features_prep_pipeline.transform(X_train),
    features_prep_pipeline.transform(X_test),
)

In [None]:
# Checkpoint - Transformation Results

# TODO: Examine the impact of scaling etc more closely eg some boxplots before and after .... how well has eg scaling worked?

# Before & After shapes
print("Features Extraction / Transformed Data")
print(f'- Original Features Selection: {features_selection_df.shape}')
print(f'- Original X_train: {X_train.shape} and y_train: {y_train.shape}')
print(f'- Original X_test: {X_test.shape} and y_test: {y_test.shape}')
print(f'- Transformed X_train: {X_train_transformed.shape}')
print(f'- Transformed X_test: {X_test_transformed.shape}')
transformed_names = cols_transform.get_feature_names_out()
print(f'- Num feature names from pipeline: {len(transformed_names)}')

# Transformed training data with the pipeline's output feature names as columns
print("\nTransformed")
temp_names_X_train_df = pd.DataFrame(X_train_transformed, columns=cols_transform.get_feature_names_out())
temp_names_X_train_df = temp_names_X_train_df.reset_index(drop=True)
display(temp_names_X_train_df.head())

# Check the pipeline structure: number of columns handled by each fitted transformer
print("Pipeline transformers:")
for step_name, _fitted, step_columns in cols_transform.transformers_:
    if hasattr(step_columns, '__len__'):
        column_total = len(step_columns)
    else:
        column_total = 'Unknown'
    print(f"- {step_name}: {column_total} columns")


In [None]:
# Checkpoint - Pipeline

# Top-level steps of the preparation pipeline
print("Feature Extraction Pipeline Steps:")
for step_name in features_prep_pipeline.named_steps:
    print(f"- {step_name}: {features_prep_pipeline.named_steps[step_name]}")

# Fitted sub-transformers of the column transformer step and their columns
print("\nColumnTransformer Details:")
ct = features_prep_pipeline.named_steps['data_preprocess']
for part_name, part, part_columns in ct.transformers_:
    print(f"- Transformer: {part_name}")
    print(f"    Columns: {part_columns}")
    print(f"    Transformer object: {part}\n")

# Every parameter of the pipeline, nested ones included
print("\nAll Pipeline Parameters")
all_params = features_prep_pipeline.get_params()
for param_name in all_params:
    print(f"- {param_name}: {all_params[param_name]}")

In [None]:
# # Check for problematic data
# print("Numerical data info:")
# print(X_train[numerical_cols].describe())
# print("\nAny infinite values?", np.isinf(X_train[numerical_cols]).any().any())
# print("Any NaN values?", X_train[numerical_cols].isnull().any().any())

# # Check if all numerical columns have the same values (zero variance)
# from sklearn.feature_selection import VarianceThreshold
# var_selector = VarianceThreshold(threshold=0)
# var_selector.fit(X_train[numerical_cols])
# print("Columns with zero variance:", 
#       [col for col, keep in zip(numerical_cols, var_selector.get_support()) if not keep])

# 4. Model Training

In [None]:
# Function to Display The Model Fit Results

def print_search_results(search, duration):
    """Print a summary of a fitted hyper-parameter search.

    Reports the best score, the mean and standard deviation of the mean test
    scores over all combinations, the search duration and the best
    parameters, then displays the ten best-ranked combinations.

    Args:
        search: Fitted search object exposing `cv_results_`, `best_score_`
            and `best_params_`.
        duration: Search time in seconds.
    """
    print('------- Search Results --------')
    all_search_results = pd.DataFrame(search.cv_results_)
    score_mean = np.mean(all_search_results['mean_test_score'])
    score_std = np.std(all_search_results['mean_test_score'])
    print(f"Score: {search.best_score_:.4f}. Mean: {score_mean:.4f} and STD {score_std:.4f}")
    print(f'Search Took: {duration:.2f} seconds')
    print(f"Best Parameters: {search.best_params_}")

    top_n = 10
    shown_columns = ['rank_test_score', 'mean_test_score', 'mean_fit_time', 'mean_score_time', 'params']
    print(f"Top {top_n} out of {len(all_search_results)} combinations:")
    ranked = all_search_results[shown_columns].sort_values(by='rank_test_score')
    display(ranked.head(top_n))


In [None]:
# Function to Present the Evaluation Metrics for a Classification Model

def classification_metrics(for_Model, X_test, y_test, y_pred):
    """Print and plot evaluation metrics for a fitted binary classifier.

    Prints accuracy, precision, recall, F1, specificity, Hamming loss,
    ROC-AUC and the Gini index, and shows the confusion matrix and ROC curve.
    The 'default' matplotlib style is used for these plots and 'ggplot' is
    re-applied afterwards, even if a metric raises.

    Args:
        for_Model: Fitted classifier exposing `classes_` and `predict_proba`.
        X_test: Test features in the form the model expects.
        y_test: True test labels; the positive class is the label 1.
        y_pred: Labels predicted by the model for `X_test`.
    """
    plt.style.use('default')
    try:
        # Calculate Confusion Matrix. Pinning the labels to the model's
        # classes keeps the matrix 2x2 for a binary model even when one class
        # is absent from y_test and y_pred, so the unpacking below cannot
        # fail for that reason.
        class_labels = for_Model.classes_
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        tn, fp, _fn, _tp = cm.ravel()

        # Print various metrics
        print(f'Accuracy: {metrics.accuracy_score(y_true=y_test, y_pred=y_pred):.4f}')
        print(f'Precision: {metrics.precision_score(y_true=y_test, y_pred=y_pred, pos_label=1):.4f}')
        print(f'Recall: {metrics.recall_score(y_true=y_test, y_pred=y_pred, pos_label=1):.4f}')
        print(f'F1 Score {metrics.f1_score(y_true=y_test, y_pred=y_pred, pos_label=1):.4f}')
        print(f'Specificity: {tn / (tn + fp):.4f}')
        print(f'Hamming Loss {metrics.hamming_loss(y_true=y_test, y_pred=y_pred):.4f}')

        # Plot Confusion Matrix
        fig, ax = plt.subplots(figsize=(12,4))
        ax.set_title('Confusion Matrix')
        ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels).plot(ax=ax)
        # Previously `plt.show` without parentheses, which never ran
        plt.show()

        # Probability of the second class in `classes_` is used as the score
        y_probabilities = for_Model.predict_proba(X_test)[:, 1]
        roc_auc_score = metrics.roc_auc_score(y_true=y_test, y_score=y_probabilities)
        print(f'ROC-AUC Score {roc_auc_score:.4f}')
        gini_score = 2 * roc_auc_score - 1
        print(f'Gini Index: {gini_score:.4f}')

        # Plot the ROC curve
        fig, ax = plt.subplots(figsize=(6,4))
        ax.set_title('ROC Curve')
        RocCurveDisplay.from_estimator(for_Model, X_test, y_test, ax=ax, pos_label=1)
        plt.show()
    finally:
        # Restore the notebook-wide style whatever happened above
        plt.style.use('ggplot')

In [None]:
# Establish a Model Pipeline - Using Processed Data
#

# Single-step pipeline wrapping the classifier, so grid keys use the 'classifier__' prefix
forest_classifier = RandomForestClassifier(random_state=42, n_jobs=-1, verbose=False)
model_pipeline = Pipeline(steps=[('classifier', forest_classifier)])

# Hyper-parameter grid; every combination is evaluated
grid_params = {
    # Default gini. Tree split evaluation function
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
    # Default 100. Number of trees
    'classifier__n_estimators': [150, 175],
    # Default none, unlimited
    'classifier__max_depth': [2, 10, None],
    # Default none, unlimited
    'classifier__max_leaf_nodes': [50, None],
    # Not searched yet:
    # 'randomforestclassifier__min_samples_split': [2, 5],
    # 'classifier__class_weight': [None, 'balanced']
    #     Balanced gives more importance to minority classes ... ?? Improves recall at the expense of precision
}

# 5-fold cross-validated grid search scored on F1
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=grid_params,
    cv=5,
    scoring='f1',
)


In [None]:
# Grid search run
# Fits the grid search on the transformed training data and records the
# wall-clock time of the fit in `duration` (seconds) for the results summary.

start_time = time.perf_counter()
grid_search.fit(X_train_transformed, y_train)
duration = time.perf_counter() - start_time



In [None]:
# Checkpoint - ML Pipeline

# print("Feature Extraction Pipeline Steps:")
# for name, step in model_pipeline.named_steps.items():
#     print(f"- {name}: {step}")

# Every parameter of the model pipeline, nested ones included
print("\nAll Pipeline Parameters")
model_params = model_pipeline.get_params()
for param_name in model_params:
    print(f"- {param_name}: {model_params[param_name]}")

In [None]:
# Grid search results
print_search_results(search=grid_search, duration=duration)

# Take the best model from the search, predict on the held-out test set and evaluate
model_randforest = grid_search.best_estimator_
y_pred = model_randforest.predict(X_test_transformed)
classification_metrics(
    for_Model=model_randforest,
    X_test=X_test_transformed,
    y_test=y_test,
    y_pred=y_pred,
)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")


In [None]:
# Get feature importances from the classifier step of the best model
forest = model_randforest.named_steps['classifier']
importances = forest.feature_importances_

# Pair each importance with its transformed feature name and keep the 25 largest
transformed_feature_names = cols_transform.get_feature_names_out()
importance_df = (
    pd.DataFrame({'Feature': transformed_feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(25)
)

# print(importance_df)

# Horizontal bar chart with the most important feature at the top
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances (Sorted)')
ax.invert_yaxis()
plt.show()