# ML Applicaiton: Applying ML Models to the transition data

### Tasks included:
- Reading transition matrices 
- Reading P/AUC scores 
- Apply ML models 
- Select features and apply ML models 

## Imports and system info

In [1]:
import HumachLab_Global 
HumachLab_Global.get_system_info()


List of OS platforms and codes
___________________________________________
0 Darwin
1 Windows
2 Linux
===> "1 - Windows" OS is detected.

Processor (CPU) details: 
___________________________________________
{'python_version': '3.7.10.final.0 (64 bit)', 'cpuinfo_version': [8, 0, 0], 'cpuinfo_version_string': '8.0.0', 'arch': 'X86_64', 'bits': 64, 'count': 24, 'arch_string_raw': 'AMD64', 'vendor_id_raw': 'GenuineIntel', 'brand_raw': 'Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz', 'hz_advertised_friendly': '2.6000 GHz', 'hz_actual_friendly': '2.5940 GHz', 'hz_advertised': [2600000000, 0], 'hz_actual': [2594000000, 0], 'model': 58, 'family': 6, 'flags': ['3dnow', 'aes', 'apic', 'avx', 'clflush', 'cmov', 'cx16', 'cx8', 'de', 'dts', 'erms', 'f16c', 'fpu', 'fxsr', 'ht', 'hypervisor', 'ia64', 'lahf_lm', 'mca', 'mce', 'mmx', 'msr', 'mtrr', 'osxsave', 'pae', 'pat', 'pcid', 'pclmulqdq', 'pge', 'pni', 'popcnt', 'pse', 'pse36', 'rdrnd', 'sep', 'serial', 'smep', 'ss', 'sse', 'sse2', 'sse4_1', 'sse4_2

(1,
 'Windows',
 {'brand_raw': 'Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz',
  'arch_string_raw': 'AMD64',
  'arch': 'X86_64',
  'count': 24,
  'python_version': '3.7.10.final.0 (64 bit)',
  'CPU_usage': 8.1,
  'RAM_usage': 22.3,
  'Total_RAM': 256.0,
  'Used_RAM': 57.1,
  'Available_RAM': 198.9},
 [],
 None)

In [2]:
'''
Importing necessary modules
'''

import os
import sys
import datetime

print(os.getcwd())
os.chdir(os.getcwd())
print(f"{os.getcwd()}\HumachLab")
sys.path.append(f"{os.getcwd()}\HumachLab")
sys.path.insert(0, os.path.abspath('./HumachLab'))

import itertools as it
import re
import pickle
import json

import copy
from pprint import pprint

import glob

import math
import numbers

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib import rc, rcParams
%matplotlib inline
import seaborn as sns

import HumachLab_Global
from HumachLab import * 
# from HumachLab.HumachLab_Global import *
# import HumachLab_Global
HumachLab_Global.get_system_info()

import mne

# plt.rcParams["figure.figsize"] = plt.rcParamsDefault["figure.figsize"]
plt.rcParams["figure.figsize"] = (20,6)

C:\Users\aliem\Desktop\aliem\My Research\HML_IHC_Sleep_Data_Analysis
C:\Users\aliem\Desktop\aliem\My Research\HML_IHC_Sleep_Data_Analysis\HumachLab

List of OS platforms and codes
___________________________________________
0 Darwin
1 Windows
2 Linux
===> "1 - Windows" OS is detected.

Processor (CPU) details: 
___________________________________________
{'python_version': '3.7.10.final.0 (64 bit)', 'cpuinfo_version': [8, 0, 0], 'cpuinfo_version_string': '8.0.0', 'arch': 'X86_64', 'bits': 64, 'count': 24, 'arch_string_raw': 'AMD64', 'vendor_id_raw': 'GenuineIntel', 'brand_raw': 'Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz', 'hz_advertised_friendly': '2.6000 GHz', 'hz_actual_friendly': '2.5940 GHz', 'hz_advertised': [2600000000, 0], 'hz_actual': [2594000000, 0], 'model': 58, 'family': 6, 'flags': ['3dnow', 'aes', 'apic', 'avx', 'clflush', 'cmov', 'cx16', 'cx8', 'de', 'dts', 'erms', 'f16c', 'fpu', 'fxsr', 'ht', 'hypervisor', 'ia64', 'lahf_lm', 'mca', 'mce', 'mmx', 'msr', 'mtrr', 'osxsave',

## Get directory list: Subject-wise

In [3]:
'''
Explore the contents/files in the directory
'''

def get_list_of_paths_from_a_directory(directory, path_type=None, containes=None, extension=None, exclude=None):
    '''
    directory: valid path string, path_type: p_file|p_dir, containes: string, extension: valid string file extension 
    '''
    os_path = os.path
    list_of_paths = []
        
    path_keywords = "*"
    if containes:
        path_keywords = f"{path_keywords}{containes}*"
    
    if extension:
        path_keywords = f"{path_keywords}.{extension}"
        
    complete_path = f"{directory}/{path_keywords}"
    print(f"============> {path_keywords}, {path_type}, {complete_path}")
    
    all_paths = glob.glob(complete_path) 
    all_temp_paths = None
    list_of_paths = None
    
    if path_type:
        if path_type=="p_file":
            all_temp_paths = [path.replace("\\", "/") for path in all_paths if (os_path.exists(path) and os_path.isfile(path))]
        if path_type=="p_dir":
            all_temp_paths = [path.replace("\\", "/") for path in all_paths if (os_path.exists(path) and os_path.isdir(path))]   
    else:
        all_temp_paths = [path.replace("\\", "/") for path in all_paths]
        
    if exclude:
        # print(all_temp_paths)
        # print(len(all_temp_paths), exclude)
        # list_of_paths = [path for path in all_temp_paths for ex in exclude if ex not in path]
        # list_of_paths = [path for ex in exclude for path in all_temp_paths if ex not in path]
        list_of_paths = [path for path in all_temp_paths if not any((ex in path) for ex in exclude)]
        # list_of_paths = [path for ex in exclude if any(ex not in path for path in all_temp_paths)]
        # any(substring in string for substring in substring_list)
        # print(len(list_of_paths))
    else:
        list_of_paths = all_temp_paths.copy()
    
    return list_of_paths

In [4]:
root_directory = "./Results/" 
data_directory = "/_Combined"
data_subdirectory = "Subject_One_Night"  ###"Subject_Combined_Record"  "Subject_Separate_Record"  "Subject_One_Night"  ## Change for new type of result
result_directory = "./Results/_Classification" 
metadata_subdirectory = ["CAP_Sleep", "Sleep_EDFX"] 
dataset_list = ["CAP_Sleep", "Sleep_EDFX"] 
tran_matrix_type = ["count", "dura", "proba"] 
annotation_type = ['annot', 'tran']
tran_level = 2
exclude_contents_in_dataset_directory = ["SHA256SUMS", "RECORDS"]
exclude_contents_in_result_directory = ["SHA256SUMS", "RECORDS", "all_annotations", "annot_sequence", "transition_sequence", "hypno", "DATASET_CHANGELOG"]
sleep_stage_labels = ['W', 'S1', 'S2', 'S3', 'S4', 'REM']
sleep_stage_labels_dict = {'W':0, 'S1':1, 'S2':2, 'S3':3, 'S4':4, 'REM':5}
sleep_stage_names_dict = {'W':0, 'S1':1, 'S2':2, 'S3':3, 'S4':4, 'R':5}
list_of_paths = None 

annot_type = annotation_type[0]

# directory = dataset_directory
# # list_of_paths = get_list_of_paths_from_a_directory(directory, path_type=None, containes=None, extension=None, exclude=None) 
# # list_of_paths = get_list_of_paths_from_a_directory(directory, path_type="p_file", containes=None, extension=None, exclude=None) 
# # list_of_paths = get_list_of_paths_from_a_directory(directory, path_type="p_dir", containes=None, extension=None, exclude=None) 
# # list_of_paths = get_list_of_paths_from_a_directory(directory, path_type=None, containes="nfle", extension=None, exclude=None) 
# # list_of_paths = get_list_of_paths_from_a_directory(directory, path_type=None, containes=None, extension="edf", exclude=None) 
# list_of_paths = get_list_of_paths_from_a_directory(directory, path_type=None, containes=None, extension=None, exclude=exclude_contents_in_dataset_directory) 
# # list_of_paths = get_list_of_paths_from_a_directory(directory, path_type=None, containes=None, extension=None, exclude=None) 
# # pprint(list_of_paths)
# list_of_paths

### Get basic information

In [5]:
# # annot_type = annotation_type[0]

# all_demography_df = pd.DataFrame() 
# all_demography_detail_df = pd.DataFrame() 
# for dirr in metadata_subdirectory: 
#     demography_df = pd.read_csv(f"{data_directory}{dirr}/Demography.csv", index_col=False)
#     demography_df.insert(1, 'Dataset', [dirr]*demography_df.shape[0])
#     all_demography_df = pd.concat([all_demography_df, demography_df]) 
#     all_demography_df.reset_index(drop=True, inplace=True) 

#     demography_detail_df = pd.read_csv(f"{data_directory}{dirr}/Demography_Details.csv", index_col=False)
#     demography_detail_df.insert(0, 'Dataset', [dirr]*demography_detail_df.shape[0])
#     sub_name = demography_detail_df['File_Name'].str[:-1].values.tolist() if i==1 else demography_detail_df['File_Name'].values.tolist() 
#     demography_detail_df.insert(2, 'Subject_Name', sub_name)
#     all_demography_detail_df = pd.concat([all_demography_detail_df, demography_detail_df]) 
#     all_demography_detail_df.reset_index(drop=True, inplace=True)
    
# # list_of_tran_mat_paths

def get_metadata_info(info_type, annot_type): 
    all_demography_df = pd.DataFrame() 
    all_demography_detail_df = pd.DataFrame() 
    for i, dirr in enumerate(metadata_subdirectory): 
        demography_df = pd.read_csv(f"{root_directory}/{dirr}/Demography.csv", index_col=False)
        demography_df.insert(1, 'Dataset', [dirr]*demography_df.shape[0])
        all_demography_df = pd.concat([all_demography_df, demography_df]) 
        all_demography_df.reset_index(drop=True, inplace=True) 

        demography_detail_df = pd.read_csv(f"{root_directory}/{dirr}/Demography_Details.csv", index_col=False)
        demography_detail_df.insert(0, 'Dataset', [dirr]*demography_detail_df.shape[0])
        sub_name = demography_detail_df['File_Name'].str[:-1].values.tolist() if i==1 else demography_detail_df['File_Name'].values.tolist() 
        demography_detail_df.insert(2, 'Subject_Name', sub_name)
        all_demography_detail_df = pd.concat([all_demography_detail_df, demography_detail_df]) 
        all_demography_detail_df.reset_index(drop=True, inplace=True) 

    return all_demography_df, all_demography_detail_df 


info_type="file" ##"sub"/"file"  ## Change for new type of result
annot_type = annotation_type[1]   ## Change for different data preparation for 'annot' and 'tran' 

all_demography_df, all_demography_detail_df = get_metadata_info(info_type=info_type, annot_type=annot_type) 

In [6]:
all_demography_df

Unnamed: 0,#,Dataset,Category_Name,Category,Total_Count,Male_Count,Female_Count,Total_AgeRange,Male_AgeRange,Female_AgeRange
0,1,CAP_Sleep,Bruxism,brux,2,2,0,23 - 34,23 - 34,0 - 0
1,2,CAP_Sleep,Sleep-Disordered Breathing,sdb,4,4,0,65 - 78,65 - 78,0 - 0
2,3,CAP_Sleep,Insomnia,ins,9,4,5,47 - 82,54 - 82,47 - 59
3,4,CAP_Sleep,Narcolepsy,narco,5,2,3,18 - 44,24 - 43,18 - 44
4,5,CAP_Sleep,Nocturnal Frontal Lobe Epilepsy,nfle,40,21,19,14 - 67,14 - 44,16 - 67
5,6,CAP_Sleep,Periodic Leg Movements,plm,10,7,3,40 - 62,40 - 62,50 - 52
6,7,CAP_Sleep,REM Behavior Disorder,rbd,22,19,3,58 - 82,58 - 82,73 - 76
7,8,CAP_Sleep,No Pathology (Controls),n,16,7,9,23 - 42,23 - 34,24 - 42
8,10,CAP_Sleep,Total,,108,66,42,14 - 82,14 - 82,16 - 76
9,11,CAP_Sleep,Sleep Disorders,dis,92,59,33,14 - 82,14 - 82,16 - 76


In [7]:
all_demography_detail_df

Unnamed: 0,Dataset,File_Name,Subject_Name,Category,Subject_ID,Gender,Age
0,CAP_Sleep,brux1,brux1,brux,1,M,34
1,CAP_Sleep,brux2,brux2,brux,2,M,23
2,CAP_Sleep,sdb1,sdb1,sdb,1,M,65
3,CAP_Sleep,sdb2,sdb2,sdb,2,M,77
4,CAP_Sleep,sdb3,sdb3,sdb,3,M,78
...,...,...,...,...,...,...,...
300,Sleep_EDFX,ST7212,ST721,n,21,M,34
301,Sleep_EDFX,ST7221,ST722,n,22,F,56
302,Sleep_EDFX,ST7222,ST722,n,22,F,56
303,Sleep_EDFX,ST7241,ST724,n,24,M,48


In [8]:
all_demography_detail_df['File_Name'].str[:-1]

0       brux
1       brux
2        sdb
3        sdb
4        sdb
       ...  
300    ST721
301    ST722
302    ST722
303    ST724
304    ST724
Name: File_Name, Length: 305, dtype: object

### Get transition matrix or features information from transition probabilities and P/AUC information

In [9]:
f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2.csv"

'./Results//_Combined/Subject_One_Night/Annot_Proba_Transition2.csv'

In [10]:
annot_proba_transition2_feature_df = pd.read_csv(f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2.csv", index_col=False)
tran_proba_transition2_feature_df = pd.read_csv(f"{root_directory}{data_directory}/{data_subdirectory}/Tran_Proba_Transition2.csv", index_col=False)

annot_proba_transition2_feature_df

Unnamed: 0,Dataset,Category,Subject_Name,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,S1->W,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,0.800000,0.190476,0.009524,0.0,0.0,0.000000,0.030769,...,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.0,0.966480
1,CAP_Sleep,brux,brux2,0.825397,0.174603,0.000000,0.0,0.0,0.000000,0.061728,...,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,0.876712,0.123288,0.000000,0.0,0.0,0.000000,0.099237,...,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,0.948718,0.044872,0.006410,0.0,0.0,0.000000,0.011364,...,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,0.934272,0.046948,0.018779,0.0,0.0,0.000000,0.108696,...,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0.742857,0.242857,0.014286,0.0,0.0,0.000000,0.044118,...,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0.333333,0.555556,0.055556,0.0,0.0,0.055556,0.014493,...,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0.894737,0.099415,0.000000,0.0,0.0,0.005848,0.116667,...,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0.820000,0.180000,0.000000,0.0,0.0,0.000000,0.052133,...,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.0,0.935065


In [11]:
tran_proba_transition2_feature_df

Unnamed: 0,Dataset,Category,Subject_Name,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,S1->W,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,0.0,0.952381,0.047619,0.0,0.0,0.000000,0.160000,...,0.166667,0.833333,0.0,0.0,1.000000,0.000000,0.000000,0.0,0.0,0.0
1,CAP_Sleep,brux,brux2,0.0,1.000000,0.000000,0.0,0.0,0.000000,0.416667,...,0.000000,0.800000,0.0,0.0,0.800000,0.200000,0.000000,0.0,0.0,0.0
2,CAP_Sleep,sdb,sdb1,0.0,1.000000,0.000000,0.0,0.0,0.000000,0.419355,...,0.100000,0.600000,0.0,0.0,0.250000,0.000000,0.750000,0.0,0.0,0.0
3,CAP_Sleep,sdb,sdb2,0.0,0.857143,0.142857,0.0,0.0,0.000000,0.153846,...,0.250000,0.500000,0.0,0.0,0.000000,0.000000,1.000000,0.0,0.0,0.0
4,CAP_Sleep,sdb,sdb3,0.0,0.692308,0.307692,0.0,0.0,0.000000,0.500000,...,0.400000,0.600000,0.0,0.0,1.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0.0,0.941176,0.058824,0.0,0.0,0.000000,0.100000,...,0.071429,0.857143,0.0,0.0,0.000000,0.571429,0.428571,0.0,0.0,0.0
204,Sleep_EDFX,n,ST7201,0.0,0.833333,0.083333,0.0,0.0,0.083333,0.038462,...,0.058824,0.941176,0.0,0.0,0.666667,0.111111,0.222222,0.0,0.0,0.0
205,Sleep_EDFX,n,ST7211,0.0,0.941176,0.000000,0.0,0.0,0.058824,0.225806,...,0.062500,0.937500,0.0,0.0,0.571429,0.000000,0.428571,0.0,0.0,0.0
206,Sleep_EDFX,n,ST7221,0.0,1.000000,0.000000,0.0,0.0,0.000000,0.196429,...,0.000000,0.000000,0.0,0.0,0.333333,0.666667,0.000000,0.0,0.0,0.0


#### Prepare final dataset

In [111]:
class_name='Class'

dataset = annot_proba_transition2_feature_df.copy() 
dataset 

Unnamed: 0,Dataset,Category,Subject_Name,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,S1->W,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,0.800000,0.190476,0.009524,0.0,0.0,0.000000,0.030769,...,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.0,0.966480
1,CAP_Sleep,brux,brux2,0.825397,0.174603,0.000000,0.0,0.0,0.000000,0.061728,...,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,0.876712,0.123288,0.000000,0.0,0.0,0.000000,0.099237,...,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,0.948718,0.044872,0.006410,0.0,0.0,0.000000,0.011364,...,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,0.934272,0.046948,0.018779,0.0,0.0,0.000000,0.108696,...,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0.742857,0.242857,0.014286,0.0,0.0,0.000000,0.044118,...,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0.333333,0.555556,0.055556,0.0,0.0,0.055556,0.014493,...,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0.894737,0.099415,0.000000,0.0,0.0,0.005848,0.116667,...,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0.820000,0.180000,0.000000,0.0,0.0,0.000000,0.052133,...,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.0,0.935065


In [13]:
def map_age_category_to_class(dat_set, demo_det, source_cols=['Subject_Name', 'File_Name'], age_col = 'Age', age_ranges = [[19, 30], [31, 40]], class_name='Class', multi_class=True):
    tmp_df = dataset.merge(all_demography_detail_df, how='inner', left_on=[source_cols[0]], right_on=[source_cols[1]])
    tmp_df

    dat_set.insert(3, age_col, tmp_df[age_col].values) 
    dat_set

    dat_set = dat_set[ ((dat_set[age_col]>=age_ranges[0][0]) & (dat_set[age_col]<=age_ranges[0][1])) | ((dat_set[age_col]>=age_ranges[1][0]) & (dat_set[age_col]<=age_ranges[1][1])) ]  
    dat_set.reset_index(drop=True, inplace=True)

    tmp_age_range = [list(range(age_ranges[0][0], age_ranges[0][1]+1)), list(range(age_ranges[1][0], age_ranges[1][1]+1))] 
    tmp_age_range 

    cls_map = {} 
    for i, lst in enumerate(tmp_age_range): 
        for l in lst:
            cls_map[l]=i
    cls_map
    
    all_cols = dat_set.columns.values.tolist() 
    if (age_col in all_cols):
        if (class_name in all_cols): 
            dat_set = dat_set.drop([class_name], axis=1)
        dat_set = dat_set.rename(columns={age_col: class_name})
    
    all_cols2 = dat_set.columns.values.tolist() 
    if (class_name not in all_cols) and (class_name in all_cols2):
        dat_set.replace({class_name: cls_map}, inplace=True) 
    return cls_map, dat_set 

In [None]:
# label_map, dataset = map_age_category_to_class(dataset.copy(), all_demography_detail_df.copy(), source_cols=['Subject_Name', 'File_Name'], age_col = 'Age', age_ranges = [[19, 30], [31, 40]], class_name='Class', multi_class=True) ##19-101 
# print(label_map)
# dataset

In [112]:
def map_category_to_class(dat_set, source_col='Category', class_name='Class', removable_cats=None, multi_class=True): 
    if class_name in dat_set.columns.tolist():
        dat_set = dat_set.drop(columns=[class_name])
    dat_set.insert(3, class_name, dat_set[source_col].values) 
    dat_set

    cat_val = dat_set[source_col].unique().tolist() 
    cat_val.remove('n')
    cat_val.insert(0, 'n')
    print(cat_val) 
    
    if removable_cats:
        cat_val = [c for c in cat_val if c not in removable_cats]
        dat_set = dat_set[dat_set[source_col].isin(cat_val)]
        dat_set.reset_index(drop=True, inplace=True)
    print(cat_val) 
        
    cls_map = dict(zip(cat_val, list(range(len(cat_val))))) 
    cls_map
    
    if not multi_class:
        for k in cls_map.keys():
            if cls_map[k]>1:
                cls_map[k]=1

    dat_set.replace({class_name: cls_map}, inplace=True) 
    return cls_map, dat_set 
    
    
class_name = 'Class'
# label_map, dataset = map_category_to_class(dataset.copy(), source_col='Category', class_name=class_name, removable_cats=None, multi_class=True)
label_map, dataset = map_category_to_class(dataset.copy(), source_col='Category', class_name=class_name, removable_cats=None, multi_class=False)
# label_map, dataset = map_category_to_class(dataset.copy(), source_col='Category', class_name=class_name, removable_cats=['brux', 'sdb'], multi_class=True) 
print(label_map)
dataset

['n', 'brux', 'sdb', 'ins', 'narco', 'nfle', 'plm', 'rbd']
['n', 'brux', 'sdb', 'ins', 'narco', 'nfle', 'plm', 'rbd']
{'n': 0, 'brux': 1, 'sdb': 1, 'ins': 1, 'narco': 1, 'nfle': 1, 'plm': 1, 'rbd': 1}


Unnamed: 0,Dataset,Category,Subject_Name,Class,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,1,0.800000,0.190476,0.009524,0.0,0.0,0.000000,...,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.0,0.966480
1,CAP_Sleep,brux,brux2,1,0.825397,0.174603,0.000000,0.0,0.0,0.000000,...,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,1,0.876712,0.123288,0.000000,0.0,0.0,0.000000,...,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,1,0.948718,0.044872,0.006410,0.0,0.0,0.000000,...,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,1,0.934272,0.046948,0.018779,0.0,0.0,0.000000,...,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0,0.742857,0.242857,0.014286,0.0,0.0,0.000000,...,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0,0.333333,0.555556,0.055556,0.0,0.0,0.055556,...,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0,0.894737,0.099415,0.000000,0.0,0.0,0.005848,...,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0,0.820000,0.180000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.0,0.935065


In [113]:
dataset['Subject_Name'].unique().shape

(208,)

#### Accessing P/AUC for binary class for annotation probability of transition-2 - annot

In [16]:
f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_PAUC_bin.csv"

'./Results//_Combined/Subject_One_Night/Annot_Proba_Transition2_PAUC_bin.csv'

In [17]:
annot_proba_transition2_PAUC_bin_df = pd.read_csv(f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_PAUC_bin.csv", index_col=False)
annot_proba_transition2_PAUC_bin_df 

Unnamed: 0,Features,P_Value_bin,AUC_bin
0,W->W,0.006607903,0.761807
1,W->S1,0.08354123,0.286638
2,W->S2,7.039615e-07,0.73824
3,W->S3,0.03838355,0.506793
4,W->S4,0.2624921,0.505435
5,W->REM,0.091735,0.574869
6,S1->W,5.993076e-12,0.768132
7,S1->S1,3.565621e-09,0.729198
8,S1->S2,0.002594834,0.612303
9,S1->S3,0.3712097,0.510682


In [18]:
def get_selected_feature_list_based_on_PAUC(tmp_df, p_threshold=0.05, auc_threshold=0.5, sort=False): 
    cols = tmp_df['Features'].values.tolist() 
    tmp_df = tmp_df[(tmp_df['P_Value_bin']<p_threshold) & (tmp_df['AUC_bin']>=auc_threshold)]
    if sort:
        tmp_df = tmp_df.sort_values(['P_Value_bin', 'AUC_bin'], ascending = [True, False])
    selected_features = tmp_df['Features'].values.tolist() 
    return selected_features

In [19]:
sorted_PAUC_df = get_selected_feature_list_based_on_PAUC(annot_proba_transition2_PAUC_bin_df.copy(), p_threshold=0.05, auc_threshold=0.5) 
sorted_PAUC_df 

['W->W',
 'W->S2',
 'W->S3',
 'S1->W',
 'S1->S1',
 'S1->S2',
 'S1->REM',
 'S2->S1',
 'S2->S2',
 'S2->S3',
 'S2->REM',
 'S3->S2',
 'S3->S3',
 'S3->S4',
 'S4->S3',
 'S4->S4',
 'REM->W',
 'REM->S1',
 'REM->REM']

In [20]:
sorted_PAUC_df = get_selected_feature_list_based_on_PAUC(annot_proba_transition2_PAUC_bin_df.copy(), p_threshold=0.05, auc_threshold=0.7) 
sorted_PAUC_df 

['W->W',
 'W->S2',
 'S1->W',
 'S1->S1',
 'S1->REM',
 'S2->S1',
 'S2->S2',
 'S2->S3',
 'S3->S2',
 'S3->S3',
 'S4->S4']

#### Statistical analysis with Wilcoxon rank-sum test and Mann-Whitney U test

In [22]:
from scipy import stats

In [114]:
class_name = 'Class'

dataset

Unnamed: 0,Dataset,Category,Subject_Name,Class,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,1,0.800000,0.190476,0.009524,0.0,0.0,0.000000,...,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.0,0.966480
1,CAP_Sleep,brux,brux2,1,0.825397,0.174603,0.000000,0.0,0.0,0.000000,...,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,1,0.876712,0.123288,0.000000,0.0,0.0,0.000000,...,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,1,0.948718,0.044872,0.006410,0.0,0.0,0.000000,...,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,1,0.934272,0.046948,0.018779,0.0,0.0,0.000000,...,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0,0.742857,0.242857,0.014286,0.0,0.0,0.000000,...,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0,0.333333,0.555556,0.055556,0.0,0.0,0.055556,...,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0,0.894737,0.099415,0.000000,0.0,0.0,0.005848,...,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0,0.820000,0.180000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.0,0.935065


In [115]:
# results = stats.ranksums(x, y)
# results = stats.mannwhitneyu(x, y, use_continuity=True, alternative=None)

def get_statistical_significant_using_wilcoxon_and_mannwhitney_u_test(tmp_df, class_name): 
    feats = tmp_df.columns.values.tolist()[4:] 

    stat_significace_df = pd.DataFrame(columns=['Features', 'Wilcoxon_zscore', 'Wilcoxon_pvalue', 'MannWhitney_statistic', 'MannWhitney_pvalue']) 

    for f in feats:
        fd_0 = tmp_df[ (tmp_df[class_name]==0) ][f].values
        fd_1 = tmp_df[ (tmp_df[class_name]==1) ][f].values
    #     print("==>", f, fd_0.shape, fd_1.shape)
    #     print("==>", f, fd_0.shape, fd_1.shape, np.mean(fd_0), np.mean(fd_1))
        stat_value, p_value = stats.ranksums(fd_0, fd_1)
    #     stat_value2, p_value2 = stats.mannwhitneyu(fd_0, fd_1, use_continuity=True, alternative=None) 
    #     stat_value2, p_value2 = 0.0, 1.0 if ( (np.mean(fd_0)==0) and (np.mean(fd_0)==np.mean(fd_1))==True) else stats.mannwhitneyu(fd_0, fd_1, use_continuity=True, alternative=None) 
        stat_value2, p_value2 = 0.0, 1.0 
        if not ( (np.mean(fd_0)==0) and (np.mean(fd_0)==np.mean(fd_1))==True):
            res = stats.mannwhitneyu(fd_0, fd_1, use_continuity=True, alternative=None) 
            stat_value2, p_value2 = res.statistic, res.pvalue 
    #     print(f, fd_0.shape, fd_1.shape, stat_value, p_value, stat_value2, p_value2) 
    #     print(f, fd_0.shape, fd_1.shape, stat_value, p_value, stat_value2, p_value2, p_value<0.05, p_value2<0.05, ((p_value<0.05)==(p_value2<0.05)) ) 
    #     print(f, p_value2<0.05, ((p_value<0.05)==(p_value2<0.05)) ) 
        new_row = {'Features':f, 'Wilcoxon_zscore':stat_value, 'Wilcoxon_pvalue':p_value, 'MannWhitney_statistic':stat_value2, 'MannWhitney_pvalue':p_value2}
        stat_significace_df = stat_significace_df.append(new_row, ignore_index=True)
    #     tdf = pd.DataFrame(new_row) 
    #     stat_significace_df = pd.concat([stat_significace_df, tdf]) 

    return stat_significace_df


stat_significace_df = get_statistical_significant_using_wilcoxon_and_mannwhitney_u_test(dataset, class_name)
stat_significace_df

Unnamed: 0,Features,Wilcoxon_zscore,Wilcoxon_pvalue,MannWhitney_statistic,MannWhitney_pvalue
0,W->W,6.480686,9.130657e-11,2542.0,4.599963e-11
1,W->S1,-5.281504,1.281278e-07,3059.0,6.446579e-08
2,W->S2,-5.897331,3.694274e-09,2793.5,2.485725e-10
3,W->S3,-0.168164,0.8664544,5263.5,0.3261662
4,W->S4,-0.134531,0.8929827,5278.0,0.1328103
5,W->REM,1.853281,0.06384205,4537.0,0.003832078
6,S1->W,6.637252,3.195845e-11,2474.5,1.49354e-11
7,S1->S1,-5.6735,1.399095e-08,2890.0,7.037414e-09
8,S1->S2,2.779922,0.005437196,4137.5,0.002727311
9,S1->S3,0.264423,0.7914539,5222.0,0.1997063


In [116]:
# stat_significace_df.to_csv(f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_Wilcoxon_MannWitney_UTest_bin.csv", index=False) 
# result_save_path, f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_Wilcoxon_MannWitney_UTest_bin.csv"
f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_Wilcoxon_MannWitney_UTest_bin.csv"

'./Results//_Combined/Subject_One_Night/Annot_Proba_Transition2_Wilcoxon_MannWitney_UTest_bin.csv'

In [117]:
stat_significace_df = pd.read_csv(f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_Wilcoxon_MannWitney_UTest_bin.csv", index_col=False)
stat_significace_df 

Unnamed: 0,Features,Wilcoxon_zscore,Wilcoxon_pvalue,MannWhitney_statistic,MannWhitney_pvalue
0,W->W,6.480686,9.130657e-11,2542.0,4.599963e-11
1,W->S1,-5.281504,1.281278e-07,3059.0,6.446579e-08
2,W->S2,-5.897331,3.694274e-09,2793.5,2.485725e-10
3,W->S3,-0.168164,0.8664544,5263.5,0.3261662
4,W->S4,-0.134531,0.8929827,5278.0,0.1328103
5,W->REM,1.853281,0.06384205,4537.0,0.003832078
6,S1->W,6.637252,3.195845e-11,2474.5,1.49354e-11
7,S1->S1,-5.6735,1.399095e-08,2890.0,7.037414e-09
8,S1->S2,2.779922,0.005437196,4137.5,0.002727311
9,S1->S3,0.264423,0.7914539,5222.0,0.1997063


In [118]:
wil_p = stat_significace_df[ (stat_significace_df['Wilcoxon_pvalue']<0.05) ]
wil_p

Unnamed: 0,Features,Wilcoxon_zscore,Wilcoxon_pvalue,MannWhitney_statistic,MannWhitney_pvalue
0,W->W,6.480686,9.130657e-11,2542.0,4.599963e-11
1,W->S1,-5.281504,1.281278e-07,3059.0,6.446579e-08
2,W->S2,-5.897331,3.694274e-09,2793.5,2.485725e-10
6,S1->W,6.637252,3.195845e-11,2474.5,1.49354e-11
7,S1->S1,-5.6735,1.399095e-08,2890.0,7.037414e-09
8,S1->S2,2.779922,0.005437196,4137.5,0.002727311
11,S1->REM,5.15857,2.488427e-07,3112.0,1.758797e-08
12,S2->W,4.04521,5.227628e-05,3592.0,2.62541e-05
13,S2->S1,7.460675,8.608028e-14,2119.5,3.876329e-14
14,S2->S2,-8.307293,9.791000000000001e-17,1754.5,4.943446000000001e-17


In [119]:
mann_p = stat_significace_df[ (stat_significace_df['MannWhitney_pvalue']<0.05) ]
mann_p

Unnamed: 0,Features,Wilcoxon_zscore,Wilcoxon_pvalue,MannWhitney_statistic,MannWhitney_pvalue
0,W->W,6.480686,9.130657e-11,2542.0,4.599963e-11
1,W->S1,-5.281504,1.281278e-07,3059.0,6.446579e-08
2,W->S2,-5.897331,3.694274e-09,2793.5,2.485725e-10
5,W->REM,1.853281,0.06384205,4537.0,0.003832078
6,S1->W,6.637252,3.195845e-11,2474.5,1.49354e-11
7,S1->S1,-5.6735,1.399095e-08,2890.0,7.037414e-09
8,S1->S2,2.779922,0.005437196,4137.5,0.002727311
11,S1->REM,5.15857,2.488427e-07,3112.0,1.758797e-08
12,S2->W,4.04521,5.227628e-05,3592.0,2.62541e-05
13,S2->S1,7.460675,8.608028e-14,2119.5,3.876329e-14


In [120]:
wil_p = wil_p.sort_values('Wilcoxon_pvalue', ascending=True)
wil_p

Unnamed: 0,Features,Wilcoxon_zscore,Wilcoxon_pvalue,MannWhitney_statistic,MannWhitney_pvalue
21,S3->S3,-10.540972,5.591765e-26,791.5,2.743893e-26
28,S4->S4,-9.121438,7.413415e-20,1403.5,1.7708149999999998e-20
20,S3->S2,8.492853,2.0162520000000003e-17,1674.5,1.011512e-17
14,S2->S2,-8.307293,9.791000000000001e-17,1754.5,4.943446000000001e-17
13,S2->S1,7.460675,8.608028e-14,2119.5,3.876329e-14
6,S1->W,6.637252,3.195845e-11,2474.5,1.49354e-11
0,W->W,6.480686,9.130657e-11,2542.0,4.599963e-11
2,W->S2,-5.897331,3.694274e-09,2793.5,2.485725e-10
7,S1->S1,-5.6735,1.399095e-08,2890.0,7.037414e-09
1,W->S1,-5.281504,1.281278e-07,3059.0,6.446579e-08


In [121]:
mann_p = mann_p.sort_values('MannWhitney_pvalue', ascending=True)
mann_p

Unnamed: 0,Features,Wilcoxon_zscore,Wilcoxon_pvalue,MannWhitney_statistic,MannWhitney_pvalue
21,S3->S3,-10.540972,5.591765e-26,791.5,2.743893e-26
28,S4->S4,-9.121438,7.413415e-20,1403.5,1.7708149999999998e-20
20,S3->S2,8.492853,2.0162520000000003e-17,1674.5,1.011512e-17
14,S2->S2,-8.307293,9.791000000000001e-17,1754.5,4.943446000000001e-17
13,S2->S1,7.460675,8.608028e-14,2119.5,3.876329e-14
6,S1->W,6.637252,3.195845e-11,2474.5,1.49354e-11
0,W->W,6.480686,9.130657e-11,2542.0,4.599963e-11
2,W->S2,-5.897331,3.694274e-09,2793.5,2.485725e-10
7,S1->S1,-5.6735,1.399095e-08,2890.0,7.037414e-09
11,S1->REM,5.15857,2.488427e-07,3112.0,1.758797e-08


In [122]:
wil_p.shape, mann_p.shape

((20, 5), (23, 5))

In [123]:
print(wil_p.Features.values.tolist()) 

['S3->S3', 'S4->S4', 'S3->S2', 'S2->S2', 'S2->S1', 'S1->W', 'W->W', 'W->S2', 'S1->S1', 'W->S1', 'S1->REM', 'S2->S3', 'S2->REM', 'REM->S1', 'REM->REM', 'S2->W', 'S4->W', 'S4->S2', 'S1->S2', 'REM->W']


In [124]:
print(mann_p.Features.values.tolist()) 

['S3->S3', 'S4->S4', 'S3->S2', 'S2->S2', 'S2->S1', 'S1->W', 'W->W', 'W->S2', 'S1->S1', 'S1->REM', 'W->S1', 'S2->S3', 'REM->S1', 'S2->REM', 'REM->REM', 'S2->W', 'S4->W', 'S4->S2', 'S1->S2', 'W->REM', 'REM->W', 'S3->S1', 'S2->S4']


#### Mean-SD of dataset

In [125]:
dataset

Unnamed: 0,Dataset,Category,Subject_Name,Class,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,1,0.800000,0.190476,0.009524,0.0,0.0,0.000000,...,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.0,0.966480
1,CAP_Sleep,brux,brux2,1,0.825397,0.174603,0.000000,0.0,0.0,0.000000,...,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,1,0.876712,0.123288,0.000000,0.0,0.0,0.000000,...,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,1,0.948718,0.044872,0.006410,0.0,0.0,0.000000,...,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,1,0.934272,0.046948,0.018779,0.0,0.0,0.000000,...,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0,0.742857,0.242857,0.014286,0.0,0.0,0.000000,...,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0,0.333333,0.555556,0.055556,0.0,0.0,0.055556,...,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0,0.894737,0.099415,0.000000,0.0,0.0,0.005848,...,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0,0.820000,0.180000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.0,0.935065


In [126]:
feat_names = dataset.columns.values[4:]
feat_names

array(['W->W', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W',
       'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W',
       'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W',
       'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W',
       'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W',
       'REM->S1', 'REM->S2', 'REM->S3', 'REM->S4', 'REM->REM'],
      dtype=object)

In [127]:
study1_feature_statistics_df = None
study1_feature_statistics_df = pd.DataFrame(columns=['Feature', 'Sum', 'Mean', 'STD', 'P-value']) 
for f in feat_names:
#     print(f, '=== ', round(dataset[f].sum(),2), '||', round(dataset[f].mean(),2), '+/-', round(dataset[f].std(),2), round(stat_significace_df[(stat_significace_df['Features']==f)]['Wilcoxon_pvalue'],6)) # stat_significace_df | Features	Wilcoxon_zscore	Wilcoxon_pvalue
    study1_feature_statistics_df.loc[(study1_feature_statistics_df.shape[0])] = [f, round(dataset[f].sum(),6), round(dataset[f].mean(),3), round(dataset[f].std(),3), round(stat_significace_df[(stat_significace_df['Features']==f)]['Wilcoxon_pvalue'].values[0],6)] 

study1_feature_statistics_df

Unnamed: 0,Feature,Sum,Mean,STD,P-value
0,W->W,192.343048,0.925,0.095,0.0
1,W->S1,12.936016,0.062,0.08,0.0
2,W->S2,2.315395,0.011,0.025,0.0
3,W->S3,0.09543,0.0,0.003,0.866454
4,W->S4,0.008065,0.0,0.001,0.892983
5,W->REM,0.302046,0.001,0.007,0.063842
6,S1->W,12.773324,0.061,0.05,0.0
7,S1->S1,151.035246,0.726,0.137,0.0
8,S1->S2,40.67152,0.196,0.11,0.005437
9,S1->S3,0.149284,0.001,0.006,0.791454


In [129]:
# study1_feature_statistics_df.to_csv(f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_feature_statistics_bin.csv", index=False) 
f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_feature_statistics_bin.csv"

'./Results//_Combined/Subject_One_Night/Annot_Proba_Transition2_feature_statistics_bin.csv'

In [130]:
study1_feature_statistics_df = pd.read_csv(f"{root_directory}{data_directory}/{data_subdirectory}/Annot_Proba_Transition2_feature_statistics_bin.csv", index_col=False)
study1_feature_statistics_df 

Unnamed: 0,Feature,Sum,Mean,STD,P-value
0,W->W,192.343048,0.925,0.095,0.0
1,W->S1,12.936016,0.062,0.08,0.0
2,W->S2,2.315395,0.011,0.025,0.0
3,W->S3,0.09543,0.0,0.003,0.866454
4,W->S4,0.008065,0.0,0.001,0.892983
5,W->REM,0.302046,0.001,0.007,0.063842
6,S1->W,12.773324,0.061,0.05,0.0
7,S1->S1,151.035246,0.726,0.137,0.0
8,S1->S2,40.67152,0.196,0.11,0.005437
9,S1->S3,0.149284,0.001,0.006,0.791454


In [131]:
dataset

Unnamed: 0,Dataset,Category,Subject_Name,Class,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,1,0.800000,0.190476,0.009524,0.0,0.0,0.000000,...,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.0,0.966480
1,CAP_Sleep,brux,brux2,1,0.825397,0.174603,0.000000,0.0,0.0,0.000000,...,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,1,0.876712,0.123288,0.000000,0.0,0.0,0.000000,...,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,1,0.948718,0.044872,0.006410,0.0,0.0,0.000000,...,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,1,0.934272,0.046948,0.018779,0.0,0.0,0.000000,...,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0,0.742857,0.242857,0.014286,0.0,0.0,0.000000,...,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0,0.333333,0.555556,0.055556,0.0,0.0,0.055556,...,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0,0.894737,0.099415,0.000000,0.0,0.0,0.005848,...,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0,0.820000,0.180000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.0,0.935065


In [76]:
def get_features_with_zero_values(tmp_df):
    feat_names = tmp_df.columns.values[4:] 
    feat_names
    zero_feats = [] 
    for f in feat_names:
        if tmp_df[f].values.sum() == 0:
            zero_feats.append(f)
    return zero_feats

In [77]:
zero_feats = get_features_with_zero_values(dataset)
zero_feats

['REM->S4']

# Classification 

In [None]:
# from HumachLab_ML_CLassifiers import * 
# import HumachLab_ML_CLassifiers 

In [78]:
def create_experiment_directory(path, exp_name): 
    exp_directory = f"{path}/{exp_name}/"

    if (not os.path.exists(exp_directory)):
        try:
            os.makedirs(exp_directory, exist_ok = True)
            print(f"Directory successfully created at path: {exp_directory}") 
        except OSError as error:
            print(f"Directory cannot be created at path: {exp_directory}") 
    else:
        print(f"Directory already exists at path: {exp_directory}") 

    return exp_directory

In [79]:
ML_Classifiers.SVC

<ML_Classifiers.SVC: 'support_vector_classifier'>

In [132]:
dataset

Unnamed: 0,Dataset,Category,Subject_Name,Class,W->W,W->S1,W->S2,W->S3,W->S4,W->REM,...,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->S4,REM->REM
0,CAP_Sleep,brux,brux1,1,0.800000,0.190476,0.009524,0.0,0.0,0.000000,...,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.0,0.966480
1,CAP_Sleep,brux,brux2,1,0.825397,0.174603,0.000000,0.0,0.0,0.000000,...,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,1,0.876712,0.123288,0.000000,0.0,0.0,0.000000,...,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,1,0.948718,0.044872,0.006410,0.0,0.0,0.000000,...,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,1,0.934272,0.046948,0.018779,0.0,0.0,0.000000,...,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0,0.742857,0.242857,0.014286,0.0,0.0,0.000000,...,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0,0.333333,0.555556,0.055556,0.0,0.0,0.055556,...,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0,0.894737,0.099415,0.000000,0.0,0.0,0.005848,...,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0,0.820000,0.180000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.0,0.935065


In [None]:
# sys.path

In [None]:
# !dir

In [81]:
__name__, os.getcwd()

('__main__',
 'C:\\Users\\aliem\\Desktop\\aliem\\My Research\\HML_IHC_Sleep_Data_Analysis')

In [82]:
def start_logger(result_save_path, exp_name): 
    util = Humachlab_Utility() 
    all_log_file_name = f'{result_save_path}/all_logs_{exp_name}.txt'
    # logger = util.get_logger(logger_name=__name__, log_file_name=all_log_file_name)
    logger = util.get_logger(logger_name="Sleep ML Model Analysis", log_file_name=all_log_file_name)
    util, all_log_file_name, logger 
    return util, logger

In [83]:
def stop_logger(logger): 
    handlers = logger.handlers[:]
    for handler in handlers:
        logger.removeHandler(handler)
        handler.close()

In [389]:
def modify_experiment_information_summarry(exp_dir, dict_dat=None):
    exp_sum_dir = f'{exp_dir}/Experiment_Information.csv'
    df = pd.DataFrame(columns=['exp_name', 'exp_description', 'datasets', 'feature_selection', 'special_consideration', 'classification_type'])
    if (os.path.exists(exp_sum_dir)):
        df = pd.read_csv(exp_sum_dir, index_col=False) 
    if dict_dat:
        if df.shape[0]>0:
            print(dict_dat['exp_name'], (df['exp_name'].values.tolist()))
            if dict_dat['exp_name'] in (df['exp_name'].values.tolist()):
                # nn = 1 
                # print(f'Cannot add this data. this experiment is already exited. Please recheck and do a new experiment')
                # assert nn < 0, f'Cannot add this data. this experiment is already exited. Please recheck and do a new experiment' 
                print(f'This experiment is already exited. So this data is removed...')
                df = df.loc[df["exp_name"] != dict_dat['exp_name']] 
                df.reset_index(drop=True, inplace=True) 
                
        df.loc[len(df)] = dict_dat
        df.sort_values(by=['exp_name'], ascending=[True], inplace=True)
        df.reset_index(drop=True, inplace=True) 
        df.to_csv(exp_sum_dir, index=False) 
        print(f'Data is successfully inserted...')
    return df

exp_sum_df = modify_experiment_information_summarry(result_directory) 
exp_sum_df 

# exp_sum_df = modify_experiment_information_summarry(result_directory, dict_dat=exp_detail) 
# exp_sum_df 

# exp_sum_df = modify_experiment_information_summarry(result_directory) 
# exp_sum_df 

Unnamed: 0,exp_name,exp_description,datasets,feature_selection,special_consideration,classification_type
0,ML001,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",No feature selection,No special consideration,Binary classification: wake vs sleep
1,ML002,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",No feature selection,W->W stage transition removed,Binary classification: wake vs sleep
2,ML003,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",AUC based selection with AUC>0.7,No special consideration,Binary classification: wake vs sleep
3,ML004,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",AUC based selection with AUC>0.7,W->W stage transition removed,Binary classification: wake vs sleep
4,ML005,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",No feature selection,No special consideration,Binary classification: wake vs sleep
5,ML006,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",No feature selection,W->W stage transition removed,Binary classification: wake vs sleep
6,ML007,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",AUC based >0.7,No special consideration,Binary classification: wake vs sleep
7,ML008,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",AUC based >0.7,W->W stage transition removed,Binary classification: wake vs sleep
8,ML009,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",No feature selection,nxx balanced over the folds,Binary classification: wake vs sleep
9,ML010,Wake vs sleep binary classification using slee...,"2 datasets- CAP_Sleep, Sleep_EDFX",No feature selection,nxx balanced over the folds | W->W stage trans...,Binary classification: wake vs sleep


In [88]:
# pd.read_csv?
# exp_detail
# exp_name

In [390]:
exp_name = 'ML1001' 
exp_detail = {'exp_name':exp_name, 'exp_description':'Wake vs all disorders multi-class classification using sleep transition matrix', 'datasets':'2 datasets- CAP_Sleep, Sleep_EDFX', 
              'feature_selection':'No feature selection', 'special_consideration':'Remove W->W and all zero transitions, ', 
              'classification_type':'Binary classification: Healthy vs disordered'}

# exp_detail = {'exp_name':exp_name, 'exp_description':'Wake vs all disorders multi-class classification using sleep transition matrix', 'datasets':'2 datasets- CAP_Sleep, Sleep_EDFX', 
#               'feature_selection':'No feature selection', 'special_consideration':'No special consideration', 
#               'classification_type':'Multi-class classification: Healthy vs 7 different disordered'}
# exp_detail = {'exp_name':exp_name, 'exp_description':'Wake vs sleep binary classification using sleep transition matrix', 'datasets':'2 datasets- CAP_Sleep, Sleep_EDFX', 
#               'feature_selection':'AAUC based training set only feature selection with AUC>=0.7', 'special_consideration':'W->W stage transition removed, imbalanced over the folds', 
#               'classification_type':'Binary classification: Healthy vs disordered'} 
# exp_detail = {'exp_name':exp_name, 'exp_description':'Wake vs sleep binary classification using sleep transition matrix', 'datasets':'2 datasets- CAP_Sleep, Sleep_EDFX', 
#               'feature_selection':'No feature selection', 'special_consideration':'No special consideration, nxx balanced over the folds', 'classification_type':'Binary classification: Healthy vs disordered'} 
# exp_detail = {'exp_name':exp_name, 'exp_description':'Wake vs sleep binary classification using sleep transition matrix', 'datasets':'2 datasets- CAP_Sleep, Sleep_EDFX', 
#               'feature_selection':'AUC based >0.7', 'special_consideration':'W->W stage transition removed, nxx balanced over the folds', 'classification_type':'Binary classification: Healthy vs disordered'} 
result_save_path = create_experiment_directory(result_directory, exp_name)
result_save_path 

Directory successfully created at path: ./Results/_Classification/ML1001/


'./Results/_Classification/ML1001/'

In [331]:
# logger.info("Hello")

In [391]:
random_state_value = 312
class_name = "Class" 
metadata_column = ["Dataset", "Category", "Subject_Name"] 
all_metadata_columns = metadata_column+[class_name]
# ### #Binary/Multi-class healthy vs disorders 
split_column = "Subject_Name"  #"Subject_Name" for binary or multi-class 
split_balance_pattern = [['n'], ['SC', 'ST']] # [['n'], ['SC', 'ST'], ['brux'], ['sdb'], ['ins'], ['narco'], ['nfle'], ['plm'], ['rbd']] #[['n'], ['SC', 'ST']]  for binary, [['n'], ['SC', 'ST'], ['brux'], ['sdb'], ['ins'], ['narco'], ['nfle'], ['plm'], ['rbd']] for multi-class 
# ### #Binary/Multi-class age-group detection  
# split_column = class_name
# split_balance_pattern = [[1]]
class_name, metadata_column, split_column, all_metadata_columns, result_save_path


('Class',
 ['Dataset', 'Category', 'Subject_Name'],
 'Subject_Name',
 ['Dataset', 'Category', 'Subject_Name', 'Class'],
 './Results/_Classification/ML1001/')

In [392]:
# processed_dataset = dataset.copy() if not random_state_value else dataset.copy().sample(frac=1, random_state=random_state_value).reset_index(drop=True) 
processed_dataset = dataset.copy()
zero_feats = get_features_with_zero_values(processed_dataset)
zero_feats
removable_feats = ['W->W']
removable_feats.extend(zero_feats)
removable_feats 
# processed_dataset = processed_dataset[all_metadata_columns+sorted_PAUC_df]
processed_dataset = processed_dataset.drop(removable_feats, axis=1)

# ### #Binary/Multi-class healthy vs disorders 
# # processed_dataset = processed_dataset[~processed_dataset['Category'].isin(['brux', 'sdb'])]### Brux and sdb is cancelled coz of low number to fit in 5 fold
# # class_map, processed_dataset = map_category_to_class(processed_dataset.copy(), source_col='Category', class_name='Class', removable_cats=None, multi_class=True)
# # label_map, processed_dataset = map_category_to_class(processed_dataset.copy(), source_col='Category', class_name='Class', removable_cats=['brux', 'sdb', 'narco', 'ins', 'plm'], multi_class=True) 
# label_map, processed_dataset = map_category_to_class(processed_dataset.copy(), source_col='Category', class_name='Class', removable_cats=['brux', 'sdb', 'narco', 'ins', 'plm'], multi_class=True) 
# print(label_map)
# # processed_dataset = processed_dataset[~processed_dataset['Subject_Name'].str.startswith('SC')] 
# processed_dataset = processed_dataset[~processed_dataset['Subject_Name'].str.startswith('ST')] 

# ### #Binary/Multi-class age-group detection  
# label_map, processed_dataset = map_age_category_to_class(processed_dataset.copy(), all_demography_detail_df.copy(), source_cols=['Subject_Name', 'File_Name'], age_col = 'Age', age_ranges = [[19, 30], [31, 40]], class_name='Class', multi_class=True) ##19-101 
# print(label_map)
# processed_dataset

processed_dataset 

Unnamed: 0,Dataset,Category,Subject_Name,Class,W->S1,W->S2,W->S3,W->S4,W->REM,S1->W,...,S4->S1,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->REM
0,CAP_Sleep,brux,brux1,1,0.190476,0.009524,0.0,0.0,0.000000,0.030769,...,0.000000,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.966480
1,CAP_Sleep,brux,brux2,1,0.174603,0.000000,0.0,0.0,0.000000,0.061728,...,0.000000,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,1,0.123288,0.000000,0.0,0.0,0.000000,0.099237,...,0.010152,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,1,0.044872,0.006410,0.0,0.0,0.000000,0.011364,...,0.000000,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,1,0.046948,0.018779,0.0,0.0,0.000000,0.108696,...,0.000000,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0,0.242857,0.014286,0.0,0.0,0.000000,0.044118,...,0.013889,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0,0.555556,0.055556,0.0,0.0,0.055556,0.014493,...,0.000000,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0,0.099415,0.000000,0.0,0.0,0.005848,0.116667,...,0.000000,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0,0.180000,0.000000,0.0,0.0,0.000000,0.052133,...,0.000000,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.935065


In [334]:
# processed_dataset[~processed_dataset['Subject_Name'].str.startswith('SC')].groupby('Category')['Class'].value_counts() 

In [393]:
processed_dataset['Class'].unique()
processed_dataset.groupby('Category')['Class'].value_counts()
# processed_dataset['Category'].unique().tolist()

Category  Class
brux      1          2
ins       1          9
n         0        116
narco     1          5
nfle      1         40
plm       1         10
rbd       1         22
sdb       1          4
Name: Class, dtype: int64

In [394]:
if logger:
    stop_logger(logger) 
util, logger = start_logger(result_save_path, exp_name)

# classifier_obj = HumachLab_ML_CLassifiers(logger=logger, directory=result_save_path, dataset=dataset.copy(), class_name=class_name, metadata_column=metadata_column, split_column=split_column) 
classifier_obj = HumachLab_ML_CLassifiers(logger=logger, directory=result_save_path, dataset=processed_dataset.copy(), class_name=class_name, label_map=label_map, metadata_column=metadata_column, split_column=split_column, random_state_value=random_state_value, split_balance_pattern=split_balance_pattern) 

classifier_obj 



        Object is initialised with the following properties: 
        ###################################################################################################
        Dataset size: (208, 38), Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        Target class column name: Class
        Metadata column names: ['Dataset', 'Category', 'Subject_Name']
        Dataset split column on which the training and test sets will be devided: Subject_Name
        Is multi-class classification: False
        


<__main__.HumachLab_ML_CLassifiers at 0x1961b1a1c88>

In [395]:
classifier_obj.print_message()

Hello from HumachLab_ML_CLassifiers class


In [396]:
# splitting_crieteria = [(10, 0), (5, 20)]     ### for test & training (validation) splitting_crieteria (m, n)-m folds, n%:  m=0: loso, m>0: m-fold, (n>0 given m>0) -shuffled random splitting with m-fold, n% testing 
splitting_crieteria = [(5, 0), (5, 20)]     ### for test & training (validation) splitting_crieteria (m, n)-m folds, n%:  m=0: loso, m>0: m-fold, (n>0 given m>0) -shuffled random splitting with m-fold, n% testing 
model_list = [ML_Classifiers.LogReg, ML_Classifiers.SVC, ML_Classifiers.NB, ML_Classifiers.kNN, ML_Classifiers.DT, ML_Classifiers.RF, ML_Classifiers.GBoost] # [ML_Classifiers.LogReg, ML_Classifiers.SVC, ML_Classifiers.NB, ML_Classifiers.kNN, ML_Classifiers.DT, ML_Classifiers.RF, ML_Classifiers.GBoost] 
should_use_params = True 
is_validate_models = True
# is_binary_classification = False 
apply_feature_selection = False  
custom_splitter = False
exp_name = exp_name

splitting_crieteria, model_list, result_save_path, should_use_params 

([(5, 0), (5, 20)],
 [<ML_Classifiers.LogReg: 'logistic_regression'>,
  <ML_Classifiers.SVC: 'support_vector_classifier'>,
  <ML_Classifiers.NB: 'naive_bayes'>,
  <ML_Classifiers.kNN: 'k_nearest_neighbors'>,
  <ML_Classifiers.DT: 'decision_tree'>,
  <ML_Classifiers.RF: 'random_forest'>,
  <ML_Classifiers.GBoost: 'gradient_boosting'>],
 './Results/_Classification/ML1001/',
 True)

In [397]:
classifier_obj.class_name, classifier_obj.split_column

('Class', 'Subject_Name')

In [None]:
### Set the classifier parameters in the "HumachLab_ML_CLassifiers" class file to run with the parameter 
# best_tr_model, tr_model, tr_model_scores_df, tr_target_and_prediction_df, ts_model, ts_model_scores_df, ts_target_and_prediction_df, ts_fold_info_df, exp_info_df
best_tr_model, tr_model, tr_model_scores_df, tr_target_and_prediction_df, ts_model, ts_model_scores_df, ts_target_and_prediction_df, ts_fold_info_df, exp_info_df = classifier_obj.classify(
    should_use_params=should_use_params, splitting_crieteria=splitting_crieteria, model_list=model_list, is_validate_models=is_validate_models, 
    result_save_path=result_save_path, exp_name=exp_name, exp_detail=exp_detail, apply_feature_selection=apply_feature_selection, custom_splitter=custom_splitter) 

stop_logger(logger) 

exp_sum_df = modify_experiment_information_summarry(result_directory, dict_dat=exp_detail) 
exp_sum_df 


        Classification is set with the following parameters: 
        ###################################################################################################
        Splitting crieteria: [(5, 0), (5, 20)]
        Test split: 5-fold cross validation
        Training split: 5-fold 20% random test splitting
        List of ML models that will be applied: ['logistic_regression', 'support_vector_classifier', 'naive_bayes', 'k_nearest_neighbors', 'decision_tree', 'random_forest', 'gradient_boosting']
        Use parameters for model: True
        Is validate the model (or only train): True 
        Classification results will be saved in the directory: ./Results/_Classification/ML1001/
        
5-fold testing

            ### MODEL TEST PHASE 
            TEST 1 START... XXXXX 
            Test=> 42 ['brux1', 'brux2', 'sdb1', 'sdb2', 'sdb3', 'sdb4', 'ins1', 'ins2', 'ins3', 'ins4', 'ins5', 'ins6', 'ins7', 'ins8', 'ins9', 'narco1', 'narco2', 'narco3', 'narco4', 'n1', 'n2', 'n3', '

 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.84462064        nan 0.84462064        nan 0.84462064        nan
 0.84462064        nan 0.84462064        nan 0.84462064        nan
 0.86374248        nan 0.86374248        nan 0.86374248        nan
 0.86374248        nan 0.86374248        nan 0.86374248        nan
 0.89258609        nan 0.89258609        nan 0.89258609        nan
 0.89258609        nan 0.89258609        nan 0.89258609        nan
 0.90909403        nan 0.90909403        nan 0.90909403        nan
 0.90909403        nan 0.90909403        nan 0.90909403        nan]
 0.71365079        nan 0.71365079        nan 0.71365079        nan
 0.71365079        nan 0.71365079        nan 0.71365079      

Fitting 5 folds for each of 21 candidates, totalling 105 fits

        From training? False, Data shape: (20, 38), Indices: [ 63  11 143 139 129  42  75  48  93  44 131 120  59 156  10  73 105  34
 163  32]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1] [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1]
[[ 7  2]
 [ 0 11]]
[[11  0  2  7]
 [ 7  2  0 11]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precisio

 0.90910279 0.88969697 0.89107376 0.91417339 0.89462667 0.90468487
 0.91792017 0.92250361 0.90864528 0.90173102 0.89690285 0.91567763
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]
 1.         1.         1.         0.92571268 0.93523169 0.98567067
 1.         1.         1.         1.         1.         1.
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]


Fitting 5 folds for each of 243 candidates, totalling 1215 fits

        From training? False, Data shape: (20, 38), Indices: [ 63  11 143 139 129  42  75  48  93  44 131 120  59 156  10  73 105  34
 163  32]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1] [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1]
[[ 7  2]
 [ 0 11]]
[[11  0  2  7]
 [ 7  2  0 11]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precis

 0.92059051 0.92059051 0.92059051 0.87811858 0.93167276 0.91417339
 0.90735521 0.91417339 0.92059051 0.92059051 0.92059051 0.92059051
 0.91289194 0.87284271 0.93677786 0.9070821  0.91955155 0.92059051
 0.92059051 0.92059051 0.91420395 0.89934641 0.84984403 0.88766017
 0.90468014 0.92465665 0.93107376 0.92059051 0.91420395 0.90778684
 0.91780303 0.88598967 0.89370003 0.9078174  0.91417339 0.9050348
 0.92525565 0.91289194 0.90778684 0.83581594 0.90237287 0.889266
 0.9306338  0.91313443 0.92468721 0.92525565 0.91417339 0.91417339
 0.83591083 0.89102313 0.90339738 0.91289194 0.91185298 0.91289194
 0.92596866 0.9192785  0.91289194 0.88428317 0.92523024 0.88757448
 0.9192785  0.91420395 0.91253544 0.91417339 0.90778684 0.92059051
 0.89100312 0.89341887 0.90408964 0.92465665 0.92364825 0.91930906
 0.91955155 0.91420395 0.91289194 0.90428922 0.90273803 0.92059051
 0.92059051 0.91417339 0.92059051 0.92059051 0.92059051 0.92059051
 0.8874297  0.90805507 0.91417339 0.91420395 0.91417339 0.9141733

Fitting 5 folds for each of 405 candidates, totalling 2025 fits

        From training? False, Data shape: (20, 38), Indices: [ 63  11 143 139 129  42  75  48  93  44 131 120  59 156  10  73 105  34
 163  32]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1] [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]
[[7 2]
 [2 9]]
[[9 2 2 7]
 [7 2 2 9]]
[[16  4]
 [ 4 16]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 80.0
 Precision = [0.818

 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.84220484        nan 0.84220484        nan 0.84220484        nan
 0.84220484        nan 0.84220484        nan 0.84220484        nan
 0.87678927        nan 0.87678927        nan 0.87678927        nan
 0.87678927        nan 0.87678927        nan 0.87678927        nan
 0.88295827        nan 0.88295827        nan 0.88295827        nan
 0.88295827        nan 0.88295827        nan 0.88295827        nan
 0.90532819        nan 0.90532819        nan 0.90532819        nan
 0.90532819        nan 0.90532819        nan 0.90532819        nan]
 0.71929044        nan 0.71929044        nan 0.71929044        nan
 0.71929044        nan 0.71929044        nan 0.71929044      

Fitting 5 folds for each of 21 candidates, totalling 105 fits

        From training? False, Data shape: (20, 38), Indices: [132 151 147 134  29  34 140  67  27  60  90 109   7 161  56  46 110 105
 145  71]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1] [0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1]
[[ 9  1]
 [ 0 10]]
[[10  0  1  9]
 [ 9  1  0 10]]
[[19  1]
 [ 1 19]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 95.0
 Precisio

 0.85976891 0.85032193 0.85976891 0.91100512 0.81422342 0.86712866
 0.87259486 0.8671876  0.85826166 0.85976891 0.87259486 0.86617774
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]
 1.         1.         1.         0.92139964 0.93435031 0.99071952
 1.         1.         1.         1.         1.         1.
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]


Fitting 5 folds for each of 243 candidates, totalling 1215 fits

        From training? False, Data shape: (20, 38), Indices: [132 151 147 134  29  34 140  67  27  60  90 109   7 161  56  46 110 105
 145  71]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1] [0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1]
[[9 1]
 [1 9]]
[[9 1 1 9]
 [9 1 1 9]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precision = [0.9 0

 0.91705554 0.91705554 0.91705554 0.91100512 0.92151403 0.91670922
 0.91705554 0.91705554 0.92275964 0.91705554 0.91705554 0.91705554
 0.88854247 0.87116013 0.91641044 0.91645334 0.91641044 0.91641044
 0.92275964 0.92275964 0.92275964 0.84020926 0.88975259 0.91340587
 0.90368174 0.91675364 0.92145245 0.91544645 0.92275964 0.92275964
 0.88357143 0.84724067 0.88848967 0.91675364 0.92783901 0.92275964
 0.92145245 0.92275964 0.91641044 0.90441558 0.90476575 0.89512665
 0.91070322 0.92145245 0.90823944 0.91031424 0.92275964 0.91641044
 0.86380231 0.90188711 0.932219   0.92145245 0.91040443 0.92145245
 0.92275964 0.92275964 0.92275964 0.84814785 0.88322638 0.89407247
 0.91645334 0.90968775 0.91645334 0.90968775 0.92275964 0.92275964
 0.88295391 0.8788044  0.87641775 0.92275964 0.92275964 0.92275964
 0.92275964 0.92275964 0.92275964 0.88731231 0.9167825  0.90530951
 0.91705554 0.91705554 0.91705554 0.91705554 0.91705554 0.91705554
 0.87326791 0.90075759 0.91632025 0.91705554 0.92275964 0.9227

Fitting 5 folds for each of 405 candidates, totalling 2025 fits

        From training? False, Data shape: (20, 38), Indices: [132 151 147 134  29  34 140  67  27  60  90 109   7 161  56  46 110 105
 145  71]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1] [0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1]
[[9 1]
 [2 8]]
[[8 2 1 9]
 [9 1 2 8]]
[[17  3]
 [ 3 17]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 85.0
 Precision = [0.888

 0.70216074        nan 0.70216074        nan 0.70216074        nan
 0.70216074        nan 0.70216074        nan 0.70216074        nan
 0.70216074        nan 0.70216074        nan 0.70216074        nan
 0.75265547        nan 0.75265547        nan 0.75265547        nan
 0.75265547        nan 0.75265547        nan 0.75265547        nan
 0.84532695        nan 0.84532695        nan 0.84532695        nan
 0.84532695        nan 0.84532695        nan 0.84532695        nan
 0.86903952        nan 0.86903952        nan 0.86903952        nan
 0.86903952        nan 0.86903952        nan 0.86903952        nan
 0.88400306        nan 0.88400306        nan 0.88400306        nan
 0.88400306        nan 0.88400306        nan 0.88400306        nan
 0.88870894        nan 0.88870894        nan 0.88870894        nan
 0.88870894        nan 0.88870894        nan 0.88870894        nan]
 0.70221859        nan 0.70221859        nan 0.70221859        nan
 0.70221859        nan 0.70221859        nan 0.70221859      

Fitting 5 folds for each of 21 candidates, totalling 105 fits

        From training? False, Data shape: (20, 38), Indices: [ 26  90   7  88 145   4  39 162  44  79  95 144  68  36  48 119  80  56
 104 163]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0] [1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0]
[[ 5  2]
 [ 0 13]]
[[13  0  2  5]
 [ 5  2  0 13]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precisio

 0.83752655 0.85491641 0.84992009 0.9058518  0.87822831 0.83271421
 0.81531674 0.80601716 0.83501371 0.82869697 0.85051708 0.82824009
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]
 1.         1.         1.         0.92399873 0.93127631 0.97758319
 1.         1.         1.         1.         1.         1.
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]


Fitting 5 folds for each of 243 candidates, totalling 1215 fits

        From training? False, Data shape: (20, 38), Indices: [ 26  90   7  88 145   4  39 162  44  79  95 144  68  36  48 119  80  56
 104 163]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0] [1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0]
[[ 5  2]
 [ 1 12]]
[[12  1  2  5]
 [ 5  2  1 12]]
[[17  3]
 [ 3 17]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 85.0
 Precis

 0.9058518  0.91892369 0.91892369 0.87375223 0.91220779 0.90715898
 0.90615737 0.91892369 0.91892369 0.91892369 0.91892369 0.91892369
 0.84200794 0.88900772 0.9056884  0.90618793 0.91186147 0.92462779
 0.92462779 0.92462779 0.91821068 0.79788561 0.86942322 0.8913552
 0.91688121 0.90031025 0.92462779 0.92462779 0.92462779 0.9181884
 0.8316848  0.85976808 0.90217532 0.89159982 0.8985978  0.90500594
 0.9113925  0.92462779 0.92462779 0.8270088  0.8735023  0.88502637
 0.91177128 0.88493088 0.91137021 0.92462779 0.91177128 0.92462779
 0.86630937 0.90411215 0.89516106 0.8986039  0.89856724 0.89856724
 0.92462779 0.92462779 0.92462779 0.82507512 0.875403   0.90144958
 0.9113925  0.90504329 0.91250658 0.91647186 0.91177128 0.9181884
 0.85017538 0.86958916 0.88475936 0.91183919 0.90491645 0.9113925
 0.90491645 0.92462779 0.92462779 0.88695696 0.89045663 0.91892369
 0.90615737 0.91892369 0.91892369 0.91892369 0.91892369 0.91892369
 0.9056884  0.89395425 0.9056884  0.91892369 0.91892369 0.91892369

Fitting 5 folds for each of 405 candidates, totalling 2025 fits

        From training? False, Data shape: (20, 38), Indices: [ 26  90   7  88 145   4  39 162  44  79  95 144  68  36  48 119  80  56
 104 163]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0] [1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0]
[[ 5  2]
 [ 0 13]]
[[13  0  2  5]
 [ 5  2  0 13]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precis

 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.7191736         nan 0.7191736         nan 0.7191736         nan
 0.84928037        nan 0.84928037        nan 0.84928037        nan
 0.84928037        nan 0.84928037        nan 0.84928037        nan
 0.86844273        nan 0.86844273        nan 0.86844273        nan
 0.86844273        nan 0.86844273        nan 0.86844273        nan
 0.88789361        nan 0.88789361        nan 0.88789361        nan
 0.88789361        nan 0.88789361        nan 0.88789361        nan
 0.88789361        nan 0.88789361        nan 0.88789361        nan
 0.88789361        nan 0.88789361        nan 0.88789361        nan]
 0.71929044        nan 0.71929044        nan 0.71929044        nan
 0.71929044        nan 0.71929044        nan 0.71929044      

Fitting 5 folds for each of 21 candidates, totalling 105 fits

        From training? False, Data shape: (20, 38), Indices: [152  33 151  63  14 155 142 145 149 104   6  17  21 122  61  92  70  88
 115  48]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1] [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1]
[[ 8  2]
 [ 0 10]]
[[10  0  2  8]
 [ 8  2  0 10]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precisio

 0.79513476 0.83753743 0.79054553 0.91574835 0.88769936 0.79368152
 0.80415328 0.81733716 0.81121211 0.80347017 0.81174603 0.80878786
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]
 1.         1.         1.         0.9265778  0.93590303 0.97465041
 0.99699248 1.         1.         1.         1.         1.
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]


Fitting 5 folds for each of 243 candidates, totalling 1215 fits

        From training? False, Data shape: (20, 38), Indices: [152  33 151  63  14 155 142 145 149 104   6  17  21 122  61  92  70  88
 115  48]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1] [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1]
[[ 8  2]
 [ 0 10]]
[[10  0  2  8]
 [ 8  2  0 10]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precis

 0.92213491 0.92213491 0.92213491 0.90315593 0.87583077 0.90336258
 0.92213491 0.91574835 0.92213491 0.92213491 0.92213491 0.9161289
 0.88824738 0.91149648 0.9136584  0.91376238 0.9161289  0.9161289
 0.9161289  0.9161289  0.9161289  0.80118596 0.89851875 0.90238474
 0.90933124 0.90271069 0.90296873 0.92784919 0.9161289  0.91007848
 0.84247983 0.88597572 0.86010927 0.90196881 0.91007848 0.90974235
 0.91482171 0.90971179 0.90369193 0.86410557 0.86120827 0.850268
 0.87522342 0.90974235 0.90939603 0.92120827 0.91574835 0.92120827
 0.86709122 0.85002996 0.8875817  0.90715898 0.86310221 0.90849673
 0.90974235 0.91574835 0.9161289  0.83623867 0.9009944  0.88793821
 0.90846617 0.90238474 0.92692255 0.92692255 0.91482171 0.91515785
 0.8671855  0.88876238 0.8905691  0.92659339 0.89691831 0.90199767
 0.91482171 0.9161289  0.9161289  0.8923007  0.89922158 0.92146264
 0.9161289  0.92213491 0.92213491 0.9161289  0.92213491 0.92213491
 0.91250362 0.88835112 0.91578258 0.9161289  0.92213491 0.92213491

Fitting 5 folds for each of 405 candidates, totalling 2025 fits

        From training? False, Data shape: (20, 38), Indices: [152  33 151  63  14 155 142 145 149 104   6  17  21 122  61  92  70  88
 115  48]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1] [0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1]
[[ 8  2]
 [ 0 10]]
[[10  0  2  8]
 [ 8  2  0 10]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precis

 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.73292929        nan 0.73292929        nan 0.73292929        nan
 0.73292929        nan 0.73292929        nan 0.73292929        nan
 0.85132704        nan 0.85132704        nan 0.85132704        nan
 0.85132704        nan 0.85132704        nan 0.85132704        nan
 0.86298614        nan 0.86298614        nan 0.86298614        nan
 0.86298614        nan 0.86298614        nan 0.86298614        nan
 0.88604969        nan 0.88604969        nan 0.88604969        nan
 0.88604969        nan 0.88604969        nan 0.88604969        nan
 0.90912459        nan 0.90912459        nan 0.90912459        nan
 0.90912459        nan 0.90912459        nan 0.90912459        nan]
 0.71365079        nan 0.71365079        nan 0.71365079        nan
 0.71365079        nan 0.71365079        nan 0.71365079      

Fitting 5 folds for each of 21 candidates, totalling 105 fits

        From training? False, Data shape: (20, 38), Indices: [ 83  65  29  61  95  67  25 150  76  26 123  63 151 149  97   0 140  99
  69 125]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0] [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0]
[[ 6  3]
 [ 0 11]]
[[11  0  3  6]
 [ 6  3  0 11]]
[[17  3]
 [ 3 17]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 85.0
 Precisio

 0.81831933 0.82117647 0.81842991 0.91955155 0.8280163  0.77544198
 0.81330945 0.82616756 0.79969697 0.77679654 0.79610722 0.79778474
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]
 1.         1.         1.         0.93105453 0.94973145 0.98254585
 0.99847328 1.         1.         1.         1.         1.
        nan        nan        nan        nan        nan        nan
        nan        nan        nan]


Fitting 5 folds for each of 243 candidates, totalling 1215 fits

        From training? False, Data shape: (20, 38), Indices: [ 83  65  29  61  95  67  25 150  76  26 123  63 151 149  97   0 140  99
  69 125]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0] [1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0]
[[ 7  2]
 [ 1 10]]
[[10  1  2  7]
 [ 7  2  1 10]]
[[17  3]
 [ 3 17]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 85.0
 Precis

 0.92596866 0.92596866 0.92596866 0.89569986 0.90535241 0.92596866
 0.92596866 0.92596866 0.92596866 0.9195821  0.92596866 0.92596866
 0.88572511 0.91954843 0.89248366 0.9195821  0.93134681 0.92596866
 0.92596866 0.92596866 0.92596866 0.87398268 0.88657818 0.89801923
 0.91316499 0.9195821  0.9195821  0.92596866 0.91955155 0.92596866
 0.8590404  0.90557486 0.9048985  0.90495134 0.90677844 0.92596866
 0.91955155 0.9195821  0.91316499 0.86749512 0.8932563  0.91313443
 0.90715898 0.89853423 0.91854314 0.9249297  0.91955155 0.9195821
 0.86502358 0.91679951 0.91103423 0.92596866 0.90808563 0.91924964
 0.9195821  0.92596866 0.9195821  0.89565544 0.90735033 0.89709001
 0.93134681 0.9195821  0.91818664 0.91851258 0.92596866 0.92596866
 0.89151812 0.87807103 0.89970291 0.9195821  0.90561151 0.91248254
 0.90672736 0.92596866 0.91955155 0.88368301 0.9195821  0.90325026
 0.91955155 0.92596866 0.92596866 0.92596866 0.92596866 0.92596866
 0.88506739 0.90966632 0.9195821  0.91996265 0.91955155 0.92596

Fitting 5 folds for each of 405 candidates, totalling 2025 fits

        From training? False, Data shape: (20, 38), Indices: [ 83  65  29  61  95  67  25 150  76  26 123  63 151 149  97   0 140  99
  69 125]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0] [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0]
[[ 6  3]
 [ 1 10]]
[[10  1  3  6]
 [ 6  3  1 10]]
[[16  4]
 [ 4 16]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 80.0
 Precis

 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.71356974        nan 0.71356974        nan 0.71356974        nan
 0.84462064        nan 0.84462064        nan 0.84462064        nan
 0.84462064        nan 0.84462064        nan 0.84462064        nan
 0.86374248        nan 0.86374248        nan 0.86374248        nan
 0.86374248        nan 0.86374248        nan 0.86374248        nan
 0.89258609        nan 0.89258609        nan 0.89258609        nan
 0.89258609        nan 0.89258609        nan 0.89258609        nan
 0.90909403        nan 0.90909403        nan 0.90909403        nan
 0.90909403        nan 0.90909403        nan 0.90909403        nan]
 0.71365079        nan 0.71365079        nan 0.71365079        nan
 0.71365079        nan 0.71365079        nan 0.71365079      

Fitting 5 folds for each of 21 candidates, totalling 105 fits

        From training? False, Data shape: (20, 38), Indices: [ 63  11 143 139 129  42  75  48  93  44 131 120  59 156  10  73 105  34
 163  32]
        All Columns: ['Dataset', 'Category', 'Subject_Name', 'Class', 'W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1', 'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1', 'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1', 'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1', 'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1', 'REM->S2', 'REM->S3', 'REM->REM']
        

        Feature shape: (20, 34), Target shape: (20,), Metadata: (20, 3) 
        
[0 1] [0 1]
[1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1] [1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1]
[[ 7  2]
 [ 0 11]]
[[11  0  2  7]
 [ 7  2  0 11]]
[[18  2]
 [ 2 18]]
Class-wise info: For multilevel internal scores fo label 0: 
 Accuracy = 90.0
 Precisio

In [None]:
# LogisticRegression?

In [None]:
# best_tr_model, tr_model, tr_model_scores_df, tr_target_and_prediction_df, ts_model, ts_model_scores_df, ts_target_and_prediction_df, ts_fold_info_df, exp_info_df
# all_best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df, all_exp_info_df

In [None]:
# ts_model 
# best_tr_model
# tr_model

In [None]:
print( len(ts_fold_info_df[:1]['Test'].values[0]), ts_fold_info_df[:1]['Test'].values[0] ) 
print( len(ts_fold_info_df[:1]['Validation'].values[0]), ts_fold_info_df[:1]['Validation'].values[0] ) 
print( len(ts_fold_info_df[:1]['Training'].values[0]), ts_fold_info_df[:1]['Training'].values[0] ) 
21+20+167 

In [None]:
exp_info_df

In [None]:
ts_fold_info_df

In [375]:
ts_model_scores_df

Unnamed: 0,Test_No,Model_No,method,model,model_parameters,model_scores,confusion_matrix,accuracy,precision,recall,sensitivity,specificity,f1_score,roc_auc
0,1,1,"GridSearchCV(cv=5, estimator=LogisticRegressio...",LogisticRegression(max_iter=50),"{'C': 1.0, 'max_iter': 50, 'penalty': 'l2'}",89.65,"[[19, 5], [1, 18]]",86.047,78.261,94.737,94.737,79.167,85.714,0.87
7,2,1,"GridSearchCV(cv=5, estimator=LogisticRegressio...",LogisticRegression(max_iter=50),"{'C': 1.0, 'max_iter': 50, 'penalty': 'l2'}",90.91,"[[20, 3], [0, 19]]",92.857,86.364,100.0,100.0,86.957,92.683,0.935
14,3,1,"GridSearchCV(cv=5, estimator=LogisticRegressio...",LogisticRegression(max_iter=50),"{'C': 1.0, 'max_iter': 50, 'penalty': 'l2'}",90.28,"[[20, 3], [0, 18]]",92.683,85.714,100.0,100.0,86.957,92.308,0.935
21,4,1,"GridSearchCV(cv=5, estimator=LogisticRegressio...",LogisticRegression(max_iter=50),"{'C': 1.0, 'max_iter': 50, 'penalty': 'l2'}",90.28,"[[20, 3], [0, 18]]",92.683,85.714,100.0,100.0,86.957,92.308,0.935
28,5,1,"GridSearchCV(cv=5, estimator=LogisticRegressio...",LogisticRegression(max_iter=50),"{'C': 1.0, 'max_iter': 50, 'penalty': 'l2'}",90.28,"[[18, 5], [0, 18]]",87.805,78.261,100.0,100.0,78.261,87.805,0.891
1,1,2,"GridSearchCV(cv=5, estimator=SVC(), n_jobs=50,...","SVC(kernel='linear', probability=True)","{'C': 1.0, 'kernel': 'linear', 'probability': ...",91.61,"[[20, 4], [0, 19]]",90.698,82.609,100.0,100.0,83.333,90.476,0.917
8,2,2,"GridSearchCV(cv=5, estimator=SVC(), n_jobs=50,...","SVC(kernel='linear', probability=True)","{'C': 1.0, 'kernel': 'linear', 'probability': ...",92.06,"[[20, 3], [0, 19]]",92.857,86.364,100.0,100.0,86.957,92.683,0.935
15,3,2,"GridSearchCV(cv=5, estimator=SVC(), n_jobs=50,...","SVC(kernel='linear', probability=True)","{'C': 1.0, 'kernel': 'linear', 'probability': ...",92.06,"[[19, 4], [0, 18]]",90.244,81.818,100.0,100.0,82.609,90.0,0.913
22,4,2,"GridSearchCV(cv=5, estimator=SVC(), n_jobs=50,...","SVC(kernel='linear', probability=True)","{'C': 1.0, 'kernel': 'linear', 'probability': ...",92.06,"[[20, 3], [0, 18]]",92.683,85.714,100.0,100.0,86.957,92.308,0.935
29,5,2,"GridSearchCV(cv=5, estimator=SVC(), n_jobs=50,...","SVC(kernel='linear', probability=True)","{'C': 1.0, 'kernel': 'linear', 'probability': ...",92.06,"[[19, 4], [0, 18]]",90.244,81.818,100.0,100.0,82.609,90.0,0.913


In [376]:
ts_model_scores_df[(ts_model_scores_df['Model_No']==6)]

Unnamed: 0,Test_No,Model_No,method,model,model_parameters,model_scores,confusion_matrix,accuracy,precision,recall,sensitivity,specificity,f1_score,roc_auc
5,1,6,"GridSearchCV(cv=5, estimator=RandomForestClass...","(DecisionTreeClassifier(criterion='entropy', m...","{'criterion': 'entropy', 'max_depth': 20, 'n_e...",94.22,"[[23, 1], [0, 19]]",97.674,95.0,100.0,100.0,95.833,97.436,0.979
12,2,6,"GridSearchCV(cv=5, estimator=RandomForestClass...","(DecisionTreeClassifier(max_depth=5, max_featu...","{'criterion': 'gini', 'max_depth': 5, 'n_estim...",92.46,"[[21, 2], [0, 19]]",95.238,90.476,100.0,100.0,91.304,95.0,0.957
19,3,6,"GridSearchCV(cv=5, estimator=RandomForestClass...","(DecisionTreeClassifier(criterion='entropy', m...","{'criterion': 'entropy', 'max_depth': 7, 'n_es...",93.68,"[[23, 0], [0, 18]]",100.0,100.0,100.0,100.0,100.0,100.0,1.0
26,4,6,"GridSearchCV(cv=5, estimator=RandomForestClass...","(DecisionTreeClassifier(criterion='entropy', m...","{'criterion': 'entropy', 'max_depth': 7, 'n_es...",93.68,"[[23, 0], [0, 18]]",100.0,100.0,100.0,100.0,100.0,100.0,1.0
33,5,6,"GridSearchCV(cv=5, estimator=RandomForestClass...","(DecisionTreeClassifier(criterion='entropy', m...","{'criterion': 'entropy', 'max_depth': 5, 'n_es...",92.69,"[[21, 2], [0, 18]]",95.122,90.0,100.0,100.0,91.304,94.737,0.957


In [None]:
rf1 = ts_model[1][6].estimator 
rf1 = ts_model[1][6] 
rf1.feature_importances_ 

In [None]:
ts_model 

In [None]:
ts_model_scores_df['method_name'][-5:].values, ts_model_scores_df['method_name'][-5:].values[0], type(ts_model_scores_df['method_name'][-5:].values[0]) 
ts_model_scores_df[-10:] 

In [None]:
tr_model_scores_df

In [None]:
ts_target_and_prediction_df

In [None]:
metadata_column, class_name

In [None]:
# Sort the dataframe based on the list as the column values
sorter = processed_dataset['Subject_Name'].values.tolist()
sorter

ts_target_and_prediction_df['Subject_Name'] = ts_target_and_prediction_df['Subject_Name'].astype("category")
ts_target_and_prediction_df['Subject_Name'] = ts_target_and_prediction_df['Subject_Name'].cat.set_categories(sorter)
ts_target_and_prediction_df
ts_target_and_prediction_df2 = ts_target_and_prediction_df.sort_values(['Subject_Name'])
ts_target_and_prediction_df2

In [None]:
ts_target_and_prediction_df2[(ts_target_and_prediction_df2['Subject_Name'].str.match(r'^n\d')==True)]

In [None]:
# SVC?

In [None]:
tr_target_and_prediction_df

### Test saved results

In [174]:
result_save_path

'./Results/_Classification/ML1001/'

In [157]:
result_save_path2 = result_save_path 
result_save_path2 = './Results/_Classification/ML1001/'
result_save_path2

'./Results/_Classification/ML1001/'

In [158]:
class_name

'Class'

In [159]:
if logger:
    stop_logger(logger) 
util, logger = start_logger('./Results/', '00')

classifier_obj2 = HumachLab_ML_CLassifiers(logger=logger, directory=result_save_path2, dataset=pd.DataFrame(columns=[class_name]), class_name=class_name, label_map={}, metadata_column=metadata_column, split_column=split_column, random_state_value=0, split_balance_pattern=[], check_result=True) 

classifier_obj2 


        Object is initialised with the following properties: 
        ###################################################################################################
        Dataset size: (0, 1), Columns: ['Class']
        Target class column name: Class
        Metadata column names: ['Dataset', 'Category', 'Subject_Name']
        Dataset split column on which the training and test sets will be devided: Class
        Is multi-class classification: False
        


<__main__.HumachLab_ML_CLassifiers at 0x1961723ff08>

In [161]:
best_tr_model2, tr_model2, tr_model_scores_df2, tr_target_and_prediction_df2, ts_model2, ts_model_scores_df2, ts_target_and_prediction_df2, ts_fold_info_df2, exp_info_df2  = classifier_obj2.load_results(result_save_path2)  



        Data is being loaded from: ./Results/_Classification/ML1001/
        
Start retrieving Test Models model from file...
Finish retrieving Test Models model from file...
Start retrieving Training Models model from file...
Finish retrieving Training Models model from file...
Start retrieving Best Training Models model from file...
Finish retrieving Best Training Models model from file...


In [None]:
ts_fold_info_df2

In [None]:
tt = ts_fold_info_df2[ ts_fold_info_df2['Model_No']==1 ] ['Selected_Features'] 
tt

for t in tt:
    print(t) 

In [None]:
ts_model_scores_df2

In [165]:
ts_model2[1][6].best_estimator_.feature_importances_

array([0.07192441, 0.05810145, 0.        , 0.        , 0.00383925,
       0.02089143, 0.03123972, 0.01320727, 0.        , 0.00033542,
       0.01480059, 0.02185206, 0.03852718, 0.06779852, 0.03535199,
       0.00030042, 0.02001342, 0.01768856, 0.00402689, 0.14115883,
       0.13452728, 0.0399399 , 0.00179775, 0.01935714, 0.00362998,
       0.01058212, 0.05096356, 0.11031693, 0.        , 0.02200328,
       0.00473437, 0.01572608, 0.00154661, 0.02381757])

In [None]:
# model_list

In [None]:
tr_model_scores_df2 

In [None]:
tr_model_scores_df2[ (tr_model_scores_df2['Model_No']==6) ]['accuracy'].min() 

In [None]:
# ts_model_scores_df['method_name'][-5:].values, ts_model_scores_df['method_name'][-5:].values[0], type(ts_model_scores_df['method_name'][-5:].values[0]) 
ts_model_scores_df2[-30:] 
ts_model_scores_df2[ ts_model_scores_df2['Model_No']==1 ] 

In [None]:
ts_model_scores_df2[ ts_model_scores_df2['Model_No']==6 ]

In [None]:
ts_model_scores_df2[ ts_model_scores_df2['Model_No']==7 ]

In [None]:
type(all_ts_model[1][1]), type(all_ts_model[1][1].estimator)

In [None]:
print(model_list)

In [None]:
dataset

In [None]:
# dataset.to_csv('./dataset.csv', index=False) 

In [292]:
# best_tr_model

#### Show and save RF Feature selection 

In [281]:
splitting_crieteria, (splitting_crieteria[0][0]+1)

([(5, 0), (5, 20)], 6)

In [282]:
# ts_model 
processed_dataset 

Unnamed: 0,Dataset,Category,Subject_Name,Class,W->S1,W->S2,W->S3,W->S4,W->REM,S1->W,...,S4->S1,S4->S2,S4->S3,S4->S4,S4->REM,REM->W,REM->S1,REM->S2,REM->S3,REM->REM
0,CAP_Sleep,brux,brux1,1,0.190476,0.009524,0.0,0.0,0.000000,0.030769,...,0.000000,0.004926,0.024631,0.970443,0.0,0.033520,0.000000,0.000000,0.0,0.966480
1,CAP_Sleep,brux,brux2,1,0.174603,0.000000,0.0,0.0,0.000000,0.061728,...,0.000000,0.000000,0.012048,0.984940,0.0,0.019324,0.004831,0.000000,0.0,0.975845
2,CAP_Sleep,sdb,sdb1,1,0.123288,0.000000,0.0,0.0,0.000000,0.099237,...,0.010152,0.005076,0.030457,0.949239,0.0,0.009346,0.000000,0.028037,0.0,0.962617
3,CAP_Sleep,sdb,sdb2,1,0.044872,0.006410,0.0,0.0,0.000000,0.011364,...,0.000000,0.008475,0.016949,0.966102,0.0,0.000000,0.000000,0.025641,0.0,0.974359
4,CAP_Sleep,sdb,sdb3,1,0.046948,0.018779,0.0,0.0,0.000000,0.108696,...,0.000000,0.025974,0.038961,0.935065,0.0,0.062500,0.000000,0.000000,0.0,0.937500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Sleep_EDFX,n,ST7191,0,0.242857,0.014286,0.0,0.0,0.000000,0.044118,...,0.013889,0.013889,0.166667,0.805556,0.0,0.000000,0.020000,0.015000,0.0,0.965000
204,Sleep_EDFX,n,ST7201,0,0.555556,0.055556,0.0,0.0,0.055556,0.014493,...,0.000000,0.038462,0.615385,0.346154,0.0,0.044776,0.007463,0.014925,0.0,0.932836
205,Sleep_EDFX,n,ST7211,0,0.099415,0.000000,0.0,0.0,0.005848,0.116667,...,0.000000,0.016949,0.254237,0.728814,0.0,0.025641,0.000000,0.019231,0.0,0.955128
206,Sleep_EDFX,n,ST7221,0,0.180000,0.000000,0.0,0.0,0.000000,0.052133,...,0.000000,0.000000,0.000000,0.000000,0.0,0.021645,0.043290,0.000000,0.0,0.935065


In [349]:
ts_model[3][6].feature_names

In [377]:
# ts_model[1][6].best_estimator_.feature_names = ['hi', 'hello', 5, 1.5] 
ts_model[1][6].feature_names, ts_model[1][6].best_estimator_.feature_importances_

(array(['W->S1', 'W->S2', 'W->S3', 'W->S4', 'W->REM', 'S1->W', 'S1->S1',
        'S1->S2', 'S1->S3', 'S1->S4', 'S1->REM', 'S2->W', 'S2->S1',
        'S2->S2', 'S2->S3', 'S2->S4', 'S2->REM', 'S3->W', 'S3->S1',
        'S3->S2', 'S3->S3', 'S3->S4', 'S3->REM', 'S4->W', 'S4->S1',
        'S4->S2', 'S4->S3', 'S4->S4', 'S4->REM', 'REM->W', 'REM->S1',
        'REM->S2', 'REM->S3', 'REM->REM'], dtype=object),
 array([0.06424144, 0.06910475, 0.        , 0.        , 0.0019275 ,
        0.01064262, 0.01663351, 0.02011725, 0.        , 0.        ,
        0.01722061, 0.02255312, 0.02964559, 0.06868744, 0.03052843,
        0.        , 0.02427807, 0.01375296, 0.0005258 , 0.08010924,
        0.1580388 , 0.04769964, 0.        , 0.01180538, 0.00118775,
        0.01821783, 0.05968414, 0.12281662, 0.        , 0.02361403,
        0.01474488, 0.0396913 , 0.        , 0.0325313 ]))

In [300]:
# for lll in range(1, 6):
#     print(lll, ts_model[lll][6].best_estimator_, len(ts_model[lll][6].feature_names), ts_model[lll][6].best_estimator_.feature_importances_.shape[0], ts_model[lll][6].feature_names, ts_model[lll][6].best_estimator_.feature_importances_) 

In [308]:
# rf_feat_importance_df = pd.DataFrame() 

# for feat_ss in range(1, (splitting_crieteria[0][0]+1)):
# # #     print('---> ', feat_ss, rf_feat_importance_df)
# #     rf_feat_importance_df[f'Feature-{feat_ss}'] = ts_model[feat_ss][6].feature_names  
# # #     print(feat_ss, ts_model[fld_ss][6].best_estimator_.feature_importances_)
# #     rf_feat_importance_df[f'Fold-{feat_ss}'] = ts_model[feat_ss][6].best_estimator_.feature_importances_ 
#     tdf = pd.DataFrame({f'Feature-{feat_ss}': ts_model[feat_ss][6].feature_names,
#                        f'Fold-{feat_ss}': ts_model[feat_ss][6].best_estimator_.feature_importances_ }) 
#     rf_feat_importance_df = pd.concat([rf_feat_importance_df, tdf], axis=1) 
    
# rf_feat_importance_df

In [385]:
rf_feat_importance_df = pd.DataFrame() 
all_fts = processed_dataset.columns.values[4:]
rf_feat_importance_df[f'Feature'] = all_fts 
# rf_feat_importance_df.set_index([f'Feature'])

for feat_ss in range(1, (splitting_crieteria[0][0]+1)):
#     print('---> ', feat_ss, rf_feat_importance_df)
    fts = ts_model[feat_ss][6].feature_names 
#     print(feat_ss, ts_model[fld_ss][6].best_estimator_.feature_importances_)
    fts_imp = ts_model[feat_ss][6].best_estimator_.feature_importances_ 
    imp_lst = [fts_imp[fts.index(f)] if (f in fts and fts.index(f)>=0) else None for f in all_fts] # [MM[LL.index(f)] for f in LL if f in NN] 
    rf_feat_importance_df[f'Fold-{feat_ss}'] = imp_lst 
    
rf_feat_importance_df

Unnamed: 0,Feature,Fold-1,Fold-2,Fold-3,Fold-4,Fold-5
0,W->S1,0.04002,0.048111,0.064094,0.08114,0.048286
1,W->S2,0.052056,0.032538,0.07773,0.051324,0.065048
2,W->S3,0.0,0.0,0.00105,0.0,0.0
3,W->S4,0.0,0.0,0.000519,0.0,0.000248
4,W->REM,0.0,0.001828,0.003813,0.0,0.001737
5,S1->W,0.019573,0.028132,0.032371,0.014908,0.01706
6,S1->S1,0.037258,0.018897,0.01777,0.021126,0.018986
7,S1->S2,0.012524,0.007842,0.021617,0.016889,0.010687
8,S1->S3,0.001264,0.0,0.000389,0.0,0.0
9,S1->S4,0.0,0.0,0.0,0.0,0.000657


In [326]:
34==23+11

True

In [386]:
result_save_path, f'{result_save_path}rf_feat_importance.csv' 

('./Results/_Classification/ML1003/',
 './Results/_Classification/ML1003/rf_feat_importance.csv')

In [387]:
rf_feat_importance_df.to_csv(f'{result_save_path}rf_feat_importance.csv', index=False)  
rf_feat_importance_df

Unnamed: 0,Feature,Fold-1,Fold-2,Fold-3,Fold-4,Fold-5
0,W->S1,0.04002,0.048111,0.064094,0.08114,0.048286
1,W->S2,0.052056,0.032538,0.07773,0.051324,0.065048
2,W->S3,0.0,0.0,0.00105,0.0,0.0
3,W->S4,0.0,0.0,0.000519,0.0,0.000248
4,W->REM,0.0,0.001828,0.003813,0.0,0.001737
5,S1->W,0.019573,0.028132,0.032371,0.014908,0.01706
6,S1->S1,0.037258,0.018897,0.01777,0.021126,0.018986
7,S1->S2,0.012524,0.007842,0.021617,0.016889,0.010687
8,S1->S3,0.001264,0.0,0.000389,0.0,0.0
9,S1->S4,0.0,0.0,0.0,0.0,0.000657


In [388]:
rf_feat_importance_df = pd.read_csv(f'{result_save_path}rf_feat_importance.csv') 
rf_feat_importance_df 

Unnamed: 0,Feature,Fold-1,Fold-2,Fold-3,Fold-4,Fold-5
0,W->S1,0.04002,0.048111,0.064094,0.08114,0.048286
1,W->S2,0.052056,0.032538,0.07773,0.051324,0.065048
2,W->S3,0.0,0.0,0.00105,0.0,0.0
3,W->S4,0.0,0.0,0.000519,0.0,0.000248
4,W->REM,0.0,0.001828,0.003813,0.0,0.001737
5,S1->W,0.019573,0.028132,0.032371,0.014908,0.01706
6,S1->S1,0.037258,0.018897,0.01777,0.021126,0.018986
7,S1->S2,0.012524,0.007842,0.021617,0.016889,0.010687
8,S1->S3,0.001264,0.0,0.000389,0.0,0.0
9,S1->S4,0.0,0.0,0.0,0.0,0.000657


# ML Classifier class

In [92]:
### Data preprocessor class 
from scipy.stats import ttest_ind

class DataPreprocessor():
    
    def __init__(self):
        return
    
    def calculate_p_and_auc_for_feature(self, feat_data, label_data, binary_class=True): 
        # Extract the independent variable and dependent variable as dataframe and series 
        X = feat_data.copy()  # Replace 'independent_variable' with your column name
        y = label_data.copy()  # Replace 'dependent_variable' with your column name
        # print(X, y) 
        #print("111 Binary classification?", binary_class)

        # Perform a one-way ANOVA and calculate the p-value
        p_value = 1.0
        if binary_class:
            _, p_value = ttest_ind(X[y==0], X[y==1])  # Assuming binary classification 
            #print("222 Binary classification?", binary_class)
        else: 
            groups = [X[y == label] for label in np.unique(y)] # For multiclass classification 
            _, p_value = f_oneway(*groups)
            #print("222 Not binary classification?", binary_class)
        p_value = p_value[0] 

        # Display the p-value
        #print("P-value:", p_value)

        # Encode the target variable - For multiclass 
        if not binary_class: 
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)
            #print("333 Not binary classification?", binary_class)

        # Fit a logistic regression model and calculate the AUC
        model = None 
        #if binary_class:
        #    model = LogisticRegression()
        #else:
        #    model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
        # model = LogisticRegression()
        # model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
        model = SVC(probability=True)
        # model = SVC(C=1.0, random_state=1, kernel='linear', probability=True)
        model.fit(X, y)
        y_pred_proba = model.predict_proba(X)
        if binary_class: 
            y_pred_proba = y_pred_proba[:, 1]
            #print("444 Binary classification?", binary_class)

        # print(y_pred_proba) 
        auc = 0.0 
        if binary_class:
            auc = roc_auc_score(y, y_pred_proba)
            #print("555 Binary classification?", binary_class)
        else:
            auc = roc_auc_score(y, y_pred_proba, multi_class='ovr')
            #print("555 Not binary classification?", binary_class)

        # Display the AUC
        #print("AUC:", auc)
        return p_value, auc 
    
    def calculate_p_and_auc_for_dataset(self, all_feats_df, label_df, binary_class=True): 
        feat_cols = all_feats_df.columns.values.tolist() 

        all_p_list = [] 
        all_auc_list = [] 
        for ft in feat_cols:
            feat_data = all_feats_df[[ft]].copy() 
            label_data = label_df.copy() 
            # print("HHHHHH", feat_data.shape, type(feat_data), label_data.shape, type(label_data), binary_class)
            p, auc = self.calculate_p_and_auc_for_feature(feat_data, label_data, binary_class=binary_class) 
            all_p_list.append(p) 
            all_auc_list.append(auc) 

        all_p_and_auc_df = pd.DataFrame( {"Features": feat_cols, f"P_Value_{'bin' if binary_class else 'multi'}": all_p_list, f"AUC_{'bin' if binary_class else 'multi'}": all_auc_list} )    
        return all_p_and_auc_df 
    
    def get_selected_feature_list_based_on_PAUC(self, tmp_df, p_threshold=0.05, auc_threshold=0.5, sort=False): 
        cols = tmp_df['Features'].values.tolist() 
        if p_threshold:
            tmp_df = tmp_df[(tmp_df['P_Value_bin']<=p_threshold)]
        if auc_threshold:
            tmp_df = tmp_df[(tmp_df['AUC_bin']>=auc_threshold)]
        if sort:
            tmp_df = tmp_df.sort_values(['P_Value_bin', 'AUC_bin'], ascending = [True, False])
        selected_features = tmp_df['Features'].values.tolist() 
        return selected_features
    
    def select_pandauc_based_features(self, all_feats_df, label_df, binary_class=True, p_threshold=None, auc_threshold=None, sort=False): 
        tmp_df = self.calculate_p_and_auc_for_dataset(all_feats_df, label_df, binary_class=binary_class)
        selected_features = self.get_selected_feature_list_based_on_PAUC(tmp_df, p_threshold=p_threshold, auc_threshold=auc_threshold, sort=sort)        
        return selected_features 

In [93]:
### Custom splitter class 
import math
from sklearn.model_selection import BaseCrossValidator

# class MyCustomSplitter(BaseCrossValidator):
class MyCustomSplitter():
#     def __init__(self, n_splits):
#         self.n_splits = n_splits
        
#     def set_criteria(self, splt_cri, groups=[['n'], ['SC', 'ST']]):
#         self.splt_cri = splt_cri
#         self.groups = groups
        
    def __init__(self, splt_cri, groups=[['n'], ['SC', 'ST']]):
        self.splt_cri = splt_cri
        self.groups = groups

    def split(self, x, y=None):
        groups = self.groups 
        fold = self.splt_cri[0] 
        percent = self.splt_cri[1]
        y = np.array(y) 
        unique_y = np.unique(y)
        num_y = len(unique_y)
        y_indices = np.arange(num_y)
        # print('AAAA--->> ', unique_y, y_indices)

        remain_x = [i for i,v in enumerate(x)]
        all_filterred_x = []
        for grp in groups:
            # print('Group', grp)
            tmp_filterred_x = [] 
            for it in grp: 
                gg = f"^{it}\d"
                r = re.compile(gg) 
                filterred_x = list(filter(r.match, x))
                # print('BBBB--->> ', gg, filterred_x) 
                filterred_x_ind = [i for i,v in enumerate(x) if v in filterred_x]
                tmp_filterred_x.extend(filterred_x_ind) 
                # tmp_filterred_x.extend(filterred_x) 
            remain_x = [i for i in remain_x if i not in tmp_filterred_x] 
            all_filterred_x.append(tmp_filterred_x) 
            # print('CCCC--->> ', all_filterred_x)
            # print('222--->', grp, remain_x) 

        # print('--->', remain_x) 
        remain_x_ind = [i for i,v in enumerate(x) if i in remain_x]
        all_filterred_x.append(remain_x_ind) 
        # all_filterred_x.append(remain_x)     
        all_dat = [item for row in all_filterred_x for item in row]
        # print('DDDD--->> ', all_filterred_x, all_dat)

        num_groups = len(all_filterred_x) 
        groups_item_len = [len(it) for it in all_filterred_x] 
        groups_item_ratio = [int(it/fold) if (it/fold)==int(it/fold) else int(it/fold+1) for it in groups_item_len] 
        # print('EEEE--->> ', num_groups, groups_item_len, groups_item_ratio)

        main_grps = [it//fold  for it in groups_item_len]  # math.floor(it/fold) 
        ext_grps = [it%fold for it in groups_item_len] 
        # print('FFFF--->> ', main_grps, ext_grps)  

        all_fold_values = [[] for _ in range(fold)] 
        for i, dat in enumerate(all_fold_values):
            test_dat = [] 
            for l, (j,k,fd) in enumerate(zip(main_grps, ext_grps, all_filterred_x)): 
                # print('---->> ', i, j, k, i*j, i*j+j*1, i<k, fold*j+i)
                dd = fd[i*j : i*j+j*1]
                ex_dd = [fd[fold*j+i]] if i<k else []
                dd.extend(ex_dd)
                # print('---->> ', i, j, k, i*j, i*j+j*1, i<k, fold*j+i, dd, ex_dd)
                test_dat.extend( dd ) 
            test_dat = list(set(test_dat))
            train_dat = list( set(all_dat)-set(test_dat) ) 
            # print('GGGG--->> ', test_dat, train_dat)
            yield train_dat, test_dat 

In [379]:
### ML Classifier class 
##### import random
import multiprocessing as mp

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, ShuffleSplit, LeavePOut, KFold, ParameterGrid

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.model_selection import GridSearchCV, RandomizedSearchCV



###########################################################
# HumachLab_ML_CLassifiers     



# ### All models' implementation

class HumachLab_ML_CLassifiers:
    
    def print_message(self):
#         ---------------------------------------------------------------------------------------------------
#         ===================================================================================================
#         ###################################################################################################
#         ***************************************************************************************************
        self.logger.info(f"Hello from HumachLab_ML_CLassifiers class")
    
    
    
    def __init__(self, logger, directory, dataset, class_name, label_map, metadata_column, split_column, random_state_value, split_balance_pattern, check_result=False): 
        self.logger = logger 
        self.directory = directory
        self.dataset = dataset 
        self.class_name = class_name
        self.label_map = label_map
        self.metadata_column = metadata_column
        self.split_column = split_column 
        self.is_multiclass = True if len(dataset[class_name].unique().tolist())>2 else False
        self.random_state_value = random_state_value
        self.split_balance_pattern = split_balance_pattern
        if not check_result:
            self.experiment_info = {
                'logger':logger, 'directory':directory, 'dataset_size':dataset.shape, 'dataset_columns':dataset.columns.values.tolist(), 'metadata_column':metadata_column, 
                'class_name':class_name, 'label_map':label_map, 'split_column':split_column, 'total_unique_classes':dataset[class_name].value_counts().keys().tolist(), 
                'total_unique_classes':dataset[class_name].value_counts().values.tolist()
                }
        
        self.best_model_scoring_metrics=[ML_Performace_Metrics.RECL, ML_Performace_Metrics.F1SCR] 
        
        self.logger.info(f"""
        Object is initialised with the following properties: 
        ###################################################################################################
        Dataset size: {self.dataset.shape}, Columns: {self.dataset.columns.values.tolist()}
        Target class column name: {self.class_name}
        Metadata column names: {self.metadata_column}
        Dataset split column on which the training and test sets will be devided: {self.split_column}
        Is multi-class classification: {self.is_multiclass}
        """) 
        return  
    
    
    def convert_list_to_string(self, lst):
        lst = [str(l) for l in lst]        
        return '* '.join(lst) 
    
    
    
    def classify(self, should_use_params, splitting_crieteria, model_list, is_validate_models, result_save_path, exp_name, exp_detail, apply_feature_selection, custom_splitter):
        self.splitting_crieteria = splitting_crieteria    ### for test & training (validation) splitting_crieteria (n): n=0 -loso, n>0 -n-fold, n<0 -shuffled random splitting with n% testing
        self.model_list = model_list 
        self.should_use_params = should_use_params
        self.is_validate_models = is_validate_models 
        self.result_save_path = result_save_path 
        self.exp_name = exp_name 
        self.exp_detail = exp_detail 
        self.is_binary_classification = not self.is_multiclass
        self.apply_feature_selection = apply_feature_selection 
        self.selected_features = None 
        self.custom_splitter = custom_splitter
        
        # self.experiment_info['exp_name'] = exp_name 
        self.experiment_info.update(exp_detail)
        self.experiment_info['apply_feature_selection'] = apply_feature_selection
        #self.experiment_info['selected_features'] = self.selected_features 
        self.experiment_info['is_multiclass_classification'] = self.is_multiclass
        self.experiment_info['model_list'] = model_list 
        self.experiment_info['should_use_params'] = should_use_params
        self.experiment_info['is_validate_models'] = is_validate_models
        self.experiment_info['result_save_path'] = result_save_path
        self.experiment_info['random_state_value'] = self.random_state_value 
        self.experiment_info['custom_splitter'] = self.custom_splitter
        self.experiment_info['split_balance_pattern'] = self.split_balance_pattern
        tmp = splitting_crieteria[0] 
        self.experiment_info['test_split_crieteria'] = tmp 
        self.experiment_info['test_split_details'] = f'Leave-one-out' if tmp[0]==0 else (f'{tmp[0]}-fold cross validation' if (tmp[0]>0 and tmp[1]<=0) else f'{tmp[0]}-fold {tmp[1]}% random test splitting') 
        tmp = splitting_crieteria[1] 
        self.experiment_info['training_split_crieteria'] = tmp 
        self.experiment_info['training_split_details'] = f'Leave-one-out' if tmp[0]==0 else (f'{tmp[0]}-fold cross validation' if (tmp[0]>0 and tmp[1]<=0) else f'{tmp[0]}-fold {tmp[1]}% random test splitting') 
        self.experiment_info['model_selection_matrics'] = self.best_model_scoring_metrics 
                                                                                                  
                                                                                                                      
        self.logger.info(f"""
        Classification is set with the following parameters: 
        ###################################################################################################
        Splitting crieteria: {self.splitting_crieteria}
        Test split: {f'Leave-one-out' if splitting_crieteria[0] [0]==0 else (f'{splitting_crieteria[0] [0]}-fold cross validation' if (splitting_crieteria[0] [0]>0 and splitting_crieteria[0] [1]<=0) else f'{splitting_crieteria[0] [0]}-fold {splitting_crieteria[0] [1]}% random test splitting') }
        Training split: {f'Leave-one-out' if splitting_crieteria[1] [0]==0 else (f'{splitting_crieteria[1] [0]}-fold cross validation' if (splitting_crieteria[1] [0]>0 and splitting_crieteria[1] [1]<=0) else f'{splitting_crieteria[1] [0]}-fold {splitting_crieteria[1] [1]}% random test splitting') }
        List of ML models that will be applied: {[mn.value for mn in self.model_list]}
        Use parameters for model: {self.should_use_params}
        Is validate the model (or only train): {self.is_validate_models} 
        Classification results will be saved in the directory: {self.result_save_path}
        """) 
        all_exp_info_df = pd.DataFrame(self.experiment_info.items(), columns=['Information', 'Description']) 
        all_best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df  = self.test() 
        
        self.save_results(self.directory, all_best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df, all_exp_info_df) 
        
        return all_best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df, all_exp_info_df 
    
    
    
    def load_results(self, save_directory):
        # './Results/_Classification/ML001/'
        self.logger.info(f"""
        Data is being loaded from: {save_directory}
        """) 
        save_path = f"{save_directory}all_tr_scores.csv" 
        all_tr_scores_df = pd.read_csv(save_path) 
        
        save_path = f"{save_directory}all_tr_prediction.csv" 
        all_tr_prediction_df = pd.read_csv(save_path) 
        
        save_path = f"{save_directory}all_ts_scores.csv" 
        all_ts_scores_df = pd.read_csv(save_path) 
        
        save_path = f"{save_directory}all_ts_prediction.csv" 
        all_ts_prediction_df = pd.read_csv(save_path)         
        
        save_path = f"{save_directory}all_ts_fold_info.csv" 
        all_ts_fold_info_df = pd.read_csv(save_path)         
        
        save_path = f"{save_directory}all_exp_info.csv" 
        all_exp_info_df = pd.read_csv(save_path)  
        
        new_save_directory = f"{save_directory}/Models/"
        
        save_path = f"{new_save_directory}ts_model" 
        all_ts_model = self.load_models_from_file(save_path, 'Test Models')
        
        save_path = f"{new_save_directory}tr_model" 
        all_tr_model = self.load_models_from_file(save_path, 'Training Models')
        
        save_path = f"{new_save_directory}best_tr_model" 
        all_best_tr_model = self.load_models_from_file(save_path, 'Best Training Models')
        
        return all_best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df, all_exp_info_df 


    def load_models_from_file(self, save_path, model_type):
        models_dict = {} 
        
        save_path = f'{save_path}*'
        files = self.sort_string_list(glob.glob(save_path)) 
        files
        selected_files = [[int(fn) for fn in f[len(save_path):].split('.')[0].split('_')] for f in files]
        selected_files
        
        self.logger.info(f'Start retrieving {model_type} model from file...')
        model_dict = {}  
        for i, (ind, fl) in enumerate(zip(selected_files, files)):
            if len(ind)==3:
                tsi, tri, modi = ind
                mod = None 
                with open(fl, "rb") as f:
                    mod = pickle.load(f) 
                if tsi not in model_dict.keys(): 
                    model_dict[tsi] = {}
                if tri not in model_dict[tsi].keys(): 
                    model_dict[tsi][tri] = {} 
                if modi not in model_dict[tsi][tri].keys(): 
                    model_dict[tsi][tri][modi] = mod 
            elif len(ind)==2:
                tsi, modi = ind
                mod = None 
                with open(fl, "rb") as f:
                    mod = pickle.load(f) 
                if tsi not in model_dict.keys(): 
                    model_dict[tsi] = {}
                if modi not in model_dict[tsi].keys(): 
                    model_dict[tsi][modi] = mod 
            else:
                self.logger.info(f'Doesn\'t identify {model_type} model file to retrieve...')
        
        model_dict
        self.logger.info(f'Finish retrieving {model_type} model from file...')
        return model_dict 
    
    
    
    def sort_string_list(self, string_list):
        ## ref: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/
        """ Sort the given list in the way that humans expect.
        """
        convert = lambda text: int(text) if text.isdigit() else text
        alphanum_key = lambda key: [ convert(c.replace("_","")) for c in re.split('([0-9]+)', key) ]
        string_list.sort( key=alphanum_key )
        return string_list
    
    
        
    def save_results(self, save_directory, all_best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df, all_exp_info_df):
        # './Results/_Classification/ML001/'
        save_path = f"{save_directory}all_tr_scores.csv" 
        all_tr_scores_df.to_csv(save_path, index=False) 
        
        save_path = f"{save_directory}all_tr_prediction.csv" 
        all_tr_prediction_df.to_csv(save_path, index=False) 
        
        save_path = f"{save_directory}all_ts_scores.csv" 
        all_ts_scores_df.to_csv(save_path, index=False) 
        
        save_path = f"{save_directory}all_ts_prediction.csv" 
        all_ts_prediction_df.to_csv(save_path, index=False) 
        
        save_path = f"{save_directory}all_ts_fold_info.csv" 
        all_ts_fold_info_df.to_csv(save_path, index=False) 
        
        save_path = f"{save_directory}all_exp_info.csv" 
        all_exp_info_df.to_csv(save_path, index=False) 
        
        new_save_directory = self.create_directory(save_directory, 'Models')
        
        save_path = f"{new_save_directory}ts_model" 
        self.save_models_to_file(save_path, all_ts_model, 'Test Models')
        
        save_path = f"{new_save_directory}tr_model" 
        self.save_models_to_file(save_path, all_tr_model, 'Training Models')
        
        save_path = f"{new_save_directory}best_tr_model" 
        self.save_models_to_file(save_path, all_best_tr_model, 'Best Training Models')
        
        return
    
    
    def create_directory(self, path, dir_name): 
        new_directory = f"{path}/{dir_name}/"
        
        if (not os.path.exists(new_directory)):
            try:
                os.makedirs(new_directory, exist_ok = True)
                print(f"Directory successfully created at path: {new_directory}") 
            except OSError as error:
                print(f"Directory cannot be created at path: {new_directory}") 
        else:
            print(f"Directory already exists at path: {new_directory}") 
            
        return new_directory


    def save_models_to_file(self, save_path, models_dict, model_type):
        self.logger.info(f'Start saving model to file...')
        if model_type=='Training Models':
            for tsfi, ts_dat in models_dict.items():
                for trfi, tr_dat in ts_dat.items():
                    for modi, mod in tr_dat.items():
                        new_save_path = f'{save_path}_{tsfi}_{trfi}_{modi}.dat'
                        try:
                            with open(new_save_path, 'wb') as f:
                                pickle.dump(mod, f)
                                print( f'{model_type} is written to the file: {new_save_path}\n' )
                        except:
                            print( f'Problem creating {model_type} file: {new_save_path}\n' )
        else:
            for tsfi, ts_dat in models_dict.items():
                for modi, mod in ts_dat.items():
                    new_save_path = f'{save_path}_{tsfi}_{modi}.dat'
                    try:
                        with open(new_save_path, 'wb') as f:
                            pickle.dump(mod, f)
                            print( f'{model_type} is written to the file: {new_save_path}\n' )
                    except:
                        print( f'Problem creating {model_type} file: {new_save_path}\n' )
        self.logger.info(f'Finish saving model to file...')
        return
    
    
    
    def evaluate_test_data(self, ind, model_list, test_ids, best_tr_model):
        self.logger.info(f"""
        ### MODEL EVALUATION PHASE 
        EVALUATION {ind} START... XXXXX 
        ---------------------------------------------------------------------------------------------------
        """) 
        #print("HHHHHH3333", best_tr_model)
        
        X_test, y_test, meta_dat_df = self._get_data_from_indices(test_ids)
        #print("meta33->", meta_dat_df)     
#         ### Call features selection algorithm 
#         if self.selected_features: 
#             X_test = X_test[self.selected_features]
        
        
        ts_score_df, ts_prediction_df = pd.DataFrame(), pd.DataFrame() 
        # all_ts_model, all_ts_scores_df, all_ts_prediction_df = {}, pd.DataFrame(), pd.DataFrame()
        for (modi, model), classifier_method in zip(best_tr_model.items(), model_list) : 
            y_pred = model.predict(X_test) 
            y_pred_proba = model.predict_proba(X_test) 
            if modi==1:
                # meta_dat_df.reset_index(drop=True, inplace=True) 
                ts_prediction_df = pd.concat([ts_prediction_df, meta_dat_df]) 
                ts_prediction_df.reset_index(drop=True, inplace=True) 
                ts_prediction_df[self.class_name] = y_test 
                
            ts_prediction_df[f"Prediction_{ML_Classifiers.get_short_form(str(classifier_method.value))}"] = y_pred 
            for p_ind in range(y_pred_proba.shape[1]):
                ts_prediction_df[f"Prediction_Proba_{p_ind}_{ML_Classifiers.get_short_form(str(classifier_method.value))}"] = y_pred_proba[:, p_ind].tolist()
            
            scores_df = self.calculate_model_scores(model, y_test, y_pred, y_pred_proba)
            scores_df.insert(0, "Model_No", modi) 
            ts_score_df = pd.concat([ts_score_df, scores_df]) 
            
        ts_score_df.insert(0, "Test_No", ind) 
        ts_prediction_df.insert(0, "Test_No", ind) 
        return ts_score_df, ts_prediction_df 
    
    
    
    def test(self):
        # splitter = self.get_data_splitter(self.splitting_crieteria[0], stratified=True, custom=True) #Test split cusomised: usually LOSO or 10-fold 
        # splitter = self.get_data_splitter(self.splitting_crieteria[0], stratified=True, custom=False) #Test split: usually LOSO or 10-fold 
        splitter = self.get_data_splitter(self.splitting_crieteria[0], stratified=True, custom=self.custom_splitter) #Test split cusomised: usually LOSO or 10-fold 
        split_data_list = self.dataset[self.split_column].values.tolist() 
        class_data_list = self.dataset[self.class_name].values.tolist() 
        
        all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df = {}, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()  
        cum_best_tr_model, cum_tr_model, cum_tr_scores_df, cum_tr_prediction_df, cum_tr_fold_info_df = {}, {}, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
        for tsi, (train_all_ids, test_ids) in enumerate(splitter.split(split_data_list, class_data_list)): 
            ### Train-test split: fold based 
            ts_dat = self.dataset[self.split_column][test_ids].values.tolist() 
            tr_all_dat = self.dataset[self.split_column][train_all_ids].values.tolist() 
            ind = tsi+1 
            self.selected_features = None ### Resetting feature selection list 
            self.logger.info(f"""
            ### MODEL TEST PHASE 
            TEST {ind} START... XXXXX 
            ===================================================================================================
            Test=> {len(ts_dat)} {(ts_dat)} 
            Training (Including Validation)=> {len(tr_all_dat)} {(tr_all_dat)} 
            """) 
            best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_tr_fold_info_df = self.train(ind, model_list, train_all_ids, tr_splitting_crieteria=self.splitting_crieteria[1]) #tr_splitting_crieteria: <0 random split, >0 fold
            cum_tr_model[ind] = all_tr_model 
            cum_best_tr_model[ind] = best_tr_model 
            if self.is_validate_models:
                all_tr_scores_df.insert(0, "Test_No", ind) 
                all_tr_prediction_df.insert(0, "Test_No", ind) 
            all_tr_fold_info_df.insert(0, "Test_No", ind) 
            all_tr_fold_info_df.insert(4, "Test", [ts_dat]*all_tr_fold_info_df.shape[0]) 
            all_tr_fold_info_df.insert(4, "Selected_Features", [self.selected_features]*all_tr_fold_info_df.shape[0]) 
            
            cum_tr_scores_df = pd.concat([cum_tr_scores_df, all_tr_scores_df]) 
            cum_tr_prediction_df = pd.concat([cum_tr_prediction_df, all_tr_prediction_df])
            all_ts_fold_info_df = pd.concat([all_ts_fold_info_df, all_tr_fold_info_df])
            
            cum_tr_scores_df.reset_index(drop=True, inplace=True) 
            cum_tr_prediction_df.reset_index(drop=True, inplace=True) 
            all_ts_fold_info_df.reset_index(drop=True, inplace=True) 
            
#             print("TTTT", best_tr_model.keys(), best_tr_model, all_tr_scores_df.shape, all_tr_scores_df.columns, all_tr_prediction_df.shape, all_tr_prediction_df.columns) 
                        
            ###############
            ### Model evaluation with the test data using the best trained model  
            all_ts_model[ind] = best_tr_model
            ts_score_df, ts_prediction_df = self.evaluate_test_data(ind, model_list, test_ids, best_tr_model) 
            
            all_ts_scores_df = pd.concat([all_ts_scores_df, ts_score_df]) 
            all_ts_prediction_df = pd.concat([all_ts_prediction_df, ts_prediction_df])
            
            all_ts_scores_df.reset_index(drop=True, inplace=True) 
            all_ts_prediction_df.reset_index(drop=True, inplace=True) 
        
            self.logger.info(f"""
            ===================================================================================================
            TEST {ind} END...
            """) 
            
        ### Sorting scores
#             print( 'TTRR', cum_tr_scores_df.columns.values.tolist(), cum_tr_prediction_df.columns.values.tolist() )
        cum_tr_scores_df.sort_values(['Model_No', 'Test_No', 'Training_No'], ascending = [True, True, True], inplace=True)  
        cum_tr_prediction_df.sort_values(['Test_No', 'Training_No'], ascending = [True, True], inplace=True)  

#             print( 'TTSS', all_ts_scores_df.columns.values.tolist(), all_ts_prediction_df.columns.values.tolist() )
        all_ts_scores_df.sort_values(['Model_No', 'Test_No'], ascending = [True, True], inplace=True) 
        all_ts_prediction_df.sort_values(['Test_No'], ascending = [True], inplace=True) 
        all_ts_fold_info_df.sort_values(['Model_No', 'Test_No', 'Training_No'], ascending = [True, True, True], inplace=True) 
        
        all_ts_fold_info_df
        
        return cum_best_tr_model, cum_tr_model, cum_tr_scores_df, cum_tr_prediction_df, all_ts_model, all_ts_scores_df, all_ts_prediction_df, all_ts_fold_info_df  
        
    
    
    
    def train(self, tsi, model_list, train_all_ids, tr_splitting_crieteria):   
        # tr_splitter = self.get_data_splitter(tr_splitting_crieteria, stratified=True, custom=True) #Training split customised: usually 5-fold or Random 20% split
        tr_splitter = self.get_data_splitter(tr_splitting_crieteria, stratified=True, custom=False) #Training split: usually 5-fold or Random 20% split
        all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_tr_fold_info_df = {}, pd.DataFrame(), pd.DataFrame(), pd.DataFrame()  
        best_tr_model, best_acc, best_rec, best_prec = {}, [], [], []  
        class_data_list = self.dataset.iloc[train_all_ids][self.class_name].values.tolist() 
        split_data_list = self.dataset.iloc[train_all_ids][self.split_column].values.tolist() 
        # print('QQQQQQQQQQQ', split_data_list, class_data_list) 
        for tri, (train_ids, val_ids) in enumerate(tr_splitter.split(split_data_list, class_data_list)): 
        # for tri, (train_ids, val_ids) in enumerate(tr_splitter.split(split_data_list)): 
            ### Validation-train split: random percentage based 
            val_dat = self.dataset[self.split_column][val_ids].values.tolist() 
            tr_dat = self.dataset[self.split_column][train_ids].values.tolist() 
            ind = tri+1 
            self.logger.info(f"""
            *** MODEL TRAINING AND VALIDATION PHASE FOR TEST {tsi} 
            TRAINING {tri+1} START... XXXXX 
            ***************************************************************************************************
            Validation=> {len(val_dat)} {(val_dat)} 
            Training=> {len(tr_dat)} {(tr_dat)} 
            """)            
            all_mtr_model, all_mtr_scores_df, all_mtr_prediction_df, all_mtr_fold_info_df = self.train_models(model_list, train_ids, val_ids, tsi, ind)            
            all_tr_model[ind] = all_mtr_model 
            if self.is_validate_models:
                all_mtr_scores_df.insert(0, "Training_No", ind) 
                all_mtr_prediction_df.insert(0, "Training_No", ind) 
            all_mtr_fold_info_df.insert(0, "Training_No", ind) 
                
            all_tr_scores_df = pd.concat([all_tr_scores_df, all_mtr_scores_df]) 
            all_tr_prediction_df = pd.concat([all_tr_prediction_df, all_mtr_prediction_df])
            all_tr_fold_info_df = pd.concat([all_tr_fold_info_df, all_mtr_fold_info_df])
            
            all_tr_scores_df.reset_index(drop=True, inplace=True) 
            all_tr_scores_df.reset_index(drop=True, inplace=True)    
            all_tr_fold_info_df.reset_index(drop=True, inplace=True)          
            
            self.logger.info(f"""
            ---------------------------------------------------------------------------------------------------
            Best model index calculation  
            """)             
#             print("PPPPP", tri, ind, all_mtr_model.keys(), all_mtr_model, all_mtr_scores_df)
            if tri==0:
                best_tr_model = all_mtr_model.copy() 
                #print("HHHH", all_mtr_scores_df.columns)
                # best_acc, best_prec, best_rec = all_mtr_scores_df[ML_Performace_Metrics.ACC.value], all_mtr_scores_df[ML_Performace_Metrics.PREC.value], all_mtr_scores_df[ML_Performace_Metrics.RECL.value] 
                best_rec = all_mtr_scores_df[ML_Performace_Metrics.RECL.value].values.tolist()  
            else:                
                for jj, mn in enumerate(model_list):
                    mod_name = ML_Classifiers.get_short_form(str(mn.value))
                    tm_df = all_mtr_scores_df[(all_mtr_scores_df["Model_Name"]==mod_name)] 
                    new = tm_df[ML_Performace_Metrics.RECL.value].values.tolist()[0] 
                    if new>best_rec[jj]: 
                        best_rec[jj] = new 
                        best_tr_model[tm_df["Model_No"].values.tolist()[0]] = all_mtr_model[tm_df["Model_No"].values.tolist()[0]]  
                        
            
            self.logger.info(f"""
            ***************************************************************************************************
            TRAINING {ind} END... 
            """) 
        return best_tr_model, all_tr_model, all_tr_scores_df, all_tr_prediction_df, all_tr_fold_info_df 
    
    
    
    def train_models(self, model_list, train_ids, val_ids, ts_serial, tr_serial):
        all_mtr_model, all_mtr_scores_df, all_mtr_prediction_df, all_mtr_fold_info_df = {}, pd.DataFrame(), pd.DataFrame(), pd.DataFrame() 
        for modi, classifier_method in enumerate(model_list): 
            ind = modi+1 
            self.logger.info(f"""
            *** ML MODEL FOR TEST:{ts_serial}, TRAINING:{tr_serial} AND MODEL: {ML_Classifiers.get_short_form(str(classifier_method.value))} 
            ---------------------------------------------------------------------------------------------------
            """)
            mtr_model, mtr_scores_df, mtr_prediction_df, mtr_fold_info_df = self.start_training(classifier_method, train_ids, val_ids, tr_serial=tr_serial) 
            #print("HELLO2222", mtr_model, mtr_scores_df, mtr_prediction_df) 
            all_mtr_model[ind] = mtr_model 
            #all_mtr_model["Model_Name"] = ML_Classifiers.get_short_form(str(classifier_method.value))  
            if self.is_validate_models:                   
                if modi>0:
                    mtr_prediction_df.drop(self.metadata_column, axis=1, inplace=True)
                    mtr_prediction_df.drop([self.class_name], axis=1, inplace=True)                
                               
                mtr_scores_df.insert(0, "Model_No", ind) 
                mtr_scores_df.insert(1, "Model_Name", ML_Classifiers.get_short_form(str(classifier_method.value)))  
#                 mtr_scores_df.insert(2, "Selected_Features", [self.selected_features]*mtr_scores_df.shape[0]) 
                    
                all_mtr_scores_df = pd.concat([all_mtr_scores_df, mtr_scores_df]) 
                all_mtr_prediction_df = pd.concat([all_mtr_prediction_df, mtr_prediction_df], axis=1) 
                
                all_mtr_scores_df.reset_index(drop=True, inplace=True) 
                all_mtr_prediction_df.reset_index(drop=True, inplace=True)

            mtr_fold_info_df.insert(0, "Model_No", ind)    
            mtr_fold_info_df.insert(1, "Model_Name", ML_Classifiers.get_short_form(str(classifier_method.value))) 
            all_mtr_fold_info_df = pd.concat([all_mtr_fold_info_df, mtr_fold_info_df]) 
            all_mtr_fold_info_df.reset_index(drop=True, inplace=True)  
                    
        #print("HELLO", all_mtr_model, all_mtr_scores_df, all_mtr_prediction_df) 
        return all_mtr_model, all_mtr_scores_df, all_mtr_prediction_df, all_mtr_fold_info_df
    
    
    
    def get_data_splitter(self, split_crit_tuple, stratified=False, custom=False):
        
        if custom:
            self.logger.info(f"Custom splitter testing...") 
#             splitter = MyCustomSplitter(n_splits=5)
#             splitter.set_criteria(split_crit_tuple) 
            splitter = MyCustomSplitter(split_crit_tuple, groups=self.split_balance_pattern) ### , groups=[['n'], ['SC', 'ST']] for binary and [['n'], ['SC', 'ST']] for multi-class 
            return splitter
        
        spl_rand = self.random_state_value ##random.randint(1, 1000)
        splitter = None
        split_crit = split_crit_tuple[0] ### Fold 
        split_perc = split_crit_tuple[1] ### Fold 
        
        if split_crit==0:
            self.logger.info(f"Leave-one-subject-out testing...") 
            split_num = 1 
            splitter = LeavePOut(p=split_num) if stratified else LeavePOut(p=split_num) 
            # splitter = StratifiedLeavePOut(p=split_num) if stratified else LeavePOut(p=split_num) 
            # splitter = LeavePOut(p=split_num) 
            # splitter = StratifiedLeavePOut(p=split_num) #Stratified
        elif split_crit>0:
            if split_perc<=0:
                self.logger.info(f"{split_crit}-fold testing") 
                split_num = 5
                if split_crit != split_num:
                    split_num = split_crit 
                splitter = StratifiedKFold(n_splits=split_num, shuffle=False) if stratified else KFold(n_splits=split_num, shuffle=False) 
                # splitter = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=spl_rand) if stratified else KFold(n_splits=split_num, shuffle=True, random_state=spl_rand) 
                # splitter = KFold(n_splits=split_num, shuffle=True, random_state=spl_rand)
                # splitter = StratifiedKFold(n_splits=split_num, shuffle=True, random_state=spl_rand)
                #splitter = KFold(n_splits=split_num, random_state=spl_rand)
                #splitter = KFold(n_splits=split_num)
            else:
                split_num = split_crit 
                split_ratio = split_perc 
                self.logger.info(f"Random {split_ratio} percentage splitting testing...") 
                splitter = StratifiedShuffleSplit(n_splits=split_num, test_size=split_ratio, random_state=spl_rand) if stratified else ShuffleSplit(n_splits=split_num, test_size=split_ratio, random_state=spl_rand) 
                # splitter = ShuffleSplit(n_splits=split_num, test_size=split_ratio, random_state=spl_rand) #rs
                # splitter = StratifiedShuffleSplit(n_splits=split_num, test_size=split_ratio, random_state=spl_rand) #rs
                # splitter = ShuffleSplit(n_splits=split_num, test_size=split_ratio) #rs
        else:
            self.logger.info(f"Problem with the splitting with the splitting criteria {split_crit_tuple}...") 
            
        # self.data_splitter = splitter 
        return splitter 
    
    

    def start_training(self, classifier_method, train_ids, val_ids, tr_serial):
        parameters = self.get_parameters_for_ml_models(classifier_method) 
        print("Parameters: ", parameters)
        model, model_scores, target_and_prediction = None, None, None
        model, model_scores, target_and_prediction, fold_info_df = self.call_all_model_optimization(classifier_method, parameters, train_ids, val_ids, tr_serial, parameter_optimization=1)
        return model, model_scores, target_and_prediction, fold_info_df 


    
    def _get_data_from_indices(self, indices, from_training=False):
        data = copy.deepcopy(self.dataset).iloc[indices] 
        ### Downsample he raining data: 1=down, 2=up, 3=bound sampling
        #if from_training:
        #    self.logger.info(f'Resampling training data...')
        #    data = self.preprocessor.get_resamplled_data(data, self.class_name, self.pat_id_col, random_sampling=True, up_or_down_sampling=1, min_scale=2.0, max_scale=3.0) ## 0-no, 1-down, 2-up, 3-bound
                    
        self.logger.info(f"""
        From training? {from_training}, Data shape: {data.shape}, Indices: {indices}
        All Columns: {data.columns.values.tolist()}
        """) 
        
        target = data[self.class_name] 
        metadata_df = data[self.metadata_column] 
        features = data.drop([self.class_name]+self.metadata_column, axis=1) 
        
        ### Call features selection algorithm 
        if self.apply_feature_selection and from_training and self.selected_features is None: 
            self.logger.info(f""""
            Here comes to feature selection...
            """) 
            selected_feats_list = self.select_appropriate_features(features, target, num_features=None, selection_criteria={'auc':0.7}) ### selection_criteria=None/{'auc':0.7} #'corr', 'p', 'auc', 'pandauc' 
            self.selected_features = selected_feats_list.copy() 
            self.logger.info(f""""
            Selected features: {self.selected_features}
            """) 
        elif from_training and self.selected_features is None: 
            self.selected_features = features.columns.values.tolist() 
            self.logger.info(f""""
            Selected features: {self.selected_features}
            """) 
            
        if self.selected_features is not None:
            # features = features.loc[:, ~features.columns.isin(self.selected_features) ]
            features = features.loc[:, features.columns.isin(self.selected_features) ]
            
        self.logger.info(f"""
        Feature shape: {features.shape}, Target shape: {target.shape}, Metadata: {metadata_df.shape} 
        """) 
        
        target = target.values.tolist() 
        features = features.values 
        
        return features, target, metadata_df  
    
    
    
    def select_appropriate_features(self, X_dat, y_dat, num_features=None, selection_criteria=None): ### selection_criteria={'auc':0.7} #'corr', 'p', 'auc', 'pandauc' 
        selected_features = [] 
        crit_name = selection_criteria.keys() 
        crit_name = list(crit_name)[0] 
        dpp_obj = DataPreprocessor() 
        
        if crit_name=='corr':
            pass
        elif crit_name=='p':
            selected_features = dpp_obj.select_pandauc_based_features(X_dat, y_dat, binary_class=self.is_binary_classification, p_threshold=0.05, auc_threshold=None, sort=True) 
        elif crit_name=='auc':
            selected_features = dpp_obj.select_pandauc_based_features(X_dat, y_dat, binary_class=self.is_binary_classification, p_threshold=None, auc_threshold=0.7, sort=True) 
            pass
        elif crit_name=='pandauc':
            selected_features = dpp_obj.select_pandauc_based_features(X_dat, y_dat, binary_class=self.is_binary_classification, p_threshold=0.05, auc_threshold=0.7, sort=True) 
            
        if num_features:
            selected_features = selected_features[:num_features]
        
        return selected_features 
    
    

    def run_model_gridSearch(self, classifier_method, params, train_ids, val_ids, tr_serial):
        tmp_train_ids, tmp_val_ids = train_ids.copy(), val_ids.copy()  
        should_validate = self.is_validate_models 
        # should_validate = True 
        
        if not should_validate:
            # tmp_train_ids.extend(tmp_val_ids) 
            tmp_train_ids = np.concatenate((tmp_train_ids, tmp_val_ids)) 
        
        ### Validation-train split: random percentage based 
        val_dat = self.dataset[self.split_column][val_ids].values.tolist() 
        tr_dat = self.dataset[self.split_column][train_ids].values.tolist() 
        # print('HHHHHH', val_dat, tr_dat) 
        fold_info_df = pd.DataFrame([[val_dat, tr_dat]], columns=['Validation', 'Training']) 
        
        X_train, y_train, _ = self._get_data_from_indices(tmp_train_ids, from_training=True) 
        
        mods = self.get_ml_model_instances(classifier_method)
        self.logger.info(f"""
        GridSearch: {ML_Classifiers.get_short_form(str(classifier_method.value))} - {params} 
        ---------------------------------------------------------------------------------------------------
        """)
        parameters = {}
        model = mods
        model_scores = None
        if self.should_use_params:
            parameters = params

        scoring, refit = self.get_ml_scoring_metrices(self.best_model_scoring_metrics[0]) 
        scoring, refit = 'f1', True
        self.logger.info(f"""Refitting the model with best parameter {scoring} == {refit}""")
        
        model = GridSearchCV(mods, parameters, scoring=scoring, cv=5, refit=refit, return_train_score=True, n_jobs=50, verbose=2)
        # model = GridSearchCV(mods, parameters, scoring=scoring, cv=5, refit=refit, return_train_score=True, error_score='raise', n_jobs=50, verbose=2)
        # model = GridSearchCV(mods, parameters, scoring=scoring, cv=5, refit=refit, return_train_score=True, n_jobs=mp.cpu_count(), verbose=2)

        # ### Scoring from custom method
        # score = make_scorer(self.custom_precision_func, greater_is_better=False)
        # # scoring = {'precision': score, 'f1':make_scorer(f1_score)}
        # model = GridSearchCV(mods, parameters, scoring=score, cv=self.cross_validation_rounds, refit=refit, return_train_score=True, n_jobs=-1, verbose=2)
        
        # X_train = np.nan_to_num(X_train)
        model = model.fit(X_train, y_train) 
        mod = copy.deepcopy(model) 
        mod_est = model.best_estimator_ 
        mod_par = model.best_params_
        
        # print('KKKKKKKKK-->>>', model, mod_est, mod_par)
        model_scores = None 
        target_and_prediction_df = pd.DataFrame() 
        
#         ### Rebuild the model with best parameter         
#         # if should_validate:
#         bst_parameters = model.best_params_
#         self.logger.info(f"""Refitting the model with best parameter""")
#         # mod = mod.set_params(**bst_parameters)
#         mod = mod.best_estimator_.set_params(**bst_parameters)
#         tmp_train_ids2 = np.concatenate((train_ids.copy(), val_ids.copy()))
#         # X_train2, y_train2, _ = self._get_data_from_indices(tmp_train_ids2, from_training=True)  
#         X_train2, y_train2, _ = self._get_data_from_indices(tmp_train_ids2, from_training=False)    
#         mod = mod.fit(X_train2, y_train2)
            
        X_val, y_val, meta_dat = self._get_data_from_indices(val_ids)   
        y_pred = mod.predict(X_val)  
        y_pred_proba = mod.predict_proba(X_val) 
        target_and_prediction_df.reset_index(drop=True, inplace=True) 
        meta_dat.reset_index(drop=True, inplace=True) 
        target_and_prediction_df = pd.concat([target_and_prediction_df, meta_dat]) 
        target_and_prediction_df[self.class_name] = y_val 
        # target_and_prediction_df[f"Prediction_{str(model.__class__.__name__)}"] = y_pred 
        target_and_prediction_df[f"Prediction_{ML_Classifiers.get_short_form(str(classifier_method.value))}"] = y_pred  
        for p_ind in range(y_pred_proba.shape[1]):
            target_and_prediction_df[f"Prediction_Proba_{p_ind}_{ML_Classifiers.get_short_form(str(classifier_method.value))}"] = y_pred_proba[:, p_ind].tolist()
            
        model.feature_names = self.selected_features 
        model_scores = self.calculate_model_scores(model, y_val, y_pred, y_pred_proba) 

        self.logger.info(f"""
        Best model (GriveSearchCV): {model} 
        Best model: {mod} 
        Best estimator of the model: {mod_est} 
        Best parameters of the model: {mod_par} 
        ---------------------------------------------------------------------------------------------------
        """)

        return model, model_scores, target_and_prediction_df, fold_info_df 
        # return model, model_scores, target_and_prediction_df, fold_info_df


    def call_all_model_optimization(self, classifier_method, parameters, train_ids, val_ids, tr_serial, parameter_optimization):
        model, model_scores, target_and_prediction, fold_info_df = None, None, None, None 
        if parameter_optimization == 1:
            model, model_scores, target_and_prediction, fold_info_df = self.run_model_gridSearch(classifier_method, parameters, train_ids, val_ids, tr_serial)
        elif parameter_optimization == 2:
            model, model_scores, target_and_prediction, fold_info_df = self.run_model_randomizedSearch(classifier_method, parameters, train_ids, val_ids, tr_serial)
        elif parameter_optimization == 3:
            model, model_scores, target_and_prediction, fold_info_df = self.run_model_baysianSearch(classifier_method, parameters, train_ids, val_ids, tr_serial)
        elif parameter_optimization == 4:
            model, model_scores, target_and_prediction, fold_info_df = self.run_model_customGridSearch(classifier_method, parameters, train_ids, val_ids, tr_serial)
        return model, model_scores, target_and_prediction, fold_info_df


    def get_ml_model_instances(self, classifier_method, parameters=None):
        classifier = None

        ### GPU code START
        global GPUs
        global HAS_GPU

        # GPUs = GPUtil.getGPUs()
        # tot_gpus = len(GPUs)
        # HAS_GPU = True if len(GPUs) > 0 else False
        # avl_GPUIDs = GPUtil.getAvailable(order = 'first', limit = tot_gpus, maxLoad = 0.5, maxMemory = 0.5, includeNan=False, excludeID=[], excludeUUID=[])
        # tot_avl_gpus = len(avl_GPUIDs)
        # print(f'For GPU based tasks. There are {tot_gpus} GPUs in the system and {tot_avl_gpus} are available. \nAvailable GPU IDs are: {avl_GPUIDs}')
        allGPUs, bestGPU = HumachLab_Global.get_gpu_details(show_logs=False)
        ### GPU code END

        # ####### rf #######
        # rf - random_forest classifier
        if classifier_method == ML_Classifiers.RF:
            classifier = RandomForestClassifier() if (parameters is None) else RandomForestClassifier(parameters)
        # ####### knn #######
        # knn - k_neares_neighbours classifier
        elif classifier_method == ML_Classifiers.kNN:
            classifier = KNeighborsClassifier() if (parameters is None) else KNeighborsClassifier(parameters)
        # ####### nb #######
        # knn - naieve bayes classifier
        elif classifier_method == ML_Classifiers.NB:
            classifier = GaussianNB() if (parameters is None) else GaussianNB(parameters)
        # ####### svm/svc #######
        # knn - support vector classifier
        elif classifier_method == ML_Classifiers.SVC:
            classifier = SVC() if (parameters is None) else SVC(parameters)
        # ####### knn #######
        # knn - k_neares_neighbours classifier
        elif classifier_method == ML_Classifiers.DT:
            classifier = DecisionTreeClassifier() if (parameters is None) else DecisionTreeClassifier(parameters)
        # ####### LogReg #######
        # LogReg - logistic regression classifier
        elif classifier_method == ML_Classifiers.LogReg:
            classifier = LogisticRegression() if (parameters is None) else LogisticRegression(parameters)
        # ####### GBoost #######
        # GBoost - gradient boosting classifier
        elif classifier_method == ML_Classifiers.GBoost:
            classifier = GradientBoostingClassifier() if (parameters is None) else GradientBoostingClassifier(parameters)

        ### GPU code - Comment it if no gpu available or not linux system or no support for RapidsAI package
        # ####### gpu-rf #######
        # gpu-rf - gpu-random_forest classifier
        # elif classifier_method == ML_Classifiers.GPURF and tot_avl_gpus>0:
        #     classifier = gpuRandomForestClassifier() if (parameters is None) else gpuRandomForestClassifier(parameters)

        # ####### None #######
        # No classifier
        else:
            self.logger.info(f'No classifier is selected...')

        # ####### ####### #######
        return classifier


    def get_ml_scoring_metrices(self, reft=None):
        model_scoring_mets = [ML_Performace_Metrics.ACC, ML_Performace_Metrics.PREC, ML_Performace_Metrics.RECL,
                              ML_Performace_Metrics.SEN, ML_Performace_Metrics.SPEC, ML_Performace_Metrics.FPR,
                              ML_Performace_Metrics.FNR, ML_Performace_Metrics.F1, ML_Performace_Metrics.ROC_AUC]

        scoring = [ML_Performace_Metrics.ACC.value]
        bst_mod_mets_1 = None
        i = 0
        for met in self.best_model_scoring_metrics:
            if i==0:
                scoring.clear()
                if (reft is not None):
                    if reft == ML_Performace_Metrics.F1SCR:
                        reft = ML_Performace_Metrics.F1
                    if (reft not in model_scoring_mets):
                        reft = None

            if met == ML_Performace_Metrics.F1SCR:
                met = ML_Performace_Metrics.F1

            if met in model_scoring_mets:
                scoring.append(met.value)
            i += 1

        refit = (scoring[0]) if reft is None else reft.value
        
#         scoring = [ML_Performace_Metrics.F1]
#         refit = True

        return scoring, refit


    ############################################################################
    def get_parameters_for_ml_models(self, classifier_method):
        parameters = {}
        if not self.should_use_params:
            return parameters

        # Parameter generation method name
        method_name = f'{str(classifier_method.value)}_parameters'

        try:
            method = getattr(self, method_name)
            # Call method for parameter generation
            self.logger.info(f'Calling method: {method_name}')
            parameters = method()
        except AttributeError:
            self.logger.warning(f'No such method exists with the name: {method_name}')
            raise NotImplementedError("Class `{}` does not implement `{}`".format(self.__class__.__name__, method_name))

        # ####### ####### #######
        return parameters




    ############################################################################
    def generate_parameter_dictionary(self, par_names, par_vals, par_ind):
        self.logger.info(f'All parameters: {par_names}, {par_vals}, {par_ind}')
        final_par_names = []
        par_dict = {}

#         for i in par_ind:
#             pn = par_names[i]
#             pv = par_vals[i]
#             exec(f'{pn}={pv}')
#             final_par_names.append(pn)
        
        sel_par = [pp for ii,pp in enumerate(par_names) if ii in par_ind] 
        for (pn, pv) in zip(sel_par, par_vals):
            exec(f'{pn}={pv}')
            final_par_names.append(pn)

        for par in final_par_names:
            par_dict[par] = eval(par)

        return par_dict


    # def float_range(self, start, stop, step):
    #     start = decimal.Decimal(start)
    #     stop = decimal.Decimal(stop)
    #     while start < stop:
    #         yield float(start)
    #         start *= decimal.Decimal(step)


    # #########################################################################
    # Model parameter settings
    # #########################################################################
    # ### ML Classifier Method Parameters
    
    
    
    def logistic_regression_parameters(self):

        # ### Parameter generation using function
        par_names = ['penalty', 'solver', 'max_iter', 'C']
        par_vals = [['l1', 'l2', 'elasticnet'],
                    ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
                    list(range(50, 5000, 10)),
                    list(HumachLab_StaticMethods.float_range('0.001', '1.0', '0.01'))]

        par_vals = [[30, 50, 75, 100, 200, 500, 750, 1000], [2, 3, 5, 7], [5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000]]
        par_vals = [[5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000]]
        par_vals = [[15, 21, 30, 50, 75, 100, 200, 500]]
        par_vals = [[50, 75, 100]]
        par_vals = [[15, 21, 30, 50, 75, 100, 200, 500], [3, 5, 7, 10, 15, 20, 25, 30]]
        par_vals = [['l1', 'l2', 'elasticnet'],[50, 100, 130, 150, 170, 200, 250, 350, 500, 750, 1000]]
        par_vals = [['l1', 'l2', 'elasticnet'],[50, 100, 130, 150, 170, 200],[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]]
        par_vals = [['l2', 'elasticnet'],[50, 100, 130, 150, 170, 200],[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]]
        par_ind = [0, 2, 3]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters
    
    
    
    def k_nearest_neighbors_parameters(self):

        # ### Parameter generation using function
        par_names = ['n_neighbors', 'p', 'metric', 'n_splits']
        par_vals = [list(range(2, 100)),
                    list(range(2, 100)),
                    ['manhattan', 'minkowski', 'euclidean'],
                    list(range(2, 10))]

        par_vals = [list(range(100, 1000, 50)), list(range(2, 11, 1)), [2, 3, 5, 9, 13, 19, 29]]
        par_vals = [[2, 3, 5, 9, 13, 19, 29]]
        par_vals = [[2, 3, 5, 10, 15, 25, 35], ['manhattan', 'minkowski', 'euclidean']]
        par_ind = [0, 2]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters
    
    
    
    def naive_bayes_parameters(self):

        # ### Parameter generation using function
        par_names = ['var_smoothing']
        par_vals = [list(np.logspace(0, -9, num=100))]
        par_vals = [list(np.logspace(0, -9, num=100))]

        # par_vals = []
        # par_vals = []
        par_vals = [list(np.logspace(0,-9, num=5))]
        par_ind = [0]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters


    def support_vector_classifier_parameters(self):

        # ### Parameter generation using function #, probability=True
        par_names = ['probability', 'C', 'kernel', 'gamma', 'degree', 'class_weightdict']
        par_vals = [[True],
                    list(HumachLab_StaticMethods.float_range('0.001', '1', '0.01')),
                    ['linear', 'rbf', 'poly', 'sigmoid'],
                    list(HumachLab_StaticMethods.float_range('0.000001', '1', '10')),
                    list(range(1, 10)),
                    [None, 'balanced']]

        # par_vals = [list(HumachLab_StaticMethods.float_range('0.000001', '1', '10')), list(HumachLab_StaticMethods.float_range('0.00001', '1', '10')), list(HumachLab_StaticMethods.float_range('0.0001', '1', '10'))]
        par_vals = [list(HumachLab_StaticMethods.float_range('0.001', '1.', '0.1')), ['linear', 'rbf', 'poly']]
        par_vals = [[True],[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0], ['linear', 'rbf', 'poly']]
        par_ind = [0, 1, 2]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters
    
    
    
    def decision_tree_parameters(self):

        # ### Parameter generation using function
        par_names = ['max_depth', 'criterion', 'splitter', 'min_samples_split', 'min_samples_leaf', 'max_leaf_nodes']
        par_vals = [list(range(1, 100)),
                    ['gini', 'entropy', 'log_loss'],
                    ['best', 'random'],
                    list(range(1, 10)),
                    list(range(1, 10)),
                    list(range(1, 100))]

        par_vals = [list(range(1, 100)), list(range(1, 100, 2)), list(range(1, 100, 3))]
        par_vals = [list(range(1, 100))]
        par_vals = [[2, 3, 5, 7, 10, 15, 20, 25, 30], ['gini', 'entropy', 'log_loss']]
        par_ind = [0, 1]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters
    
    
    
    def random_forest_parameters(self): 

        # ### Parameter generation using function
        par_names = ['n_estimators', 'max_depth', 'criterion', 'splitter', 'max_features', 'min_samples_split', 'min_samples_leaf', 'max_leaf_nodes']
        par_vals = [list(range(1, 500, 5)),
                    list(range(1, 100)),
                    ['gini', 'entropy', 'log_loss'], 
                    ['best', 'random'],
                    list(range(2, 20, 1)),
                    list(range(1, 10)),
                    list(range(1, 10)),
                    list(range(1, 100))]

        par_vals = [[30, 50, 75, 100, 200, 500, 750, 1000], [2, 3, 5, 7], [5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000]]
        par_vals = [[5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000]]
        par_vals = [[15, 21, 30, 50, 75, 100, 200, 500]]
        par_vals = [[50, 75, 100]]
        par_vals = [[3, 5, 10, 15, 21, 30, 50, 75, 100], [2, 3, 5, 7, 10, 15, 20, 25, 30], ['gini', 'entropy', 'log_loss']]
        # par_vals = [[2, 3, 5, 7]]
        par_ind = [0, 1, 2]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters
    
    
    
    def graphics_processing_unit_random_forest(self):

        # ### Parameter generation using function
        par_names = ['n_estimators', 'n_bins', 'n_streams', 'max_depth', 'max_features', 'criterion', 'splitter', 'min_samples_split', 'min_samples_leaf', 'max_leaf_nodes']
        par_vals = [list(range(1, 500, 5)),
                    list(range(1, 100)),
                    list(range(2, 20, 1)),
                    ['gini', 'entropy', 'log_loss'], 
                    ['best', 'random'],
                    list(range(1, 10)),
                    list(range(1, 10)),
                    list(range(1, 100))]

        par_vals = [[30, 50, 75, 100, 200, 500, 750, 1000], [2, 3, 5, 7], [5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000]]
        par_vals = [[5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000], 15, 8]
        # par_vals = [[2, 3, 5, 7]]
        par_ind = [0, 1, 2]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters
    
    
    
    def gradient_boosting_parameters(self):

        # ### Parameter generation using function 
        par_names = ['n_estimators', 'max_depth', 'learning_rate', 'max_features', 'loss', 'min_samples_split', 'min_samples_leaf']
        par_vals = [list(range(1, 500, 5)),
                    list(range(1, 100)),
                    list(HumachLab_StaticMethods.float_range('0.001', '1', '0.01')), 
                    list(range(2, 20, 1)),
                    ['log_loss', 'exponential'],
                    list(range(1, 10)),
                    list(range(1, 10))]

        par_vals = [[30, 50, 75, 100, 200, 500, 750, 1000], [2, 3, 5, 7], [5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000]]
        par_vals = [[5, 7, 11, 15, 21, 30, 50, 75, 100, 200, 500, 750, 1000]]
        par_vals = [[15, 21, 30, 50, 75, 100, 200, 500]]
        par_vals = [[50, 75, 100]]
        par_vals = [[15, 21, 30, 50, 75, 100, 200, 500], [3, 5, 7, 10, 15, 20, 25, 30], [0.01, 0.05, 0.1, 0.5, 1.0]]
        par_vals = [[3, 5, 10, 15, 21, 30, 50, 75, 100], [2, 3, 5, 7, 10, 15, 20, 25, 30], [0.01, 0.05, 0.1, 0.5, 1.0]]
        # par_vals = [[2, 3, 5, 7]]
        par_ind = [0, 1, 2]
        parameters = self.generate_parameter_dictionary(par_names, par_vals, par_ind)

        return parameters
    
    


    # #########################################################################
    # Calculate and save classification details and model scores
    # #########################################################################
    #############################

    def calculate_model_scores(self, mods, y_test, y_pred, y_pred_proba): 
#         print(y_test, '\n', y_pred, '\n', y_pred_proba, '\n')
#         target_labels = np.unique(np.array(y_test)).tolist() 
        target_labels = sorted( self.dataset[self.class_name].unique().tolist() )
        
        y_pred = y_pred.tolist() 
        perf_scores = self.calculate_performance_scores(y_test, y_pred, y_pred_proba, labels=target_labels)  # average = 'weighted', 'macro', 'micro' 
        confMat = perf_scores['Conf_Mat']

        acc = round(perf_scores['ACC'], 3) #Accuracy score or total correct prediction rate 
        prec = round(perf_scores['PREC'], 3) #precision or positive predictive value (PPV)
        reca_sens = round(perf_scores['REC'], 3) #sensitivity, recall, hit rate, or true positive rate (TPR)
        spec = round(perf_scores['SPE'], 3) #specificity, selectivity or true negative rate (TNR)
        f1sc = round(perf_scores['F1SCR'], 3)
        auc_s = round(perf_scores['AUC'], 3) 
        
        scr_dict = {'method': str(mods), 'model': mods.best_estimator_, 'model_parameters': mods.best_params_, 
                    'model_scores': round(mods.best_score_*100,2),
                    ML_Performace_Metrics.CONF_MAT.value: confMat, ML_Performace_Metrics.ACC.value: acc, ML_Performace_Metrics.PREC.value: prec,
                    ML_Performace_Metrics.RECL.value: reca_sens, ML_Performace_Metrics.SEN.value: reca_sens, ML_Performace_Metrics.SPEC.value: spec,
                    ML_Performace_Metrics.F1SCR.value: f1sc, ML_Performace_Metrics.ROC_AUC.value: auc_s}
        
#         scr_dict = {'method_class': str(mods.__class__.__name__), 'model_name': mods.best_estimator_, 'model_parameters': mods.best_params_, 
#                     'model_scores': round(mods.best_score_*100,2),
#                     ML_Performace_Metrics.CONF_MAT.value: confMat, ML_Performace_Metrics.ACC.value: acc, ML_Performace_Metrics.PREC.value: prec,
#                     ML_Performace_Metrics.RECL.value: reca_sens, ML_Performace_Metrics.SEN.value: reca_sens, ML_Performace_Metrics.SPEC.value: spec,
#                     ML_Performace_Metrics.F1SCR.value: f1sc, ML_Performace_Metrics.ROC_AUC.value: auc_s}
#         scr_dict = {'model_class': str(mods.__class__.__name__), 'method_name': mods.estimator, 'method_parameters': mods.best_params_, 
#                     'method_scores': round(mods.best_score_*100,2),
#                     ML_Performace_Metrics.CONF_MAT.value: confMat, ML_Performace_Metrics.ACC.value: acc, ML_Performace_Metrics.PREC.value: prec,
#                     ML_Performace_Metrics.RECL.value: reca_sens, ML_Performace_Metrics.SEN.value: reca_sens, ML_Performace_Metrics.SPEC.value: spec,
#                     ML_Performace_Metrics.FPR.value: fpr, ML_Performace_Metrics.FNR.value: fnr, ML_Performace_Metrics.F1SCR.value: f1sc, ML_Performace_Metrics.ROC_AUC.value: auc_s}
#         scr_dict = {'model_class': str(mods.__class__.__name__), 'method_name': str(mods), 'method_parameters': mods.best_params_, 
#                     'method_scores': round(mods.best_score_*100,2),
#                     ML_Performace_Metrics.CONF_MAT.value: confMat, ML_Performace_Metrics.ACC.value: acc, ML_Performace_Metrics.PREC.value: prec,
#                     ML_Performace_Metrics.RECL.value: reca_sens, ML_Performace_Metrics.SEN.value: reca_sens, ML_Performace_Metrics.SPEC.value: spec,
#                     ML_Performace_Metrics.FPR.value: fpr, ML_Performace_Metrics.FNR.value: fnr, ML_Performace_Metrics.F1SCR.value: f1sc, ML_Performace_Metrics.ROC_AUC.value: auc_s}
        scr_df = pd.DataFrame([list( scr_dict.values() )], columns=list( scr_dict.keys() )) 
        self.logger.info(f"""Score columns: {scr_df.shape} {scr_df.columns.values.tolist()}""") 

        return scr_df
    
    
    def calculate_performance_scores(self, y_true, y_pred, y_pred_proba, labels=[0, 1], verbose=2, average='weighted'): # average = 'macro', 'micro', 'weighted' 
        #### SOURCES: https://www.youtube.com/watch?v=PCHf_7jBor8 
        # https://www.mariakhalusova.com/posts/2019-04-17-ml-model-evaluation-metrics-p2/ 
        # https://www.mariakhalusova.com/posts/2019-04-11-ml-model-evaluation-metrics-p1/
        # https://www.evidentlyai.com/classification-metrics/multi-class-metrics 
        # https://www.kaggle.com/code/nkitgupta/evaluation-metrics-for-multi-class-classification 

        model_scores = []
        true_label_uniq = np.unique(np.array(y_true)).tolist() 
        print(np.unique(np.array(y_true)), np.unique(np.array(y_pred)))
        print(y_true, y_pred) 
        conf_matrix = confusion_matrix(y_true, y_pred, labels=labels).tolist()
        print(np.array(conf_matrix) )

        ### For micro averaging and binary class 
        conf_matrix_arr = np.array(conf_matrix) 
        one_vs_all_confMat = []     
        for label in labels:
            tp_lbl = conf_matrix_arr[label, label] 
            fp_lbl = np.sum(conf_matrix_arr[:, label])-tp_lbl 
            fn_lbl = np.sum(conf_matrix_arr[label, :])-tp_lbl 
            tn_lbl = np.sum(conf_matrix_arr)-(tp_lbl+fp_lbl+fn_lbl) 
            one_vs_all_confMat.append([tn_lbl, fp_lbl, fn_lbl, tp_lbl]) 
        print(np.array(one_vs_all_confMat)) 

        tn_tot = np.sum( np.array(one_vs_all_confMat)[:, 0] ) 
        fp_tot = np.sum( np.array(one_vs_all_confMat)[:, 1] )  
        fn_tot = np.sum( np.array(one_vs_all_confMat)[:, 2] )  
        tp_tot = np.sum( np.array(one_vs_all_confMat)[:, 3] )

        conf_matrix_tol = [[tn_tot, fp_tot], [fn_tot, tp_tot]] 
        print(np.array(conf_matrix_tol)) 

        if len(labels)==2:
            tn_tot = one_vs_all_confMat[1][0] 
            fp_tot = one_vs_all_confMat[1][1] 
            fn_tot = one_vs_all_confMat[1][2] 
            tp_tot = one_vs_all_confMat[1][3] 
            average = "micro"

        result = [] 
        for label in labels:
            precision, recall, fscore, support = precision_recall_fscore_support( np.array(y_true)==label, np.array(y_pred)==label ) 
            # tmp_fpr, tmp_tpr, tmp_thresholds = roc_curve(np.array(y_true)==label, np.array(y_pred)==label, pos_label=label) 
            tmp_fpr, tmp_tpr, tmp_thresholds = roc_curve(np.array(y_true)==label, np.array(y_pred)==label) 
            auc_score = auc(tmp_fpr, tmp_tpr)*100 

            if label in true_label_uniq: 
                result.append( [label, precision[1], recall[1], recall[1], recall[0], fscore[1], auc_score, support[1]] ) 
            else:
                result.append( [label, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0] ) 

            accuracy = accuracy_score(np.array(y_true)==label, np.array(y_pred)==label)*100 
            if verbose>1:
                print(
                    f'Class-wise info: For multilevel internal scores fo label {label}: \n', 
                    f'Accuracy = {accuracy}\n', 
                    f'Precision = {precision}\n', 
                    f'Recall = {recall}\n', 
                    f'F1 score = {fscore}\n', 
                    f'AUC score = {auc_score}\n', 
                    f'Support = {support}\n', 
                )
        tdf = pd.DataFrame(result, columns=['Label', 'Precision', 'Recall', 'Sensitivity', 'Specificity', 'F1 Score', 'AUC', 'Support']) 

        if average=='macro': #average = "weighted", "macro", "micro" 
            tdf = tdf[['Precision', 'Recall', 'Sensitivity', 'Specificity', 'F1 Score', 'AUC']].apply(lambda col:np.mean(col), axis=0) 
        elif average=='micro':
            prc = (tp_tot / (tp_tot+fp_tot))*100 if (tp_tot+fp_tot)!=0 else 0.0 #precision or positive predictive value (PPV)
            rec = (tp_tot / (tp_tot+fn_tot))*100 if (tp_tot+fn_tot)!=0 else 0.0 #sensitivity, recall, hit rate, or true positive rate (TPR)
            sns = rec #sensitivity same as recall 
            spc = (tn_tot / (tn_tot+fp_tot))*100 if (tn_tot+fp_tot)!=0 else 0.0 #specificity, selectivity or true negative rate (TNR)
            f1s = (2*tp_tot / (2*tp_tot+fp_tot+fn_tot))*100 if (2*tp_tot+fp_tot+fn_tot)!=0 else 0.0 #specificity, selectivity or true negative rate (TNR)  
            auc_s = roc_auc_score(y_true, y_pred) if len(labels)==2 else roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average=average) #multi_class='ovo', 'ovr' 
#             auc_s = 0.5
#             if len(labels)==2:
#                 auc_s = roc_auc_score(y_true, y_pred)  
#             else:
#                 auc_s = roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average=average) #multi_class='ovo', 'ovr' 
            tdf = pd.Series([prc, rec, sns, spc, f1s, auc_s], index=['Precision', 'Recall', 'Sensitivity', 'Specificity', 'F1 Score', 'AUC'])  
        else: ## Default = weighted
            class_weights = tdf['Support']/tdf['Support'].sum() 
            tdf = tdf[['Precision', 'Recall', 'Sensitivity', 'Specificity', 'F1 Score', 'AUC']].apply(lambda col:np.sum(col*class_weights), axis=0) 

        acc = accuracy_score(y_true, y_pred)*100 
        tdf_summary = pd.Series([conf_matrix, acc, tdf['Precision'], tdf['Recall'], tdf['Sensitivity'], tdf['Specificity'], tdf['F1 Score'], tdf['AUC']],
                               index=['Conf_Mat', 'ACC', 'PREC', 'REC', 'SEN', 'SPE', 'F1SCR', 'AUC'])

        if verbose>1:
            confMat = tdf_summary['Conf_Mat']
            acc = round(tdf_summary['ACC'], 3) #Accuracy score or total correct prediction rate 
            prec = round(tdf_summary['PREC'], 3) #precision or positive predictive value (PPV)
            reca_sens = round(tdf_summary['REC'], 3) #sensitivity, recall, hit rate, or true positive rate (TPR)
            spec = round(tdf_summary['SPE'], 3) #specificity, selectivity or true negative rate (TNR)
            f1sc = round(tdf_summary['F1SCR'], 3)
            auc_s = round(tdf_summary['AUC'], 3) 
            print(
                f'CLASSIFICATION MERICS:\n',
                f'{"_"*55}\n',
                f'Confusion Matrix: \n{np.array(conf_matrix)}\n',
                f'Accuracy (acc): {acc}\n',
                f'Precision (prc): {prec}\n',
                f'Recall (rec): {reca_sens}\n',
                f'Sensitivity (sns): {reca_sens}\n',
                f'Specificity (spc): {spec}\n',
                f'F1 Score (f1s): {f1sc}\n',
                f'ROC AUC (AUC): {auc_s}',
            )

        return tdf_summary



In [None]:
# from xgboost import XGBClassifier
