In [1]:
import os

# Subject directory; every input and output path in this notebook starts with it.
subj = "Subj1"
# Stage count requested when picking the best result of a run; the parameter grid
# also pins the edge-cluster count to N_STAGES - 1.
N_STAGES = 9
# Sub-directory of the subject directory that receives the result CSV files.
exp = "exp_feature_selection"
# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(f"{subj}/{exp}", exist_ok = True)

In [2]:
%load_ext autoreload
%autoreload 2

import time
import warnings
import itertools
warnings.filterwarnings('ignore')

import SDA
import SDA.analytics
import SDA.clustquality

import umap
import tqdm
import numpy
import pandas
import sklearn.preprocessing
import sklearn.decomposition
import tqdm.contrib.itertools
import sklearn.feature_selection

In [3]:
# Reference stage edges loaded from a text file and cast to int32; try_params passes them
# to SDA.clustquality.cluster_metrics_ground as the ground truth for each candidate result.
edges_true = numpy.loadtxt(f"{subj}/reproduction/internal/best_edges.txt").astype(numpy.int32)

In [4]:
# Candidate values for the 'len_st_thr' argument: one single-element list per
# threshold. Multi-threshold combinations (for example [ 0, 20 ] or
# [ 0, 20, 40, 60 ]) are deliberately not part of this search.
len_st_thr_attempts = [ [ 0 ], [ 20 ], [ 40 ], [ 60 ] ]

# Grid of keyword arguments for SDA.SDA, one dict per combination.
# The outer product varies the lower bounds and the threshold list; for each
# lower bound the matching upper bound runs from that bound to the end of the
# same coarse grid (step 3 up to 20 for cluster counts, step 5 up to 50 for
# neighbour counts). The dense alternatives would be range(2, 21) and
# range(20, 51) with step 1.
params = [
    {
        'scale': False,

        'n_clusters_min': n_clusters_min, 'n_clusters_max': n_clusters_max,
        'k_neighbours_min': k_neighbours_min, 'k_neighbours_max': k_neighbours_max,
        'len_st_thr': len_st_thr,

        # The threshold lists hold only the current upper bound.
        'n_cl_max_thr': [ n_clusters_max ],
        'k_neighb_max_thr': [ k_neighbours_max ],
        # Minimum and maximum are equal, so the edge-cluster count is fixed.
        'n_edge_clusters_min': N_STAGES - 1, 'n_edge_clusters_max': N_STAGES - 1
    }
    for n_clusters_min, k_neighbours_min, len_st_thr in itertools.product(
        range(2, 21, 3),
        range(20, 51, 5),
        len_st_thr_attempts
    )
    for n_clusters_max, k_neighbours_max in itertools.product(
        range(n_clusters_min, 21, 3),
        range(k_neighbours_min, 51, 5)
    )
]

print(len(params))

3136


In [5]:
# Value passed as n_jobs to every SDA.SDA instance created by try_default and try_params.
N_JOBS = 15

def try_default(features: numpy.ndarray):
    """
    Run SDA once on `features` as a baseline and show its results.

    The SDA instance is built with only n_jobs, scale = False and verbose = True set
    explicitly; every other setting is whatever SDA.SDA uses by default. The elapsed
    wall-clock time is printed, followed by a display of
    SDA.analytics.best_results(..., key = 'Avg-Silh') for the run.

    Returns nothing; the output is the printed time and the displayed table.
    """
    started_at = time.time()
    sda = SDA.SDA(n_jobs = N_JOBS, scale = False, verbose = True)
    # apply() returns a pair; only the first element is used here.
    target_result, _ = sda.apply(features)
    elapsed = time.time() - started_at

    print('Target time:', elapsed)
    best = SDA.analytics.best_results(target_result, key = 'Avg-Silh')
    display(best)

def try_params(features: numpy.ndarray, result_name: str):
    """
    Run SDA once per entry of the module-level `params` grid and save the scores.

    For every parameter set that runs successfully, the value returned by
    SDA.analytics.best_result (key 'Avg-Silh', N_STAGES stages) is merged with the
    parameters, the wall-clock time of the SDA run ('time') and the metrics returned by
    SDA.clustquality.cluster_metrics_ground for `edges_true` versus the result's
    'St_edges'. All rows are written to "{subj}/{exp}/{result_name}.csv" and the first
    rows are displayed.

    Parameter sets for which SDA raises are skipped so that one bad combination does
    not abort the whole sweep, but they are counted and summarised after the loop
    instead of being dropped silently.

    Parameters:
        features: feature matrix handed to SDA.apply.
        result_name: base name (without extension) of the CSV file to write.

    Returns nothing.
    """
    results = [ ]
    failures = [ ]
    for param in tqdm.tqdm(params):
        start = time.time()
        try:
            result, _ = SDA.SDA(**param, n_jobs = N_JOBS, verbose = False).apply(features)
        except Exception as e:
            # Best effort: keep sweeping, but remember what failed so it can be reported.
            failures.append((param, repr(e)))
            continue
        end = time.time()

        result = SDA.analytics.best_result(result, key = 'Avg-Silh', n_stages = N_STAGES)
        metrics = SDA.clustquality.cluster_metrics_ground(edges_true, result['St_edges'])
        result['time'] = (end - start)
        results.append(dict(**param, **result, **metrics))

    if failures:
        # print() rather than warnings.warn(): this notebook filters all warnings out globally.
        print(f"Skipped {len(failures)} of {len(params)} parameter sets because SDA raised an exception")
        # Show only a few examples so a systematic failure does not flood the cell output.
        for failed_param, error in failures[:5]:
            print(f"  {error} <- {failed_param}")

    results = pandas.DataFrame(results)
    results.to_csv(f"{subj}/{exp}/{result_name}.csv")
    display(results.head())

### TDA

In [6]:
# Load the TDA feature table for this subject and report its shape.
features_tda = pandas.read_feather(f'{subj}/exp_final_filtered/all_features.feather')
print(features_tda.shape)

# Standardise the features with StandardScaler. Note that the name is rebound here:
# from this point on features_tda is the scaler's output, not the loaded table.
features_tda = sklearn.preprocessing.StandardScaler().fit_transform(features_tda)
print(features_tda.shape)

(1046, 3799)
(1046, 3799)


#### UMAP

In [7]:
# Reduce the standardised TDA features to 15 components with UMAP; random_state is fixed to 42.
features_tda_umap = umap.UMAP(n_components = 15, random_state = 42).fit_transform(features_tda)
print(features_tda_umap.shape)

(1046, 15)


In [8]:
# Baseline: a single SDA run on the UMAP embedding of the TDA features.
try_default(features_tda_umap)

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Target time: 37.80610513687134


Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,40,10,3,Mode,"[0, 246, 554, 1046]",1035.126427,2.420395,0.165579,612.514876,2.050163,0.284673,479.052251,1.713864
1,0,50,10,4,Mode,"[0, 246, 554, 681, 1046]",1044.994333,3.005025,0.137377,566.736494,2.70834,0.379997,483.041788,0.911399
2,0,40,10,5,Mode,"[0, 246, 554, 682, 789, 1046]",882.027438,2.960657,0.137718,491.752452,2.092489,0.415227,472.130874,0.923173
3,0,35,10,6,Median,"[0, 101, 246, 554, 682, 842, 1046]",591.637795,2.572624,0.058905,458.744853,2.099222,0.367377,317.403342,1.016883
4,0,35,20,7,Median,"[0, 101, 246, 554, 682, 856, 976, 1046]",561.907943,2.643793,0.055811,447.194042,4.757709,0.380541,295.011202,0.954349
5,0,35,15,8,Median,"[0, 101, 246, 554, 682, 789, 856, 976, 1046]",445.311387,2.465364,0.060602,403.837794,4.836544,0.369928,253.451875,1.04758
6,0,35,15,9,Median,"[0, 101, 246, 304, 554, 682, 789, 856, 976, 1046]",298.59901,2.162756,0.060668,362.302324,4.513938,0.327996,180.538247,1.344671
7,0,35,15,10,Median,"[0, 101, 167, 246, 304, 554, 682, 789, 856, 97...",246.765239,1.931269,0.031721,327.643215,4.438314,0.291454,147.799429,1.495095
8,0,35,15,11,Median,"[0, 101, 167, 246, 304, 509, 554, 682, 789, 85...",171.921607,1.752412,0.030197,301.69635,4.447445,0.268016,106.839614,1.687698
9,0,35,10,12,Median,"[0, 101, 167, 246, 304, 509, 554, 682, 789, 84...",152.133663,1.762457,0.002503,281.263004,4.112239,0.25936,98.609576,1.666004


In [9]:
# Parameter sweep on the UMAP embedding of the TDA features; rows go to {subj}/{exp}/tda_umap.csv.
# Long-running: the recorded run of this cell took roughly two and a half hours.
try_params(features_tda_umap, 'tda_umap')

100%|██████████| 3136/3136 [2:27:24<00:00,  2.82s/it]  


Unnamed: 0,scale,n_clusters_min,n_clusters_max,k_neighbours_min,k_neighbours_max,len_st_thr,n_cl_max_thr,k_neighb_max_thr,n_edge_clusters_min,n_edge_clusters_max,...,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold,time,AMI,ARI,FMI
0,False,2,2,20,25,[0],[2],[25],8,8,...,0.055764,400.27696,1.683153,0.193274,68.808055,1.529976,2.212798,0.399592,0.103299,0.376144
1,False,2,2,20,30,[0],[2],[30],8,8,...,0.036657,302.09252,1.942815,0.169087,35.693917,1.662311,1.636544,0.390818,0.094453,0.366944
2,False,2,2,20,35,[0],[2],[35],8,8,...,0.036657,302.09252,1.942815,0.169087,35.693917,1.662311,0.685956,0.390818,0.094453,0.366944
3,False,2,2,20,40,[0],[2],[40],8,8,...,0.026275,241.628393,2.847493,0.169902,19.893034,1.686818,0.891408,0.388644,0.093983,0.366456
4,False,2,2,20,45,[0],[2],[45],8,8,...,0.026275,241.628393,2.847493,0.169902,19.893034,1.686818,1.064749,0.388644,0.093983,0.366456


#### PCA

In [7]:
# Reduce the standardised TDA features to 15 components with PCA (svd_solver "full", random_state 42).
features_tda_pca = sklearn.decomposition.PCA(n_components = 15, svd_solver = "full", random_state = 42).fit_transform(features_tda)
print(features_tda_pca.shape)

(1046, 15)


In [8]:
# Baseline: a single SDA run on the PCA projection of the TDA features.
try_default(features_tda_pca)

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Target time: 41.18349742889404


Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,45,20,3,Mode,"[0, 486, 682, 1046]",179502.241091,35.843513,0.040571,82.30239,3.388922,0.114771,105.505017,2.131956
1,40,35,20,4,Median,"[0, 203, 556, 857, 1046]",81196.745253,22.897993,0.04306,63.164577,4.435869,0.089508,50.796989,3.977806
2,40,50,20,5,Median,"[0, 203, 553, 682, 888, 1046]",144320.297572,37.524097,0.042112,77.773593,4.042086,0.161853,95.299518,2.181056
3,60,45,20,6,Median,"[0, 115, 229, 553, 682, 888, 1046]",108096.952769,34.873677,0.021109,70.194852,3.879325,0.144336,71.596595,2.264679
4,60,45,20,7,Median,"[0, 115, 229, 553, 682, 855, 976, 1046]",125856.905556,41.155505,0.023514,77.795779,3.687628,0.193352,85.79506,1.767761
5,60,45,20,8,Median,"[0, 115, 229, 553, 682, 776, 857, 976, 1046]",99944.351077,39.755528,0.025233,71.638268,3.822953,0.195677,72.509546,1.845411
6,60,45,20,9,Median,"[0, 115, 225, 330, 553, 682, 776, 857, 976, 1046]",78758.062563,35.62409,0.023595,65.201323,3.928928,0.170196,58.607912,2.263787
7,20,35,20,10,Mode,"[0, 92, 203, 330, 486, 556, 682, 776, 857, 976...",62762.548118,33.862669,0.002893,62.039727,3.812711,0.157791,47.314711,2.205346
8,0,35,20,11,Mode,"[0, 92, 141, 203, 330, 486, 556, 682, 776, 857...",53723.418038,31.681903,-0.001336,56.943892,3.987689,0.152268,40.733384,2.438729
9,0,35,20,12,Mode,"[0, 92, 141, 203, 330, 486, 556, 682, 776, 857...",46142.749616,30.471238,-0.000719,52.737784,4.214455,0.143532,35.634073,2.533037


In [9]:
# Parameter sweep on the PCA projection of the TDA features; rows go to {subj}/{exp}/tda_pca.csv.
# Long-running: the recorded run of this cell took over two hours.
try_params(features_tda_pca, 'tda_pca')

100%|██████████| 3136/3136 [2:16:13<00:00,  2.61s/it]  


Unnamed: 0,scale,n_clusters_min,n_clusters_max,k_neighbours_min,k_neighbours_max,len_st_thr,n_cl_max_thr,k_neighb_max_thr,n_edge_clusters_min,n_edge_clusters_max,...,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold,time,AMI,ARI,FMI
0,False,2,2,20,25,[0],[2],[25],8,8,...,-0.074522,41.225875,4.659086,0.042853,28.871068,3.093114,2.0375,0.668684,0.404484,0.515509
1,False,2,2,20,30,[0],[2],[30],8,8,...,-0.059411,42.160522,4.249654,0.046078,18.955517,3.035495,1.259006,0.687883,0.423926,0.526561
2,False,2,2,20,35,[0],[2],[35],8,8,...,-0.059486,41.814069,4.309298,0.041919,18.405128,3.135726,0.569469,0.686548,0.42338,0.5263
3,False,2,2,20,40,[0],[2],[40],8,8,...,-0.030519,41.906847,4.893153,0.072558,19.560216,3.524039,0.711343,0.70191,0.42095,0.520222
4,False,2,2,20,45,[0],[2],[45],8,8,...,-0.030519,41.906847,4.893153,0.072558,19.560216,3.524039,0.916076,0.70191,0.42095,0.520222


### Neurofeatures

In [6]:
# Load the four feature tables stored under {subj}/src (file names suggest PSD, coherence
# and PLV features — NOTE(review): confirm against whatever produces these files).
df_ft_psd_loc_db = pandas.read_feather(f'{subj}/src/df_ft_psd_loc_db.feather')
df_ft_psd_ind_loc_log = pandas.read_feather(f'{subj}/src/df_ft_psd_ind_loc_log.feather')
df_ft_coh_ind_loc = pandas.read_feather(f'{subj}/src/df_ft_coh_ind_loc.feather')
df_ft_plv_ind_loc = pandas.read_feather(f'{subj}/src/df_ft_plv_ind_loc.feather')

# Join the tables column-wise (axis = 1) into one feature table; this assumes all four
# tables describe the same rows in the same order — TODO confirm.
features_neuro = pandas.concat([ df_ft_psd_loc_db, df_ft_psd_ind_loc_log, df_ft_coh_ind_loc, df_ft_plv_ind_loc ], axis = 1)
print(features_neuro.shape)

# Standardise the features with StandardScaler. Note that the name is rebound here:
# from this point on features_neuro is the scaler's output, not the concatenated table.
features_neuro = sklearn.preprocessing.StandardScaler().fit_transform(features_neuro)
print(features_neuro.shape)

(1046, 765)
(1046, 765)


#### UMAP

In [7]:
# Reduce the standardised neuro features to 15 components with UMAP; random_state is fixed to 42.
features_neuro_umap = umap.UMAP(n_components = 15, random_state = 42).fit_transform(features_neuro)
print(features_neuro_umap.shape)

(1046, 15)


In [8]:
# Baseline: a single SDA run on the UMAP embedding of the neuro features.
try_default(features_neuro_umap)

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Target time: 34.322808027267456


Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,35,15,3,Median,"[0, 232, 681, 1046]",2337.918949,3.473309,0.225711,567.432745,1.859608,0.332809,606.796648,1.545486
1,0,35,10,4,Mode,"[0, 263, 681, 777, 1046]",2692.104897,5.128663,0.249266,754.597017,2.894817,0.587176,804.180915,0.55425
2,0,35,10,5,Mode,"[0, 263, 555, 681, 857, 1046]",1582.330946,3.882507,0.239389,589.867979,1.457409,0.453061,533.17609,0.871468
3,40,45,15,6,Median,"[0, 263, 555, 681, 777, 976, 1046]",1529.732542,4.037761,0.218712,684.150707,1.750357,0.494924,695.20629,0.910106
4,0,35,10,7,Median,"[0, 103, 263, 555, 681, 777, 976, 1046]",1074.482228,3.629237,0.183207,712.889985,1.641383,0.478156,547.63023,0.904382
5,0,35,10,8,Median,"[0, 103, 263, 555, 681, 777, 857, 976, 1046]",871.48221,3.53066,0.195523,739.037591,2.466327,0.519981,502.692844,0.72729
6,0,35,10,9,Median,"[0, 103, 263, 492, 555, 681, 777, 857, 976, 1046]",715.690314,3.34122,0.228852,756.272604,2.270272,0.510964,452.897032,0.736405
7,0,35,15,10,Median,"[0, 103, 194, 263, 492, 555, 681, 777, 857, 97...",525.65664,2.982591,0.217626,735.812963,2.173425,0.487485,348.323992,0.794628
8,0,35,15,11,Median,"[0, 39, 103, 194, 263, 492, 555, 681, 777, 857...",468.736672,2.78558,0.22304,693.776986,2.035597,0.495106,325.40307,0.780897
9,0,35,10,12,Median,"[0, 39, 103, 194, 232, 263, 492, 555, 681, 777...",399.733147,2.548545,0.212193,635.193597,2.226929,0.460038,276.30398,0.95983


In [9]:
# Parameter sweep on the UMAP embedding of the neuro features; rows go to {subj}/{exp}/neuro_umap.csv.
# Long-running: the recorded run of this cell took over two hours.
try_params(features_neuro_umap, 'neuro_umap')

100%|██████████| 3136/3136 [2:08:20<00:00,  2.46s/it]  


Unnamed: 0,scale,n_clusters_min,n_clusters_max,k_neighbours_min,k_neighbours_max,len_st_thr,n_cl_max_thr,k_neighb_max_thr,n_edge_clusters_min,n_edge_clusters_max,...,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold,time,AMI,ARI,FMI
0,False,2,2,20,25,[0],[2],[25],8,8,...,0.01745,266.348071,1.930392,0.226254,41.531681,1.603961,1.98551,0.406684,0.118647,0.400623
1,False,2,2,20,30,[0],[2],[30],8,8,...,0.01745,266.348071,1.930392,0.226254,41.531681,1.603961,1.216686,0.406684,0.118647,0.400623
2,False,2,2,20,35,[0],[2],[35],8,8,...,0.01745,266.348071,1.930392,0.226254,41.531681,1.603961,0.504297,0.406684,0.118647,0.400623
3,False,2,2,20,40,[0],[2],[40],8,8,...,0.01745,266.348071,1.930392,0.226254,41.531681,1.603961,0.631428,0.406684,0.118647,0.400623
4,False,2,2,20,45,[0],[2],[45],8,8,...,0.01745,266.348071,1.930392,0.226254,41.531681,1.603961,0.775011,0.406684,0.118647,0.400623


#### PCA

In [7]:
# Reduce the standardised neuro features to 15 components with PCA (svd_solver 'full', random_state 42).
features_neuro_pca = sklearn.decomposition.PCA(n_components = 15, svd_solver = 'full', random_state = 42).fit_transform(features_neuro)
print(features_neuro_pca.shape)

(1046, 15)


In [8]:
# Baseline: a single SDA run on the PCA projection of the neuro features.
try_default(features_neuro_pca)

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Target time: 42.36500000953674


Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,35,10,3,Mode,"[0, 282, 560, 1046]",42308.816307,16.400094,0.072681,114.988709,2.722181,0.12593,108.777275,2.438303
1,20,40,15,4,Median,"[0, 168, 560, 857, 1046]",33307.136107,15.081889,0.070873,89.78073,3.560225,0.127895,86.288835,3.119853
2,40,50,10,5,Median,"[0, 178, 560, 682, 857, 1046]",45198.297789,21.835308,0.084183,105.595294,2.586946,0.211379,121.129124,1.7979
3,0,35,20,6,Mode,"[0, 39, 282, 560, 682, 857, 1046]",34637.660711,20.807593,0.058036,91.372564,2.433889,0.189075,94.179967,1.796114
4,0,40,20,7,Mode,"[0, 39, 282, 560, 682, 857, 976, 1046]",33803.157325,21.727724,0.027373,87.867345,3.08592,0.197685,90.699464,1.694186
5,60,45,20,8,Median,"[0, 104, 277, 557, 682, 784, 857, 976, 1046]",27148.763733,20.782267,0.051638,88.107028,3.174767,0.202978,77.226016,1.764923
6,0,45,20,9,Mode,"[0, 39, 282, 492, 560, 682, 784, 857, 976, 1046]",24209.602081,21.382082,0.040342,81.951371,2.922261,0.199396,69.908675,1.637018
7,20,40,15,10,Mode,"[0, 92, 154, 282, 492, 560, 682, 784, 857, 976...",19921.966427,19.978023,0.055157,80.814883,2.9413,0.197371,59.065692,1.797313
8,40,40,15,11,Mode,"[0, 92, 154, 282, 492, 560, 609, 682, 784, 857...",17433.673781,20.010478,0.047344,76.533681,2.935801,0.20521,56.018632,1.818006
9,40,40,20,12,Mode,"[0, 95, 154, 282, 492, 560, 609, 682, 784, 857...",15496.871841,19.770975,0.048565,72.453995,2.928984,0.20476,51.143214,1.887942


In [9]:
# Parameter sweep on the PCA projection of the neuro features; rows go to {subj}/{exp}/neuro_pca.csv.
# Long-running: the recorded run of this cell took about two hours.
try_params(features_neuro_pca, 'neuro_pca')

100%|██████████| 3136/3136 [1:58:08<00:00,  2.26s/it]  


Unnamed: 0,scale,n_clusters_min,n_clusters_max,k_neighbours_min,k_neighbours_max,len_st_thr,n_cl_max_thr,k_neighb_max_thr,n_edge_clusters_min,n_edge_clusters_max,...,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold,time,AMI,ARI,FMI
0,False,2,2,20,25,[0],[2],[25],8,8,...,-0.081282,35.135497,2.413497,0.086735,9.553277,1.70894,1.996677,0.400442,0.082085,0.359308
1,False,2,2,20,30,[0],[2],[30],8,8,...,-0.058299,31.718702,2.55925,0.147337,8.181718,1.941908,1.528433,0.395398,0.080831,0.358279
2,False,2,2,20,35,[0],[2],[35],8,8,...,-0.058299,31.718702,2.55925,0.147337,8.181718,1.941908,0.507077,0.395398,0.080831,0.358279
3,False,2,2,20,40,[0],[2],[40],8,8,...,-0.046995,31.4588,2.631693,0.127383,8.427922,1.985735,0.646606,0.394389,0.080078,0.357494
4,False,2,2,20,45,[0],[2],[45],8,8,...,-0.041182,31.395527,2.792691,0.127278,8.538213,2.24047,0.83481,0.393217,0.079352,0.356738
