In [57]:
import os

# Subject folder, experiment output folder and the feature set to read.
subj = "Subj1"
exp = "exp_select_features_all"
features_type = 'exp_final_filtered'

# Stage counts: N_STAGES is requested from the final segmentations,
# N_SELECTION_STAGES is used when scoring individual features.
N_STAGES = 9
N_SELECTION_STAGES = 5

# When False, per-feature scores are read from the cached CSV instead of being recomputed.
RERUN = False

# Make sure the experiment output folder exists.
os.makedirs(f"{subj}/{exp}", exist_ok = True)

In [58]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

import SDA
import SDA.analytics
import SDA.clustquality

import umap
import tqdm
import numpy
import pandas
import joblib
import sklearn.metrics
import sklearn.preprocessing
import sklearn.decomposition
import tqdm.contrib.itertools
import matplotlib.pyplot as plt
import sklearn.feature_selection
import sklearn.cross_decomposition

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
def explained_variance(features, reduced):
    """
    Measure how much of `features` can be reconstructed from `reduced`.

    A PLS regression with as many components as `reduced` has columns is
    fitted from `reduced` to `features`; the variance-weighted R^2 of its
    in-sample predictions is returned.
    """
    n_latent = reduced.shape[1]
    regressor = sklearn.cross_decomposition.PLSRegression(n_components = n_latent)
    regressor.fit(reduced, features)
    reconstructed = regressor.predict(reduced)
    return sklearn.metrics.r2_score(features, reconstructed, multioutput = "variance_weighted")

### Selection

In [60]:
# Reference stage boundaries, stored as text and cast to int32.
edges_true = numpy.loadtxt(f"{subj}/reproduction/internal/best_edges.txt").astype(numpy.int32)

# Raw feature table for the chosen feature set.
df_features = pandas.read_feather(f'{subj}/{features_type}/all_features.feather')
print(df_features.shape)

# Standardised copy of the feature table (plain array, same shape).
features = sklearn.preprocessing.StandardScaler().fit_transform(df_features)
print(features.shape)

# Keyword arguments forwarded to SDA.SDA when scoring single features;
# the edge-cluster count is pinned to N_SELECTION_STAGES - 1.
params = dict(
    n_clusters_min = 2,
    n_clusters_max = 8,
    k_neighbours = range(20, 51, 3),
    len_st_thr = [ 40 ],
    n_cl_max_thr = [ 20 ],
    k_neighb_max_thr = [ 50 ],
    n_edge_clusters_min = N_SELECTION_STAGES - 1,
    n_edge_clusters_max = N_SELECTION_STAGES - 1,
)

(1046, 3799)
(1046, 3799)


In [61]:
def score_feature(i: int):
    """
    Score a single feature column by running SDA on that column alone.

    Parameters
    ----------
    i : int
        Positional index of the column in `features` / `df_features`.

    Returns
    -------
    dict
        Record with the column index and name, the 'Avg-Silh' value of the best
        N_SELECTION_STAGES-stage result (or -1 on failure), mean and variance of
        the raw column, the number of distinct values in the scaled column, the
        stage edges and the shortest stage length (both None on failure).

    Notes
    -----
    Scoring is best-effort: any ordinary exception raised by SDA, or a result
    that does not have exactly N_SELECTION_STAGES + 1 edges, marks the feature
    as failed instead of aborting the whole run.
    """
    stage_edges = None
    score = -1
    try:
        result, _ = SDA.SDA(**params, n_jobs = 1, scale = False, verbose = False).apply(features[:, i].reshape(-1, 1))
        best = SDA.analytics.best_result(result, key = 'Avg-Silh', n_stages = N_SELECTION_STAGES)
        if len(best['St_edges']) != N_SELECTION_STAGES + 1:
            raise RuntimeError(f'expected {N_SELECTION_STAGES + 1} stage edges, got {len(best["St_edges"])}')
        stage_edges = best['St_edges']
        score = best['Avg-Silh']
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit still propagate,
        # so a long parallel run can be interrupted.
        stage_edges = None
        score = -1

    # Positional access keeps this a single column even if column names are duplicated.
    source_feature = df_features.iloc[:, i]

    # Failure is tracked through stage_edges rather than by comparing the score with the sentinel.
    succeeded = stage_edges is not None
    min_stage_length = numpy.diff(numpy.asarray(stage_edges)).min() if succeeded else None

    return {
        'index': i,
        'name': df_features.columns[i],
        'score': score,
        'mean': source_feature.mean(),
        'variance': source_feature.var(),
        'unique_values': len(numpy.unique(features[:, i])),
        'edges': stage_edges,
        'min_stage_length': min_stage_length
    }

In [62]:
# Per-feature scores: reuse the cached CSV unless a recomputation was requested.
if not RERUN:
    scores = pandas.read_csv(f"{subj}/{exp}/scores_{features_type[4:]}.csv")
else:
    # One score_feature call per column, spread over 14 worker processes.
    scores = joblib.Parallel(n_jobs = 14)(
        joblib.delayed(score_feature)(i) for i in tqdm.trange(features.shape[1])
    )
    scores = pandas.DataFrame(list(scores))
display(scores)

Unnamed: 0,index,name,score,mean,variance,unique_values,edges,min_stage_length
0,0,channel-0 entropy dim-1,0.250481,-0.381905,1.793640e-03,1046,"[0, 65, 246, 556, 659, 1046]",65.0
1,1,channel-0 entropy dim-2,0.100504,-0.135152,2.854760e-03,1024,"[0, 105, 588, 821, 971, 1046]",75.0
2,2,channel-0 numberofpoints dim-1,0.209758,62.464627,1.683160e+02,72,"[0, 98, 238, 550, 650, 1046]",98.0
3,3,channel-0 numberofpoints dim-2,0.104908,8.312620,2.473758e+01,35,"[0, 136, 585, 823, 975, 1046]",71.0
4,4,channel-0 amplitude-bottleneck dim-1,0.195806,0.000004,6.940859e-12,1046,"[0, 50, 512, 556, 647, 1046]",44.0
...,...,...,...,...,...,...,...,...
3794,3794,overall bd2 dim-3 mean,0.059686,0.000094,2.766734e-09,861,"[0, 82, 357, 552, 777, 1046]",82.0
3795,3795,overall bd2 dim-3 std,0.020077,0.000006,7.849244e-11,700,"[0, 113, 278, 553, 799, 1046]",113.0
3796,3796,overall bd2 dim-3 sum,0.045086,0.000429,2.310777e-07,861,"[0, 110, 313, 526, 804, 1046]",110.0
3797,3797,overall bd2 dim-3 norm-1,0.045086,0.000429,2.310777e-07,861,"[0, 110, 313, 526, 804, 1046]",110.0


In [63]:
# Write the score table to the cache file (row index omitted), then show it best-score-first.
scores.to_csv(f"{subj}/{exp}/scores_{features_type[4:]}.csv", index = False)
display(scores.sort_values('score', ascending = False))

Unnamed: 0,index,name,score,mean,variance,unique_values,edges,min_stage_length
2617,2617,channel-28 amplitude-silhouette-1-1 dim-2,0.629530,1.546327e-13,2.071976e-25,1012,"[0, 266, 272, 322, 332, 1046]",6.0
2341,2341,channel-25 amplitude-silhouette-1-1 dim-2,0.539729,1.384475e-13,3.523781e-25,955,"[0, 124, 179, 711, 768, 1046]",55.0
2625,2625,channel-28 amplitude-silhouette-2-1 dim-2,0.534404,1.201888e-10,6.708030e-20,1012,"[0, 266, 272, 310, 332, 1046]",6.0
2347,2347,channel-25 amplitude-silhouette-1-2 norm-2,0.520269,2.629310e-12,1.966975e-22,1046,"[0, 141, 183, 691, 790, 1046]",42.0
2621,2621,channel-28 amplitude-silhouette-1-2 dim-2,0.520216,2.070312e-13,3.000437e-25,1012,"[0, 250, 266, 272, 332, 1046]",6.0
...,...,...,...,...,...,...,...,...
1056,1056,channel-11 amplitude-silhouette-1-2 dim-1,-1.000000,4.903853e-12,2.184737e-22,1046,,
1058,1058,channel-11 amplitude-silhouette-1-2 norm-1,-1.000000,5.160625e-12,2.199826e-22,1046,,
1059,1059,channel-11 amplitude-silhouette-1-2 norm-2,-1.000000,4.934292e-12,2.183998e-22,1046,,
2249,2249,channel-24 amplitude-silhouette-1-1 dim-2,-1.000000,6.774115e-14,1.656250e-26,1002,,


In [64]:
# Greedy feature selection.
#
# Features are visited from the highest to the lowest score and accepted until
# NUM_FEATURES have been collected. A candidate is rejected when
#   * its scaled column has fewer than MIN_UNIQUE_VALUES distinct values,
#   * its scoring failed (score == -1 or no min_stage_length recorded),
#   * its shortest stage is below MIN_STAGE_LENGTH,
#   * its absolute correlation with an already accepted feature sharing the same
#     key (first space-separated token of the name) exceeds CORR_THRESHOLD.
NUM_FEATURES = 765
MIN_STAGE_LENGTH = 40
CORR_THRESHOLD = 0.99
MIN_UNIQUE_VALUES = 20

# Accepted columns/keys are collected in lists and turned into arrays once at the end.
# This avoids re-copying the whole matrix on every acceptance and avoids growing a
# float array with string keys, which relies on NumPy number-to-string promotion.
selected_columns = []   # accepted (n_samples, 1) columns, in acceptance order
selected_keys = []      # key of each accepted column, same order
columns_by_key = {}     # key -> accepted columns with that key

for i, (idx, row) in enumerate(scores.sort_values(by = 'score', ascending = False).iterrows()):
    if row['unique_values'] < MIN_UNIQUE_VALUES:
        print(f'{i + 1}. {row["name"]} - SKIP (unique_values = {row["unique_values"]})')
        continue
    # Failed features carry NaN/None here; `NaN < MIN_STAGE_LENGTH` is False (and
    # `None < ...` raises), so they must be rejected explicitly.
    if row['score'] == -1 or pandas.isna(row['min_stage_length']):
        print(f'{i + 1}. {row["name"]} - SKIP (no valid segmentation)')
        continue
    if row['min_stage_length'] < MIN_STAGE_LENGTH:
        print(f'{i + 1}. {row["name"]} - SKIP (min_stage_length = {row["min_stage_length"]})')
        continue

    key = row["name"].split(' ', 1)[0]
    feature = features[:, int(row['index'])].reshape(-1, 1)

    same_key = columns_by_key.get(key, [])
    if same_key:
        # Last row of the correlation matrix = candidate vs. every accepted same-key feature.
        for_corrs = numpy.concatenate(same_key + [ feature ], axis = 1)
        corrs = numpy.corrcoef(for_corrs.T)[-1, :-1]
        max_corr = numpy.max(numpy.abs(corrs))
        if max_corr > CORR_THRESHOLD:
            print(f'{i + 1}. {row["name"]} - SKIP (max_corr = {max_corr})')
            continue

    selected_columns.append(feature)
    selected_keys.append(key)
    columns_by_key.setdefault(key, []).append(feature)
    print(f'{i + 1}. {row["name"]} - OK')
    if len(selected_columns) == NUM_FEATURES:
        break

# The leading empty block keeps the (n_samples, 0) shape when nothing was selected.
best_features = numpy.concatenate([ numpy.empty((features.shape[0], 0)), *selected_columns ], axis = 1)
best_feature_keys = numpy.array(selected_keys, dtype = str)

print(best_features.shape)
print(numpy.unique(best_feature_keys, return_counts = True))

1. channel-28 amplitude-silhouette-1-1 dim-2 - SKIP (min_stage_length = 6.0)
2. channel-25 amplitude-silhouette-1-1 dim-2 - OK
3. channel-28 amplitude-silhouette-2-1 dim-2 - SKIP (min_stage_length = 6.0)
4. channel-25 amplitude-silhouette-1-2 norm-2 - OK
5. channel-28 amplitude-silhouette-1-2 dim-2 - SKIP (min_stage_length = 6.0)
6. channel-25 amplitude-silhouette-1-1 norm-2 - SKIP (max_corr = 0.9956076954121463)
7. channel-25 amplitude-silhouette-1-1 dim-1 - SKIP (max_corr = 0.9954615657848307)
8. channel-25 amplitude-silhouette-1-1 norm-1 - SKIP (max_corr = 0.995021316811561)
9. channel-25 amplitude-landscape-1-1 dim-1 - OK
10. channel-25 amplitude-landscape-1-1 norm-2 - SKIP (max_corr = 0.9999085155774673)
11. channel-25 amplitude-silhouette-1-2 dim-2 - SKIP (max_corr = 0.9937187415085188)
12. channel-25 amplitude-silhouette-1-2 dim-1 - SKIP (max_corr = 0.9997036548511935)
13. channel-25 amplitude-landscape-1-1 dim-2 - OK
14. channel-25 amplitude-silhouette-2-2 dim-1 - OK
15. channe

### UMAP

In [51]:
# Embed the selected features into 15 UMAP dimensions (seeded for repeatability).
tda_umap = umap.UMAP(random_state = 42, n_components = 15)
umap_features = tda_umap.fit_transform(best_features)
print(umap_features.shape)

(1046, 15)


In [52]:
# Run SDA on the UMAP embedding (no extra scaling) and show the best result rows by 'Avg-Silh'.
umap_result, _ = SDA.SDA(verbose = True, scale = False, n_jobs = 15).apply(umap_features)
display(SDA.analytics.best_results(umap_result, key = 'Avg-Silh'))

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,40,10,3,Mode,"[0, 284, 682, 1046]",1706.870221,2.603137,0.191917,609.442343,3.490241,0.297464,579.615649,2.765555
1,0,40,15,4,Mode,"[0, 284, 682, 787, 1046]",1429.672018,3.149985,0.168557,520.706908,8.777928,0.361712,492.073092,0.910005
2,0,45,15,5,Mode,"[0, 284, 554, 682, 787, 1046]",1019.236998,3.231717,0.166938,541.486137,2.237017,0.470157,561.328663,0.756607
3,0,40,10,6,Mode,"[0, 284, 554, 682, 842, 976, 1046]",921.508051,3.122669,0.140647,515.485316,3.087407,0.4548,458.806916,0.81262
4,20,40,20,7,Mode,"[0, 284, 556, 682, 787, 842, 976, 1046]",709.852606,2.864808,0.131176,462.514553,3.362688,0.448195,405.940064,0.853905
5,0,40,20,8,Median,"[0, 3, 244, 556, 682, 787, 842, 976, 1046]",638.467706,2.806411,0.088632,426.37962,3.321983,0.458289,368.679214,0.961876
6,0,40,20,9,Median,"[0, 3, 244, 284, 556, 682, 787, 842, 976, 1046]",305.223013,2.470137,0.090352,404.792785,3.236274,0.426957,211.675822,1.084985
7,0,40,20,10,Median,"[0, 3, 244, 284, 493, 556, 682, 787, 842, 976,...",225.968835,2.212761,0.0923,378.370472,3.510031,0.389403,165.680902,1.240856
8,0,40,15,11,Median,"[0, 3, 244, 284, 493, 554, 664, 682, 787, 842,...",135.448963,2.063772,0.089382,343.879705,5.801489,0.377525,84.051367,1.307446
9,0,50,15,12,Median,"[0, 3, 244, 284, 493, 554, 664, 682, 787, 842,...",120.624487,1.946358,0.086382,313.796493,6.034127,0.351121,75.474343,1.528642


In [53]:
# Best N_STAGES-stage result on the UMAP embedding, selected by 'Avg-Silh'.
umap_best_result = SDA.analytics.best_result(umap_result, n_stages = N_STAGES, key = 'Avg-Silh')
umap_edges = umap_best_result['St_edges']

# Shapes of the three feature spaces being compared.
for _label, _matrix in (('Features:', features), ('Best features:', best_features), ('UMAP features:', umap_features)):
    print(_label, _matrix.shape)

# How much of the wider feature spaces the embedding can reconstruct.
for _label, _target in (('Explained variance 15-765:', best_features), ('Explained variance 15-all:', features)):
    print(_label, explained_variance(_target, umap_features))

# Agreement of the found edges with the reference edges.
print('Outer:', SDA.clustquality.cluster_metrics_ground(edges_true, umap_edges))

# Ground-truth-free stage metrics of the same edges, averaged, in each feature space.
for _label, _matrix in (('Inner (15):', umap_features), ('Inner (765):', best_features), ('Inner (all):', features)):
    print(_label, SDA.clustquality.calc_stage_metr_noground(_matrix, umap_edges).mean().to_dict())

Features: (1046, 19563)
Best features: (1046, 765)
UMAP features: (1046, 15)
Explained variance 15-765: 0.3422217257998321
Explained variance 15-all: 0.14679260639896052
Outer: {'AMI': 0.8708619833167367, 'ARI': 0.7725567414667996, 'FMI': 0.8106495805228112}
Inner (15): {'Silh': 0.4269565939903259, 'Cal-Har': 211.6758216247128, 'Dav-Bold': 1.0849845305825214}
Inner (765): {'Silh': 0.1178591389331013, 'Cal-Har': 24.473057112493024, 'Dav-Bold': 2.729181312993469}
Inner (all): {'Silh': 0.04488074049083551, 'Cal-Har': 9.47753889677005, 'Dav-Bold': 4.49777586580225}


### PCA

In [54]:
# Project the selected features onto 15 principal components (full SVD, seeded).
pca = sklearn.decomposition.PCA(random_state = 42, svd_solver = "full", n_components = 15)
pca_features = pca.fit_transform(best_features)

# Total and per-component explained variance ratios, the projected shape,
# and the PLS-based reconstruction score of the selected features.
print('Explained variance', round(pca.explained_variance_ratio_.sum(), 2))
print([ round(ratio, 3) for ratio in pca.explained_variance_ratio_ ])
print(pca_features.shape)
print(explained_variance(best_features, pca_features))

Explained variance 0.49
[0.211, 0.064, 0.036, 0.028, 0.019, 0.016, 0.015, 0.014, 0.013, 0.012, 0.012, 0.012, 0.011, 0.011, 0.011]
(1046, 15)
0.4864660073929153


In [55]:
# Run SDA on the PCA projection (no extra scaling) and show the best result rows by 'Avg-Silh'.
pca_result, _ = SDA.SDA(verbose = True, scale = False, n_jobs = 15).apply(pca_features)
display(SDA.analytics.best_results(pca_result, key = 'Avg-Silh'))

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,35,10,3,Median,"[0, 213, 682, 1046]",35145.347472,13.158251,0.074215,124.241892,4.429616,0.152337,122.721051,3.626449
1,0,45,10,4,Mode,"[0, 239, 682, 795, 1046]",38089.486164,18.229403,0.053647,118.198456,6.682038,0.200516,137.295645,1.568596
2,0,45,10,5,Mode,"[0, 239, 554, 682, 795, 1046]",36838.467684,20.618401,0.062161,133.886675,2.902583,0.281205,157.921585,1.382615
3,20,35,20,6,Mode,"[0, 213, 556, 682, 856, 976, 1046]",34756.845805,20.612277,0.055596,124.75399,3.768089,0.283627,152.129359,1.356516
4,20,35,20,7,Mode,"[0, 141, 213, 556, 682, 856, 976, 1046]",25478.954688,18.57495,0.028746,108.183795,3.897923,0.248688,112.133862,1.651062
5,20,40,20,8,Mode,"[0, 55, 213, 313, 556, 682, 856, 976, 1046]",20688.313862,16.705467,0.029576,99.526098,3.513772,0.218665,90.368112,1.767305
6,20,40,20,9,Mode,"[0, 55, 213, 313, 556, 682, 792, 856, 976, 1046]",16431.631619,16.066765,0.00852,93.680736,3.92544,0.207043,73.424832,1.773108
7,0,35,20,10,Mode,"[0, 18, 141, 213, 349, 556, 682, 792, 856, 976...",14061.293589,15.257765,-0.005532,83.791576,3.776429,0.1892,62.711891,1.944495
8,60,35,10,11,Median,"[0, 167, 210, 239, 266, 340, 554, 682, 792, 85...",11695.398856,13.650928,-0.032317,74.425012,3.927537,0.166849,52.925911,2.110736
9,60,35,20,12,Median,"[0, 81, 185, 210, 239, 278, 349, 554, 682, 792...",10725.397483,12.947615,-0.027998,70.182669,4.139875,0.157623,47.965898,2.358974


In [56]:
# Best N_STAGES-stage result on the PCA projection, selected by 'Avg-Silh'.
pca_best_result = SDA.analytics.best_result(pca_result, n_stages = N_STAGES, key = 'Avg-Silh')
pca_edges = pca_best_result['St_edges']

# Shapes of the three feature spaces being compared.
for _label, _matrix in (('Features:', features), ('Best features:', best_features), ('PCA features:', pca_features)):
    print(_label, _matrix.shape)

# How much of the wider feature spaces the projection can reconstruct.
for _label, _target in (('Explained variance 15-765:', best_features), ('Explained variance 15-all:', features)):
    print(_label, explained_variance(_target, pca_features))

# Agreement of the found edges with the reference edges.
print('Outer:', SDA.clustquality.cluster_metrics_ground(edges_true, pca_edges))

# Ground-truth-free stage metrics of the same edges, averaged, in each feature space.
for _label, _matrix in (('Inner (15):', pca_features), ('Inner (765):', best_features), ('Inner (all):', features)):
    print(_label, SDA.clustquality.calc_stage_metr_noground(_matrix, pca_edges).mean().to_dict())

Features: (1046, 19563)
Best features: (1046, 765)
PCA features: (1046, 15)
Explained variance 15-765: 0.4864660073929153
Explained variance 15-all: 0.20552026612067126
Outer: {'AMI': 0.8551101496383234, 'ARI': 0.7216299947154539, 'FMI': 0.7609127844904733}
Inner (15): {'Silh': 0.20704251985090527, 'Cal-Har': 73.42483216299364, 'Dav-Bold': 1.7731084140173086}
Inner (765): {'Silh': 0.09140506261405176, 'Cal-Har': 26.789423143863274, 'Dav-Bold': 3.021600848334752}
Inner (all): {'Silh': 0.03274823026774569, 'Cal-Har': 10.093835018732683, 'Dav-Bold': 5.206962015523273}


### Traditional features

In [36]:
# Four precomputed feature tables for the subject, read from its `src` folder.
df_ft_psd_loc_db = pandas.read_feather(f'{subj}/src/df_ft_psd_loc_db.feather')
df_ft_psd_ind_loc_log = pandas.read_feather(f'{subj}/src/df_ft_psd_ind_loc_log.feather')
df_ft_coh_ind_loc = pandas.read_feather(f'{subj}/src/df_ft_coh_ind_loc.feather')
df_ft_plv_ind_loc = pandas.read_feather(f'{subj}/src/df_ft_plv_ind_loc.feather')

# Place the tables side by side, one row per sample.
features_neuro = pandas.concat(
    ( df_ft_psd_loc_db, df_ft_psd_ind_loc_log, df_ft_coh_ind_loc, df_ft_plv_ind_loc ),
    axis = 'columns'
)
print(features_neuro.shape)

# From here on `features_neuro` is the standardised array, not the DataFrame.
features_neuro = sklearn.preprocessing.StandardScaler().fit_transform(features_neuro)
print(features_neuro.shape)

(1046, 765)
(1046, 765)


#### UMAP

In [37]:
# Embed the traditional features into 15 UMAP dimensions (seeded for repeatability).
neuro_umap = umap.UMAP(random_state = 42, n_components = 15)
features_neuro_umap = neuro_umap.fit_transform(features_neuro)
print(features_neuro_umap.shape)

(1046, 15)


In [38]:
# Run SDA on the traditional-feature UMAP embedding and show the best result rows by 'Avg-Silh'.
neuro_umap_result, _ = SDA.SDA(verbose = True, scale = False, n_jobs = 15).apply(features_neuro_umap)
display(SDA.analytics.best_results(neuro_umap_result, key = 'Avg-Silh'))

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,35,15,3,Median,"[0, 232, 681, 1046]",2337.918949,3.473309,0.225711,567.432745,1.859608,0.332809,606.796648,1.545486
1,0,35,10,4,Mode,"[0, 263, 681, 777, 1046]",2692.104897,5.128663,0.249266,754.597017,2.894817,0.587176,804.180915,0.55425
2,0,35,10,5,Mode,"[0, 263, 555, 681, 857, 1046]",1582.330946,3.882507,0.239389,589.867979,1.457409,0.453061,533.17609,0.871468
3,40,45,15,6,Median,"[0, 263, 555, 681, 777, 976, 1046]",1529.732542,4.037761,0.218712,684.150707,1.750357,0.494924,695.20629,0.910106
4,0,35,10,7,Median,"[0, 103, 263, 555, 681, 777, 976, 1046]",1074.482228,3.629237,0.183207,712.889985,1.641383,0.478156,547.63023,0.904382
5,0,35,10,8,Median,"[0, 103, 263, 555, 681, 777, 857, 976, 1046]",871.48221,3.53066,0.195523,739.037591,2.466327,0.519981,502.692844,0.72729
6,0,35,10,9,Median,"[0, 103, 263, 492, 555, 681, 777, 857, 976, 1046]",715.690314,3.34122,0.228852,756.272604,2.270272,0.510964,452.897032,0.736405
7,0,35,15,10,Median,"[0, 103, 194, 263, 492, 555, 681, 777, 857, 97...",525.65664,2.982591,0.217626,735.812963,2.173425,0.487485,348.323992,0.794628
8,0,35,15,11,Median,"[0, 39, 103, 194, 263, 492, 555, 681, 777, 857...",468.736672,2.78558,0.22304,693.776986,2.035597,0.495106,325.40307,0.780897
9,0,35,10,12,Median,"[0, 39, 103, 194, 232, 263, 492, 555, 681, 777...",399.733147,2.548545,0.212193,635.193597,2.226929,0.460038,276.30398,0.95983


In [39]:
# Best N_STAGES-stage result on the traditional-feature UMAP embedding, selected by 'Avg-Silh'.
neuro_umap_best_result = SDA.analytics.best_result(neuro_umap_result, n_stages = N_STAGES, key = 'Avg-Silh')
neuro_umap_edges = neuro_umap_best_result['St_edges']

# Shapes of the two feature spaces being compared.
for _label, _matrix in (('Neuro features:', features_neuro), ('UMAP neuro features:', features_neuro_umap)):
    print(_label, _matrix.shape)

# How much of the full traditional feature set the embedding can reconstruct.
print('Explained variance 15-765:', explained_variance(features_neuro, features_neuro_umap))

# Agreement of the found edges with the reference edges.
print('Outer:', SDA.clustquality.cluster_metrics_ground(edges_true, neuro_umap_edges))

# Ground-truth-free stage metrics of the same edges, averaged, in each feature space.
for _label, _matrix in (('Inner (15):', features_neuro_umap), ('Inner (765):', features_neuro)):
    print(_label, SDA.clustquality.calc_stage_metr_noground(_matrix, neuro_umap_edges).mean().to_dict())

Neuro features: (1046, 765)
UMAP neuro features: (1046, 15)
Explained variance 15-765: 0.50961167309074
Outer: {'AMI': 0.8989658031918278, 'ARI': 0.8201120691706356, 'FMI': 0.8459071662662822}
Inner (15): {'Silh': 0.5109639167785645, 'Cal-Har': 452.89703155772366, 'Dav-Bold': 0.7364054709635279}
Inner (765): {'Silh': 0.13754202043469593, 'Cal-Har': 42.38118519698051, 'Dav-Bold': 2.238258625900672}


#### PCA

In [40]:
# Project the traditional features onto 15 principal components (full SVD, seeded).
neuro_pca = sklearn.decomposition.PCA(random_state = 42, svd_solver = "full", n_components = 15)
features_neuro_pca = neuro_pca.fit_transform(features_neuro)

# Total and per-component explained variance ratios, the projected shape,
# and the PLS-based reconstruction score of the traditional features.
print('Explained variance', round(neuro_pca.explained_variance_ratio_.sum(), 2))
print([ round(ratio, 3) for ratio in neuro_pca.explained_variance_ratio_ ])
print(features_neuro_pca.shape)
print(explained_variance(features_neuro, features_neuro_pca))

Explained variance 0.71
[0.211, 0.156, 0.069, 0.063, 0.046, 0.032, 0.026, 0.02, 0.018, 0.015, 0.013, 0.012, 0.01, 0.009, 0.009]
(1046, 15)
0.7106773827080092


In [41]:
# Run SDA on the traditional-feature PCA projection and show the best result rows by 'Avg-Silh'.
neuro_pca_result, _ = SDA.SDA(verbose = True, scale = False, n_jobs = 15).apply(features_neuro_pca)
display(SDA.analytics.best_results(neuro_pca_result, key = 'Avg-Silh'))

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,0,35,10,3,Mode,"[0, 282, 560, 1046]",42308.816307,16.400094,0.072681,114.988709,2.722181,0.12593,108.777275,2.438303
1,20,40,15,4,Median,"[0, 168, 560, 857, 1046]",33307.136107,15.081889,0.070873,89.78073,3.560225,0.127895,86.288835,3.119853
2,40,50,10,5,Median,"[0, 178, 560, 682, 857, 1046]",45198.297789,21.835308,0.084183,105.595294,2.586946,0.211379,121.129124,1.7979
3,0,35,20,6,Mode,"[0, 39, 282, 560, 682, 857, 1046]",34637.660711,20.807593,0.058036,91.372564,2.433889,0.189075,94.179967,1.796114
4,0,40,20,7,Mode,"[0, 39, 282, 560, 682, 857, 976, 1046]",33803.157325,21.727724,0.027373,87.867345,3.08592,0.197685,90.699464,1.694186
5,60,45,20,8,Median,"[0, 104, 277, 557, 682, 784, 857, 976, 1046]",27148.763733,20.782267,0.051638,88.107028,3.174767,0.202978,77.226016,1.764923
6,0,45,20,9,Mode,"[0, 39, 282, 492, 560, 682, 784, 857, 976, 1046]",24209.602081,21.382082,0.040342,81.951371,2.922261,0.199396,69.908675,1.637018
7,20,40,15,10,Mode,"[0, 92, 154, 282, 492, 560, 682, 784, 857, 976...",19921.966427,19.978023,0.055157,80.814883,2.9413,0.197371,59.065692,1.797313
8,40,40,15,11,Mode,"[0, 92, 154, 282, 492, 560, 609, 682, 784, 857...",17433.673781,20.010478,0.047344,76.533681,2.935801,0.20521,56.018632,1.818006
9,40,40,20,12,Mode,"[0, 95, 154, 282, 492, 560, 609, 682, 784, 857...",15496.871841,19.770975,0.048565,72.453995,2.928984,0.20476,51.143214,1.887942


In [42]:
# Best N_STAGES-stage result on the traditional-feature PCA projection, selected by 'Avg-Silh'.
neuro_pca_best_result = SDA.analytics.best_result(neuro_pca_result, n_stages = N_STAGES, key = 'Avg-Silh')
neuro_pca_edges = neuro_pca_best_result['St_edges']

# Shapes of the two feature spaces being compared.
for _label, _matrix in (('Neuro features:', features_neuro), ('PCA neuro features:', features_neuro_pca)):
    print(_label, _matrix.shape)

# How much of the full traditional feature set the projection can reconstruct.
print('Explained variance 15-765:', explained_variance(features_neuro, features_neuro_pca))

# Agreement of the found edges with the reference edges.
print('Outer:', SDA.clustquality.cluster_metrics_ground(edges_true, neuro_pca_edges))

# Ground-truth-free stage metrics of the same edges, averaged, in each feature space.
for _label, _matrix in (('Inner (15):', features_neuro_pca), ('Inner (765):', features_neuro)):
    print(_label, SDA.clustquality.calc_stage_metr_noground(_matrix, neuro_pca_edges).mean().to_dict())

Neuro features: (1046, 765)
PCA neuro features: (1046, 15)
Explained variance 15-765: 0.7106773827080092
Outer: {'AMI': 1.0, 'ARI': 1.0, 'FMI': 1.0}
Inner (15): {'Silh': 0.1993961405684552, 'Cal-Har': 69.90867530727269, 'Dav-Bold': 1.637017876621227}
Inner (765): {'Silh': 0.13823275074674224, 'Cal-Har': 44.432973449387084, 'Dav-Bold': 2.115450457714017}
