In [1]:
import os

# `subj` and `exp` are used as path components throughout the notebook:
# inputs are read from under `subj`, outputs are written to `subj/exp`.
subj = "Subj1"
exp = "exp_feature_selection"

# Create the output directory up front; exist_ok makes re-running this cell harmless.
output_dir = f"{subj}/{exp}"
os.makedirs(output_dir, exist_ok = True)

In [2]:
%load_ext autoreload
%autoreload 2

import warnings
import itertools
warnings.filterwarnings('ignore')

import SDA
import SDA.analytics
import SDA.clustquality

import umap
import tqdm
import numpy
import pandas
import sklearn.manifold
import sklearn.preprocessing
import sklearn.decomposition
import tqdm.contrib.itertools
import sklearn.feature_selection

In [3]:
# Edge indices loaded from a text file and cast to int32.
# NOTE(review): `edges_true` is not used in the cells visible here — presumably it is
# the reference segmentation compared against SDA output later; confirm.
edges_true = numpy.loadtxt(f"{subj}/reproduction/internal/best_edges.txt").astype(numpy.int32)

# Two feature tables produced by the `exp_final_filtered` step.
channel_features = pandas.read_feather(f'{subj}/exp_final_filtered/channel_features.feather')
overall_features = pandas.read_feather(f'{subj}/exp_final_filtered/overall_features.feather')
print(channel_features.shape)
print(overall_features.shape)

# Join the two tables column-wise into a single feature matrix.
features = pandas.concat((channel_features, overall_features), axis = 1)

# concat(axis=1) aligns on the index and silently outer-joins rows that do not match,
# padding them with NaN. Fail here rather than deep inside UMAP/SDA if that happened.
assert len(features) == len(channel_features) == len(overall_features), (
    "channel_features and overall_features are not row-aligned: "
    f"{len(channel_features)} and {len(overall_features)} rows became {len(features)} after concat"
)
print(features.shape)

(1046, 3680)
(1046, 119)
(1046, 3799)


In [4]:
# Embed the combined feature matrix into 15 UMAP components.
# random_state is fixed so the reducer is seeded identically on every run.
reducer = umap.UMAP(n_components = 15, random_state = 42)
features_reduced = reducer.fit_transform(features)
features_reduced.shape

(1046, 15)

In [5]:
# Baseline SDA run on the UMAP embedding: scaling is disabled and every other
# SDA parameter is left at its default.
target_sda = SDA.SDA(scale = False, verbose = True)
target_result, target_df_st_edges = target_sda.apply(features_reduced)

# Show the best rows of the result table, selected with key = 'Avg-Silh'.
SDA.analytics.best_results(target_result, key = 'Avg-Silh')

Applying to 1046 samples with 15 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Unnamed: 0,St_len_min,K_nb_max,N_cl_max,N_stages,Cl_cen,St_edges,Ward_dist,Cen_dist,Silh,Cal-Har,Dav-Bold,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold
0,60,35,10,3,Mode,"[0, 259, 525, 1046]",1536.99465,2.709518,0.165418,586.461732,4.473386,0.283651,412.201524,3.522883
1,60,35,15,4,Median,"[0, 259, 525, 681, 1046]",1082.23184,2.316227,0.150209,406.815528,5.59698,0.233686,287.530227,2.59554
2,60,35,10,5,Mode,"[0, 194, 259, 525, 659, 1046]",277.767517,1.903827,0.03856,320.107686,5.066319,0.201705,65.41794,2.232126
3,40,40,20,6,Median,"[0, 101, 260, 553, 666, 855, 1046]",508.184206,2.069583,-0.00523,276.73926,5.929662,0.200428,128.184204,1.995328
4,20,40,10,7,Median,"[0, 101, 238, 273, 525, 681, 855, 1046]",137.453638,1.604573,9.5e-05,226.973225,6.382335,0.184828,40.085052,2.452417
5,40,40,20,8,Median,"[0, 101, 259, 326, 490, 554, 666, 855, 1046]",198.58623,1.799726,-0.054832,211.094161,4.687357,0.173856,57.65852,2.18649
6,40,35,20,9,Mode,"[0, 101, 259, 319, 490, 554, 659, 779, 855, 1046]",155.792017,1.649132,-0.049132,185.093249,6.057957,0.165211,46.746504,2.629805
7,40,40,20,10,Median,"[0, 101, 194, 259, 326, 490, 554, 666, 779, 85...",116.544961,1.511615,-0.067928,169.101505,6.016909,0.15039,35.269121,2.645584
8,60,50,15,11,Median,"[0, 101, 194, 238, 260, 320, 490, 554, 659, 77...",79.879499,1.359525,-0.068605,150.006031,5.834716,0.135611,25.643789,2.90055
9,60,50,20,12,Median,"[0, 101, 194, 259, 273, 326, 362, 490, 554, 66...",76.717539,1.379763,-0.074898,141.024926,5.900584,0.131561,25.133071,3.04225


In [6]:
# Build the list of SDA keyword-argument dicts to sweep over.
#
# Each "*_thr_attempts" entry is itself a list that is passed to SDA unchanged;
# only single-element threshold lists are enabled in this sweep.
len_st_thr_attempts = [ [ 20 ], [ 40 ] ]
n_cl_max_thr_attempts = [ [ 10 ], [ 20 ] ]
k_neighb_max_thr_attempts = [ [ 35 ], [ 40 ], [ 45 ], [ 50 ] ]

# Coarse grids for the lower bounds (a dense sweep would be range(2, 21) and
# range(20, 51) respectively).
n_clusters_min_grid = [ 2, 7, 12 ]
k_neighbours_min_grid = [ 20, 30, 40 ]

params = [ ]
outer_grid = itertools.product(
    n_clusters_min_grid,
    k_neighbours_min_grid,
    len_st_thr_attempts,
    n_cl_max_thr_attempts,
    k_neighb_max_thr_attempts
)
for n_clusters_min, k_neighbours_min, len_st_thr, n_cl_max_thr, k_neighb_max_thr in outer_grid:
    # Upper cluster bounds are drawn from {5, 10, 15}, keeping only those that
    # are not below the current lower bound.
    n_clusters_max_attempts = [ cap for cap in (5, 10, 15) if n_clusters_min <= cap ]

    for n_clusters_max in n_clusters_max_attempts:
        # Upper neighbour bound walks from the lower bound up to 50 in steps of 10.
        for k_neighbours_max in range(k_neighbours_min, 51, 10):
            params.append({
                'scale': False,

                'n_clusters_min': n_clusters_min,
                'n_clusters_max': n_clusters_max,
                'k_neighbours_min': k_neighbours_min,
                'k_neighbours_max': k_neighbours_max,
                'len_st_thr': len_st_thr,

                'n_cl_max_thr': n_cl_max_thr,
                'k_neighb_max_thr': k_neighb_max_thr,
                # Edge-cluster count is pinned to exactly 8 for every configuration.
                'n_edge_clusters_min': 8,
                'n_edge_clusters_max': 8
            })

print(len(params))

864


In [7]:
# Run SDA once per parameter set and keep, for each run, the best 9-stage result
# selected with key = 'Avg-Silh'.
#
# The sweep takes hours (roughly 20 s per configuration), so results are
# checkpointed to disk periodically and are always saved when the loop stops,
# including on a manual interrupt or an exception in one of the runs.
results_path = f"{subj}/{exp}/results.csv"
checkpoint_every = 20  # configurations between intermediate saves

records = [ ]
try:
    for n_done, param in enumerate(tqdm.tqdm(params), start = 1):
        result, df_st_edges = SDA.SDA(**param, verbose = False).apply(features_reduced)
        best_result = SDA.analytics.best_result(result, key = 'Avg-Silh', n_stages = 9)
        records.append(dict(**param, **best_result))

        # Intermediate save guards against losing everything if the kernel dies.
        if n_done % checkpoint_every == 0:
            pandas.DataFrame(records).to_csv(results_path)
finally:
    # Runs on normal completion, KeyboardInterrupt and errors alike, so `results`
    # is always a DataFrame holding whatever was computed so far.
    results = pandas.DataFrame(records)
    if len(results) > 0:
        # Skip the write when nothing was computed, so an earlier results file
        # is not clobbered by an empty one.
        results.to_csv(results_path)
    if len(records) < len(params):
        print(f"Stopped early: {len(records)} of {len(params)} configurations saved to {results_path}")

display(results)

 19%|█▉        | 163/864 [57:40<4:08:02, 21.23s/it]


KeyboardInterrupt: 