In [43]:
import os

# Subject folder and experiment name; together they form the output directory.
subj, exp = "Subj1", "exp_filter"

# exist_ok = True: no error is raised when the directory is already there,
# so the cell can be re-run safely.
output_dir = f"{subj}/{exp}"
os.makedirs(output_dir, exist_ok = True)

In [44]:
import SDA
import SDA.analytics
import SDA.clustquality

import umap
import tqdm
import numpy
import pandas
import sklearn.manifold
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.feature_selection

In [45]:
# Reference edges read from a text file and cast to int32.
# NOTE(review): the name suggests these are ground-truth edges — confirm.
edges_path = f"{subj}/reproduction/internal/best_edges.txt"
edges_true = numpy.loadtxt(edges_path).astype(numpy.int32)

# Two feature tables, both read from the exp_final folder of the subject.
channel_features = pandas.read_feather(f'{subj}/exp_final/channel_features.feather')
overall_features = pandas.read_feather(f'{subj}/exp_final/overall_features.feather')
for table in (channel_features, overall_features):
    print(table.shape)

# Place the two tables side by side (axis = 1 concatenates along columns).
features = pandas.concat([ channel_features, overall_features ], axis = 1)
print(features.shape)

(1046, 8400)
(1046, 291)
(1046, 8691)


In [46]:
# Remove duplicated feature columns.
# Work on the transpose so that every original column becomes one row.
features_T = features.to_numpy().T

# Reinterpret each (contiguous) row as a single opaque void scalar whose size
# is the byte length of the whole row; numpy.unique can then compare rows as
# atomic items.
row_nbytes = features_T.dtype.itemsize * features_T.shape[1]
row_as_void = numpy.dtype((numpy.void, row_nbytes))
packed_rows = numpy.ascontiguousarray(features_T).view(row_as_void)

# numpy.unique returns the sorted unique items, so the surviving columns are
# NOT in their original order and column labels are no longer available.
unique_rows = numpy.unique(packed_rows)

# Reinterpret the bytes with the original dtype, restore the 2-D layout and
# transpose back to (rows of `features`, unique columns).
features_nondup = unique_rows.view(features_T.dtype).reshape(-1, features_T.shape[1]).T
features_nondup.shape

(1046, 7141)

In [47]:
# Standardise every column with a freshly fitted StandardScaler.
# The name `features_nondup` is rebound to the scaled matrix, which is what
# the following cells consume.
scaler = sklearn.preprocessing.StandardScaler()
features_nondup = scaler.fit_transform(features_nondup)
print(features_nondup.shape)

(1046, 7141)


In [48]:
# Target dimensionalities tried for every reduction method.
n_components = [ 8, 16, 32, 48, 64 ]

# One PCA estimator per dimensionality (full SVD solver, fixed seed).
pca_reducers = [
    sklearn.decomposition.PCA(n_components = dim, svd_solver = "full", random_state = 42)
    for dim in n_components
]

# One UMAP estimator per dimensionality (fixed seed).
umap_reducers = [
    umap.UMAP(n_components = dim, random_state = 42)
    for dim in n_components
]

# All PCA candidates first, then all UMAP candidates.
algorithms = pca_reducers + umap_reducers
print(algorithms)

[PCA(n_components=8, random_state=42, svd_solver='full'), PCA(n_components=16, random_state=42, svd_solver='full'), PCA(n_components=32, random_state=42, svd_solver='full'), PCA(n_components=48, random_state=42, svd_solver='full'), PCA(n_components=64, random_state=42, svd_solver='full'), UMAP(n_components=8, random_state=42), UMAP(n_components=16, random_state=42), UMAP(n_components=32, random_state=42), UMAP(n_components=48, random_state=42), UMAP(n_components=64, random_state=42)]


In [49]:
# Evaluate every dimensionality-reduction candidate:
#   1. embed the feature matrix with the candidate,
#   2. run SDA on the embedding,
#   3. score each entry of result['St_edges'] against `edges_true`,
#   4. keep the best row under each selection key.
# `stats` collects one record per (algorithm, key); `df` is the report table.

# Metrics used to pick the best row; the order here is the row order per
# algorithm in `stats`.
SELECTION_KEYS = ( 'Avg-Silh', 'FMI' )
# Stage count handed to SDA.analytics.best_result for every selection.
N_STAGES = 9
REPORT_COLUMNS = [
    "Algorithm", "Key",
    "Ward_dist", "Cen_dist", "Avg-Silh", "Avg-Cal-Har", "Avg-Dav-Bold",
    "AMI", "ARI", "FMI",
]

stats = [ ]
for algorithm in algorithms:
    features_reduced = algorithm.fit_transform(features_nondup)
    # SDA's own scaling is switched off (scale = False).
    result, df_st_edges = SDA.SDA(scale = False).apply(features_reduced)

    # One metrics record per entry of result['St_edges'].
    # NOTE(review): cluster_metrics_ground presumably compares the given
    # edges with the reference `edges_true` — confirm against SDA.
    metrics = [
        SDA.clustquality.cluster_metrics_ground(edges_true, row)
        for row in result['St_edges']
    ]
    # concat(axis = 1) aligns on index labels. Building the metrics frame on
    # result's own index guarantees row i of the metrics is attached to row i
    # of `result`, even if `result` does not carry a default RangeIndex.
    metrics_frame = pandas.DataFrame(metrics, index = result.index)
    result = pandas.concat([ result, metrics_frame ], axis = 1)

    for key in SELECTION_KEYS:
        best_result = SDA.analytics.best_result(result, key = key, n_stages = N_STAGES)
        # Tag the record with the estimator that produced it and the metric
        # that selected it.
        best_result["Algorithm"] = algorithm
        best_result["Key"] = key
        stats.append(best_result)

df = pandas.DataFrame(stats)[REPORT_COLUMNS]
df

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 8 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 16 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 32 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 48 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 64 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Key,Ward_dist,Cen_dist,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold,AMI,ARI,FMI
0,"UMAP(n_components=8, n_jobs=1, random_state=42...",Avg-Silh,258.178944,2.024138,0.299268,155.825733,1.359104,0.82702,0.665484,0.712177
1,"UMAP(n_components=8, n_jobs=1, random_state=42...",FMI,257.091097,1.995148,0.294314,154.410997,1.416181,0.834675,0.670428,0.716543
2,"UMAP(n_components=16, n_jobs=1, random_state=4...",Avg-Silh,260.652153,2.017073,0.305923,156.007292,1.401082,0.880943,0.754969,0.790308
3,"UMAP(n_components=16, n_jobs=1, random_state=4...",FMI,260.652153,2.017073,0.305923,156.007292,1.401082,0.880943,0.754969,0.790308
4,"UMAP(n_components=32, n_jobs=1, random_state=4...",Avg-Silh,261.635709,2.029388,0.299366,157.204852,1.380072,0.84089,0.704696,0.746476
5,"UMAP(n_components=32, n_jobs=1, random_state=4...",FMI,264.640447,2.018916,0.299256,158.10638,1.412796,0.860918,0.723428,0.762478
6,"UMAP(n_components=48, n_jobs=1, random_state=4...",Avg-Silh,262.531906,2.07777,0.325555,139.715037,1.246078,0.826219,0.674514,0.724344
7,"UMAP(n_components=48, n_jobs=1, random_state=4...",FMI,256.98907,1.999254,0.298823,153.470188,1.415406,0.867155,0.742414,0.7797
8,"UMAP(n_components=64, n_jobs=1, random_state=4...",Avg-Silh,260.002456,2.016642,0.295091,156.663437,1.38824,0.811397,0.647019,0.696165
9,"UMAP(n_components=64, n_jobs=1, random_state=4...",FMI,257.634033,2.000203,0.288737,153.454931,1.403112,0.818665,0.664799,0.711599
