In [1]:
import os

# Experiment configuration: the experiment name and the subject folder it belongs to.
exp = "exp_dimreduct_algos"
subj = "Subj1"

# Create the per-experiment folder; an already existing folder is not an error.
os.makedirs(name = f"{subj}/{exp}", exist_ok = True)

In [2]:
import SDA
import SDA.analytics
import SDA.clustquality

import umap
import tqdm
import numpy
import pandas
import sklearn.manifold
import sklearn.preprocessing
import sklearn.decomposition

In [3]:
# Reference edges: read from a plain-text file and cast to 32-bit integers.
edges_path = f"{subj}/reproduction/internal/best_edges.txt"
edges_true = numpy.loadtxt(edges_path).astype(numpy.int32)

# Feature table stored in Feather format under the subject's exp_final folder.
features_path = f'{subj}/exp_final/all_features.feather'
features = pandas.read_feather(features_path)
print(features.shape)

(1046, 19563)


In [4]:
# Standardize the features with a freshly fitted StandardScaler.
# The name `features` is rebound to the scaled output, which later cells consume.
scaler = sklearn.preprocessing.StandardScaler()
features = scaler.fit_transform(features)
print(features.shape)

(1046, 19563)


In [5]:
# Target dimensionalities tried for every reduction method.
n_components = [ 8, 16, 32, 48, 64 ]

# One estimator per (method, dimensionality) pair; every estimator uses the same seed.
pca_models = [
    sklearn.decomposition.PCA(n_components = dim, svd_solver = "full", random_state = 42)
    for dim in n_components
]
tsne_models = [
    sklearn.manifold.TSNE(n_components = dim, method = 'exact', init = 'pca', random_state = 42)
    for dim in n_components
]
umap_models = [
    umap.UMAP(n_components = dim, random_state = 42)
    for dim in n_components
]

# Order matters for the printed list and for the evaluation loop: PCA, then t-SNE, then UMAP.
algorithms = pca_models + tsne_models + umap_models
print(algorithms)

[PCA(n_components=8, random_state=42, svd_solver='full'), PCA(n_components=16, random_state=42, svd_solver='full'), PCA(n_components=32, random_state=42, svd_solver='full'), PCA(n_components=48, random_state=42, svd_solver='full'), PCA(n_components=64, random_state=42, svd_solver='full'), TSNE(method='exact', n_components=8, random_state=42), TSNE(method='exact', n_components=16, random_state=42), TSNE(method='exact', n_components=32, random_state=42), TSNE(method='exact', n_components=48, random_state=42), TSNE(method='exact', n_components=64, random_state=42), UMAP(n_components=8, random_state=42), UMAP(n_components=16, random_state=42), UMAP(n_components=32, random_state=42), UMAP(n_components=48, random_state=42), UMAP(n_components=64, random_state=42)]


In [6]:
# For every dimensionality-reduction estimator: reduce the features, run SDA on the
# reduced data, score each candidate partition against the reference edges, and keep
# the best partition under two selection keys ('Avg-Silh' and 'FMI').
stats = [ ]
for algorithm in algorithms:
    features_reduced = algorithm.fit_transform(features)
    result, df_st_edges = SDA.SDA(scale = False).apply(features_reduced)

    # One ground-truth comparison per candidate row, in the same order as `result`.
    metrics = [
        SDA.clustquality.cluster_metrics_ground(edges_true, row)
        for row in result['St_edges']
    ]
    # concat(axis = 1) aligns on index labels; building the metrics frame on
    # result.index keeps the join strictly row-by-row even if `result` does not
    # carry a default 0..n-1 index (otherwise rows would misalign and yield NaNs).
    result = pandas.concat([ result, pandas.DataFrame(metrics, index = result.index) ], axis = 1)

    # NOTE(review): n_stages = 9 is hard-coded; presumably it matches the number of
    # stages implied by edges_true — confirm.
    for key in ('Avg-Silh', 'FMI'):
        best_result = SDA.analytics.best_result(result, key = key, n_stages = 9)
        best_result["Algorithm"] = algorithm
        best_result["Key"] = key
        stats.append(best_result)

# Summary table: two rows per estimator (one per selection key), fixed column order.
df = pandas.DataFrame(stats)[["Algorithm", "Key", "Ward_dist", "Cen_dist", "Avg-Silh", "Avg-Cal-Har", "Avg-Dav-Bold", "AMI", "ARI", "FMI"]]
df

Applying to 1046 samples with 8 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

KeyboardInterrupt: 