In [7]:
import os

# Subject folder and experiment name; the experiment directory is nested under the subject.
subj, exp = "Subj1", "exp_dimreduct_algos"

# exist_ok keeps the cell re-runnable when the directory is already there.
exp_dir = f"{subj}/{exp}"
os.makedirs(exp_dir, exist_ok = True)

In [8]:
import SDA
import SDA.analytics
import SDA.clustquality

import umap
import tqdm
import numpy
import pandas
import sklearn.manifold
import sklearn.preprocessing
import sklearn.decomposition

In [9]:
# Input locations, both relative to the subject folder.
edges_path = f"{subj}/reproduction/internal/best_edges.txt"
features_path = f'{subj}/exp_final/all_features.feather'

# Reference edges are read from a text file and cast to 32-bit integers;
# they are used later as the ground truth for cluster_metrics_ground.
edges_true = numpy.loadtxt(edges_path).astype(numpy.int32)

# Full feature table, one row per sample.
features = pandas.read_feather(features_path)
print(features.shape)

(1046, 19563)


In [10]:
# Standardize the features with StandardScaler. The result replaces `features`
# (fit_transform returns an array, so the DataFrame is no longer available afterwards).
scaler = sklearn.preprocessing.StandardScaler()
features = scaler.fit_transform(features)
print(features.shape)

(1046, 19563)


In [11]:
# Target dimensionalities tried for every reduction method.
n_components = [ 8, 16, 32, 48, 64 ]


def _make_pca(dim):
    """Build a PCA reducer with `dim` components, full SVD solver and seed 42."""
    return sklearn.decomposition.PCA(n_components = dim, svd_solver = "full", random_state = 42)


def _make_tsne(dim):
    """Build a t-SNE reducer with `dim` components, exact method, PCA init and seed 42."""
    return sklearn.manifold.TSNE(n_components = dim, method = 'exact', init = 'pca', random_state = 42)


def _make_umap(dim):
    """Build a UMAP reducer with `dim` components and seed 42."""
    return umap.UMAP(n_components = dim, random_state = 42)


# Grouped by method (all PCA models, then all t-SNE, then all UMAP),
# each group ordered by increasing number of components.
algorithms = [
    build(dim)
    for build in (_make_pca, _make_tsne, _make_umap)
    for dim in n_components
]
print(algorithms)

[PCA(n_components=8, random_state=42, svd_solver='full'), PCA(n_components=16, random_state=42, svd_solver='full'), PCA(n_components=32, random_state=42, svd_solver='full'), PCA(n_components=48, random_state=42, svd_solver='full'), PCA(n_components=64, random_state=42, svd_solver='full'), TSNE(method='exact', n_components=8, random_state=42), TSNE(method='exact', n_components=16, random_state=42), TSNE(method='exact', n_components=32, random_state=42), TSNE(method='exact', n_components=48, random_state=42), TSNE(method='exact', n_components=64, random_state=42), UMAP(n_components=8, random_state=42), UMAP(n_components=16, random_state=42), UMAP(n_components=32, random_state=42), UMAP(n_components=48, random_state=42), UMAP(n_components=64, random_state=42)]


In [12]:
# For every dimensionality-reduction model:
#   1. embed the standardized features,
#   2. run SDA on the embedding,
#   3. score each row's 'St_edges' against the reference edges,
#   4. keep the best row under each selection key.
# `stats` ends up with two entries per model (one per key), in model order.
stats = [ ]
for algorithm in algorithms:
    features_reduced = algorithm.fit_transform(features)
    # NOTE(review): scale = False presumably disables SDA's own scaling because the
    # features were standardized in an earlier cell - confirm against SDA docs.
    result, df_st_edges = SDA.SDA(scale = False).apply(features_reduced)

    metrics = [
        SDA.clustquality.cluster_metrics_ground(edges_true, row)
        for row in result['St_edges']
    ]
    # Reuse result's index so the column-wise concat lines rows up positionally
    # even if `result` does not carry a default RangeIndex.
    result = pandas.concat([ result, pandas.DataFrame(metrics, index = result.index) ], axis = 1)

    # Same selection procedure for both criteria; only the ranking key differs.
    # NOTE(review): n_stages = 9 is presumably the expected number of stages - confirm.
    for key in ('Avg-Silh', 'FMI'):
        best_result = SDA.analytics.best_result(result, key = key, n_stages = 9)
        best_result["Algorithm"] = algorithm
        best_result["Key"] = key
        stats.append(best_result)

# Keep only the identifying columns and the quality metrics, in a fixed order.
df = pandas.DataFrame(stats)[["Algorithm", "Key", "Ward_dist", "Cen_dist", "Avg-Silh", "Avg-Cal-Har", "Avg-Dav-Bold", "AMI", "ARI", "FMI"]]
df

Applying to 1046 samples with 8 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 16 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 32 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 48 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 64 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 8 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 16 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 32 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 48 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Applying to 1046 samples with 64 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 8 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 16 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 32 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 48 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Applying to 1046 samples with 64 features each
Running stage 1


  0%|          | 0/589 [00:00<?, ?it/s]

Running stage 2


  0%|          | 0/672 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Key,Ward_dist,Cen_dist,Avg-Silh,Avg-Cal-Har,Avg-Dav-Bold,AMI,ARI,FMI
0,"PCA(n_components=8, random_state=42, svd_solve...",Avg-Silh,150065.458276,49.076949,0.165358,58.747339,2.149554,0.853418,0.711349,0.75205
1,"PCA(n_components=8, random_state=42, svd_solve...",FMI,135808.815872,45.248494,0.137871,50.785711,2.275257,0.859598,0.757105,0.791641
2,"PCA(n_components=16, random_state=42, svd_solv...",Avg-Silh,142227.775848,47.47332,0.114551,38.14509,2.739588,0.827798,0.690648,0.733995
3,"PCA(n_components=16, random_state=42, svd_solv...",FMI,138204.820394,46.349884,0.101445,36.266941,2.653897,0.877463,0.770476,0.803083
4,"PCA(n_components=32, random_state=42, svd_solv...",Avg-Silh,148097.737653,49.755891,0.090978,27.861573,3.081275,0.815774,0.641759,0.691585
5,"PCA(n_components=32, random_state=42, svd_solv...",FMI,140526.742128,46.479713,0.075302,25.896323,3.320404,0.856699,0.762565,0.796192
6,"PCA(n_components=48, random_state=42, svd_solv...",Avg-Silh,138280.003382,50.275295,0.084795,21.451342,3.396575,0.760549,0.557424,0.619441
7,"PCA(n_components=48, random_state=42, svd_solv...",FMI,142407.39199,46.782563,0.063784,21.485278,3.548642,0.845728,0.737471,0.774698
8,"PCA(n_components=64, random_state=42, svd_solv...",Avg-Silh,155106.694643,50.402654,0.068443,20.688588,3.613995,0.837377,0.673848,0.719434
9,"PCA(n_components=64, random_state=42, svd_solv...",FMI,128148.781924,48.051903,0.060771,16.85305,3.746432,0.834766,0.761691,0.795696
