In [1]:
# -*- coding: utf-8 -*-
##############
#  Packages  #
##############
import os
import sys 


from pathlib import Path
import seaborn as sns
import numpy as np
import plotly.express as px
from plotly.offline import plot
import plotly.io as pio

import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
#from umap import UMAP

from tqdm import tqdm


# Show plotly figures in the web browser. The default renderer is chosen via
# the `default` attribute of plotly's renderer configuration; rebinding
# `pio.renderers` itself to a string would not select any renderer.
pio.renderers.default = "browser"

##################
#      Imports   #
##################
# Root folder of the project. Set the GEOM_STAT_ROOT environment variable to
# run the notebook from another checkout/machine; when it is unset the
# original hard-coded location is used.
root_path = Path(os.environ.get("GEOM_STAT_ROOT", "C:/Users/Charles/Desktop/MVA/GDA/Geom_stat/"))
# Put the project root first on the import path so that the project module
# imported right below (utils_tda_and_clustering) can be found.
sys.path.insert(0, str(root_path))

from utils_tda_and_clustering import (
    homology_parquet_to_matrix_bootstraps,
    make_pca_bootstraps,
    get_clusters,
    transform_gleason,
    meta_clustering,
    choose_representative_bootstrap,
    get_meta_bootstraps
)

# Sub-directories of the project root: "raw_images" and "outputs"
# (`saving_path` is where the homology parquet file is read from below).
img_path = root_path / "raw_images"
saving_path = root_path / "outputs"

In [2]:
# Stem of the parquet file to read from `saving_path`.
base_name = "b1"
base_path = str(saving_path / f"{base_name}.parquet")
# Project helper from utils_tda_and_clustering; its three results are bound
# to the identification frame, the bootstraps and the original sample.
(
    df_ident,
    bootstraps,
    original,
) = homology_parquet_to_matrix_bootstraps(base_path)

In [3]:
# Reload previously pickled objects from the current working directory.
# NOTE(review): these assignments replace the df_ident / bootstraps / original
# produced by the parquet-loading cell above — confirm which source is meant
# to feed the rest of the notebook.
# pickle can execute arbitrary code while loading: only read files that were
# produced by a trusted run of this project.
df_ident = pickle.loads(Path("df_ident.pickle").read_bytes())
bootstraps = pickle.loads(Path("bootstraps.pickle").read_bytes())
original = pickle.loads(Path("original.pickle").read_bytes())

In [4]:
# Call the project helper `make_pca_bootstraps` on the bootstraps and the
# original sample; it returns three values, unpacked below.
# NOTE(review): `vars_explained` is presumably the variance explained by the
# PCA components — confirm in utils_tda_and_clustering.
reduced_bootstraps, reduced_original, vars_explained = make_pca_bootstraps(bootstraps, original)

In [5]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist
from gap_statistic import OptimalK

def optimalK(reduced_bootstraps, reduced_original, minClusters=2, maxClusters=10):
    """
    Choose a number of clusters with a gap-style statistic on Ward clusterings.

    For every k in ``range(minClusters, maxClusters)`` each bootstrap and the
    original data are clustered with Ward-linkage agglomerative clustering.
    The within-cluster dispersion W_k of a clustering is the sum, over its
    clusters, of the summed pairwise Euclidean distances divided by
    ``2 * cluster_size``. The gap for k is
    ``mean(log(W_k of each bootstrap)) - log(W_k of the original)``.

    Params:
        reduced_bootstraps: sized sequence of mappings; each one exposes the
            points to cluster under the key "reduced" as an array-like of
            shape (n_samples, n_features)
        reduced_original: mapping with the same "reduced" entry for the
            original data
        minClusters: smallest number of clusters to test
        maxClusters: exclusive upper bound of the numbers of clusters to test
    Returns: (optimalK, gaps_by_k) where optimalK is the tested k with the
        largest gap and gaps_by_k is a dict mapping every tested k to its gap
    Raises:
        ValueError: if ``range(minClusters, maxClusters)`` is empty
    """
    def _dispersion(points, labels, n_clusters):
        """Return W_k: sum over clusters of pairwise distances / (2 * size)."""
        points = np.asarray(points)
        labels = np.asarray(labels)
        per_cluster = []
        for label in range(n_clusters):
            members = points[labels == label]
            if len(members) < 2:
                # No pair of points, so nothing to add for this cluster.
                per_cluster.append(0.0)
                continue
            per_cluster.append(np.sum(pdist(members, 'euclidean')) / (2 * len(members)))
        return np.sum(per_cluster)

    candidate_ks = range(minClusters, maxClusters)
    if len(candidate_ks) == 0:
        # Fail early with a clear message instead of inside argmax() below.
        raise ValueError("optimalK needs minClusters < maxClusters to have at least one k to test")

    gaps = np.zeros(len(candidate_ks))
    resultsdf = {}
    for gap_index, k in enumerate(candidate_ks):
        # One dispersion per bootstrap. The holder is sized from the argument
        # itself, not from a notebook-level variable.
        refDisps = np.zeros(len(reduced_bootstraps))
        # enumerate(tqdm(...)) lets the progress bar know the total length.
        for i, b in enumerate(tqdm(reduced_bootstraps)):
            clustering_model = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
            clustering_model.fit(b["reduced"])
            # The value of W_k for one of our bootstraps
            refDisps[i] = _dispersion(b["reduced"], clustering_model.labels_, k)

        clustering_model = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
        clustering_model.fit(reduced_original["reduced"])
        origDisp = _dispersion(reduced_original["reduced"], clustering_model.labels_, k)

        # Gap for this k: mean log-dispersion of the bootstraps minus the
        # log-dispersion of the original data.
        gap = np.mean(np.log(refDisps)) - np.log(origDisp)
        gaps[gap_index] = gap
        resultsdf[k] = gap

    # argmax is an index into the tested range, so shift it back to a k value.
    return (gaps.argmax() + minClusters, resultsdf)

In [6]:
# optimalK(reduced_bootstraps, reduced_original)

In [7]:
# Call the project helper `get_clusters` on the reduced bootstraps and the
# reduced original sample.
# NOTE(review): neither `clustered_bootstraps` nor `clustered_original` is
# read again in the cells visible here — confirm whether get_clusters is
# called for a side effect on its inputs or whether later cells should
# consume these results.
clustered_bootstraps, clustered_original = get_clusters(reduced_bootstraps, reduced_original)

In [8]:
from utils_tda_and_clustering import transform_gleason

In [9]:
# NOTE(review): this passes `bootstraps`, not the `clustered_bootstraps`
# returned by get_clusters; the following cell reads b["clusters"] and
# b["gleason_coords"] from every element of the result — confirm that
# `bootstraps` already carries the cluster data at this point.
gleason_bootstraps = transform_gleason(bootstraps)

In [10]:
# Concatenate the "clusters" list of every bootstrap into one flat list,
# keeping bootstrap order.
clusters = [cluster for b in gleason_bootstraps for cluster in b["clusters"]]
# Stack the "gleason_coords" of all bootstraps and view them as rows of 3 values.
gleason_points = np.array(
    [b["gleason_coords"] for b in gleason_bootstraps]
).reshape(-1, 3)

In [11]:
# `meta_clustering` (project helper) is used below as an iterable of index
# collections into `gleason_points` / `clusters`.
meta_clusters_indices = meta_clustering(gleason_points)
# Replace every index by the cluster it refers to, keeping the grouping.
meta_clusters = [
    [clusters[idx] for idx in members]
    for members in meta_clusters_indices
]

In [12]:
# Rows of `gleason_points` grouped the same way as `meta_clusters_indices`.
meta_clusters_gleason_coords = [
    [gleason_points[idx] for idx in members]
    for members in meta_clusters_indices
]
representative_bootstrap = choose_representative_bootstrap(
    gleason_bootstraps,
    meta_clusters_gleason_coords,
)

In [13]:
# Build `meta_bootstraps` from the grouped clusters with the project helper
# `get_meta_bootstraps`; the stability loop further down iterates over it.
meta_bootstraps = get_meta_bootstraps(meta_clusters)

In [14]:
# Flatten the meta-clusters into a single list of clusters, preserving the
# meta-cluster order and the order inside each meta-cluster.
meta_clusters_fused = [cluster for group in meta_clusters for cluster in group]

In [15]:
from utils_tda_and_clustering import get_stability
# One stability value per meta-bootstrap, each computed against the flattened
# cluster list. This is slow: the recorded run below took about 2h49 for
# 1000 iterations.
stabilities = [get_stability(b, meta_clusters_fused) for b in tqdm(meta_bootstraps)]

print(np.mean(stabilities), np.std(stabilities))

100%|██████████| 1000/1000 [2:48:48<00:00, 10.13s/it] 

0.25423914060456126 0.010459507106258568





In [16]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist
from gap_statistic import OptimalK

def optimalK_meta(data, n_bootstraps = 100, minClusters=1, maxClusters=10, random_state=None):
    """
    Gap-style statistic of Ward clusterings against bootstrap resamples of ``data``.

    For every k in ``range(minClusters, maxClusters)``, ``n_bootstraps``
    resamples of the rows of ``data`` (drawn with replacement, same size as
    ``data``) and ``data`` itself are clustered with Ward-linkage
    agglomerative clustering. The within-cluster dispersion W_k of a
    clustering is the sum, over its clusters, of the summed pairwise Euclidean
    distances divided by ``2 * cluster_size``. The gap for k is
    ``mean(log(W_k of each resample)) - log(W_k of data)``.

    Params:
        data: array-like of shape (n_samples, n_features)
        n_bootstraps: number of resamples drawn for every tested k
        minClusters: smallest number of clusters to test
        maxClusters: exclusive upper bound of the numbers of clusters to test
        random_state: None (default) draws from NumPy's global random state,
            as before, so ``np.random.seed`` still controls the result; an int
            seed or a ``np.random.Generator`` makes the call reproducible on
            its own
    Returns: (gaps, gaps_sds), two arrays with one entry per tested k: the gap
        and the standard deviation of the resamples' log-dispersions
    """
    def _dispersion(points, labels, n_clusters):
        """Return W_k: sum over clusters of pairwise distances / (2 * size)."""
        labels = np.asarray(labels)
        per_cluster = []
        for label in range(n_clusters):
            members = points[labels == label]
            if len(members) < 2:
                # No pair of points, so nothing to add for this cluster.
                per_cluster.append(0.0)
                continue
            per_cluster.append(np.sum(pdist(members, 'euclidean')) / (2 * len(members)))
        return np.sum(per_cluster)

    # Fancy indexing below needs an ndarray; this also accepts nested lists.
    data = np.asarray(data)
    n_samples = len(data)
    # `np.random` (global state) and a Generator both provide choice().
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    candidate_ks = range(minClusters, maxClusters)
    gaps = np.zeros(len(candidate_ks))
    gaps_sds = np.zeros_like(gaps)
    for gap_index, k in enumerate(candidate_ks):
        # Log-dispersion of every resample for this k.
        refDisps = np.zeros(n_bootstraps)
        for i in range(n_bootstraps):
            resample = data[rng.choice(n_samples, n_samples, replace=True)]

            clustering_model = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
            clustering_model.fit(resample)
            # The value of log(W_k) for one of our bootstraps
            refDisps[i] = np.log(_dispersion(resample, clustering_model.labels_, k))

        clustering_model = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
        clustering_model.fit(data)
        origDisp = _dispersion(data, clustering_model.labels_, k)

        # Gap for this k and spread of the resampled log-dispersions.
        gaps[gap_index] = np.mean(refDisps) - np.log(origDisp)
        gaps_sds[gap_index] = np.std(refDisps)

    return (gaps, gaps_sds)