In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score
from scipy.spatial import ConvexHull
import scipy
import hyperopt

In [None]:
# Suppress pandas' chained-assignment (SettingWithCopy) warnings for the whole session.
pd.options.mode.chained_assignment = None

In [None]:
def optimize_clustering(params, data, true_labels, manual_n_clusters=None):
    """
    Objective function for hyperopt: score one agglomerative-clustering configuration.

    params: dict of sampled hyperparameters with the keys 'linkage', 'affinity' and
            'n_clusters' (supplied via the bayesian_optimization function).
    data: the table to cluster (the tsne dataframe).
    true_labels: a pandas series of the Y column, used as reference for the adjusted Rand index.
    manual_n_clusters: if truthy, used instead of the sampled 'n_clusters'.

    Returns a dict {'loss': ..., 'status': STATUS_OK}. The loss is the negated score, where
    the score is 0.5 * silhouette + 0.5 * ARI; a result with a single cluster gets score -1.
    """
    linkage_method = params['linkage']

    # Ward linkage is always paired with the euclidean metric; the sampled affinity is ignored.
    if linkage_method == 'ward':
        distance_metric = 'euclidean'
    else:
        distance_metric = params['affinity']

    # A manual cluster count takes precedence; otherwise the sampled value is cast to int.
    n_clusters = manual_n_clusters or int(params['n_clusters'])

    print(f'Testing hyperparameter combination: [Linkage: {linkage_method}, Affinity: {distance_metric}, n_clusters: {n_clusters}]')

    model = AgglomerativeClustering(n_clusters=n_clusters, metric=distance_metric, linkage=linkage_method)
    assigned = model.fit_predict(data)

    if np.unique(assigned).size <= 1:
        # Silhouette is not computed for a single cluster; use the worst score instead.
        combined = -1
    else:
        sil = silhouette_score(data, assigned)
        rand_index = adjusted_rand_score(true_labels, assigned)
        combined = 0.5 * sil + 0.5 * rand_index

    # hyperopt minimises the loss, so the score is negated.
    return {'loss': -combined, 'status': STATUS_OK}

In [None]:
def bayesian_optimization(data, true_labels, manual_n_clusters=None, random_number=42, max_evals=200):
    """
    Search for the agglomerative-clustering hyperparameters that minimise the loss
    returned by optimize_clustering, using hyperopt's TPE algorithm.

    data: the tsne dataframe from the following cluster function input
    true_labels: a pandas series of the Y column from the following cluster function input
    manual_n_clusters: if truthy, this cluster count is used for every evaluation and is
                       reported in the result instead of the optimised value.
    random_number: seed for the random generator handed to hyperopt.
    max_evals: number of hyperparameter combinations to evaluate (default 200).

    Returns a dict with the keys 'n_clusters' (int), 'linkage' (str) and 'affinity' (str).
    'affinity' is always 'euclidean' when the selected linkage is 'ward'.
    """
    space = {
        'n_clusters': hp.quniform('n_clusters', 2, 20, 1),
        'affinity': hp.choice('affinity', ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']),
        'linkage': hp.choice('linkage', ['ward', 'complete', 'average', 'single']),
    }

    trials = Trials()
    rng = np.random.default_rng(random_number)

    best = fmin(
        fn=lambda params: optimize_clustering(params, data, true_labels, manual_n_clusters),
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
        rstate=rng,
    )

    # fmin reports hp.choice dimensions as indices into their option lists. space_eval maps
    # them back through the search space itself, so the option lists live in one place only.
    best_point = hyperopt.space_eval(space, best)
    best_linkage = best_point['linkage']

    best_params = {
        'n_clusters': manual_n_clusters if manual_n_clusters else int(best_point['n_clusters']),
        'linkage': best_linkage,
        # Mirrors optimize_clustering: ward linkage is always evaluated with euclidean distances.
        'affinity': 'euclidean' if best_linkage == 'ward' else best_point['affinity'],
    }

    return best_params

In [None]:
def cluster_bootstrap(tsne_df, y_param, manual_param=None, random_number=42):
    """
    Cluster one (resampled) table and score the clustering against the ground truth.

    tsne_df: a pandas dataframe containing only the two columns of tsne components.
             It is not modified; the function works on a private copy.
    y_param: a pandas series (one column) containing the ground truth labels ['environment'] or ['tumtype']
    manual_param: clustering parameters given manually as
                  {'n_clusters': int, 'linkage': str, 'affinity': str}, to skip the optimization.
                  If None, the parameters are found with bayesian_optimization.
    random_number: random state for Bayesian-optimization

    Returns a tuple (ari, sil, opt_params): the adjusted Rand index against y_param, the
    silhouette score of the clustering, and the clustering parameters that were used.
    """
    # Copy the features so no column is ever written into the caller's frame. Writing the
    # labels back would make a second call on the same frame cluster on the label column too.
    features = tsne_df.copy()

    if manual_param is None:
        opt_params = bayesian_optimization(data=features, true_labels=y_param, random_number=random_number)
        print("Optimized Agglomerative clustering parameters: ", opt_params)
    else:
        opt_params = manual_param
        print("Manual Agglomerative clustering parameters: ", opt_params)

    clusterer = AgglomerativeClustering(
        n_clusters=opt_params['n_clusters'],
        metric=opt_params['affinity'],
        linkage=opt_params['linkage'],
    )

    # Shift to 1-based cluster ids (to not start with cluster 0); both scores are
    # unaffected by how the clusters are numbered.
    cluster_labels = clusterer.fit_predict(features) + 1

    le = LabelEncoder()
    encoded_y_param = le.fit_transform(y_param)

    ari = adjusted_rand_score(encoded_y_param, cluster_labels)
    # Score the silhouette on the same matrix the clustering was fitted on.
    sil = silhouette_score(features, cluster_labels)

    return ari, sil, opt_params

In [None]:
def bootstrap(df, Y_param_col_nam:str, manual_param=None, n_iterations=100, random_number=42,
              feature_cols=('principal component 1', 'principal component 2')):
    """
    Repeat the clustering on bootstrap resamples of df and collect the scores.

    df: a pandas df with tsne component columns named as ['principal component 1', 'principal component 2'] and Y column
    Y_param_col_nam: name of the column containing the ground truth (tumtype or environment)
    manual_param: passed through to cluster_bootstrap; if None, the clustering parameters
                  are optimized again in every iteration.
    n_iterations: number of bootstrap resamples to draw.
    random_number: random state passed through to cluster_bootstrap (Bayesian-optimization).
                   The resampling itself is seeded with the iteration index, not with this value.
    feature_cols: names of the columns that are clustered (defaults to the two tsne components).

    Returns a dataframe with one row per iteration and the columns
    'Iteration', 'ARI', 'Silhouette' and 'ClusteringParams'. With n_iterations=0 the
    returned frame is empty.
    """
    feature_cols = list(feature_cols)

    bootstrap_results = {}
    for iteration in range(n_iterations):
        print("###########################################")
        print(f'Processing Iteration Number {iteration + 1}/{n_iterations}')
        # Sample with replacement to the original size; the iteration index is the seed,
        # so every run draws the same sequence of resamples.
        sample = df.sample(n=len(df), replace=True, random_state=iteration)
        print(f"deduplicated number of rows: {sample.drop_duplicates().shape[0]}")
        ari, sil, opt_params = cluster_bootstrap(
            tsne_df=sample[feature_cols],
            y_param=sample[Y_param_col_nam],
            manual_param=manual_param,
            random_number=random_number,
        )
        print(f"RESULTS: ARI = {ari}, Silhouette = {sil}")
        bootstrap_results[f'{iteration + 1}'] = {'ARI': ari, 'Silhouette': sil, 'ClusteringParams': opt_params}

    # Build the result table once, after the loop, so it also exists when no iteration ran.
    bootstrap_df = pd.DataFrame.from_dict(bootstrap_results, orient='index')
    bootstrap_df = bootstrap_df.reset_index().rename(columns={'index': 'Iteration'})
    return bootstrap_df

### Bootstrap Clustering

BS Discovery - all lesions - all features

In [None]:
# Load the table used by all cells below; 'LOCAL PATH' is a placeholder for the CSV location.
# Later cells read the columns 'principal component 1', 'principal component 2',
# 'environment', 'tumtype', 'prim_met_status' and 'lesion_type' from this frame.
tsne_discovery_allfeat = pd.read_csv('LOCAL PATH')
tsne_discovery_allfeat

In [None]:
# Bootstrap the clustering of all lesions, scored against the 'environment' labels.
# manual_param is not passed, so the hyperparameter search is repeated in every iteration.
y_name = 'environment'
bootstrap_discovery_allfeat_all_env = bootstrap(df=tsne_discovery_allfeat, Y_param_col_nam=y_name)
bootstrap_discovery_allfeat_all_env

In [None]:
# Save the per-iteration results; 'LOCAL PATH' is a placeholder for the output file path.
bootstrap_discovery_allfeat_all_env.to_excel('LOCAL PATH')

In [None]:
# Same bootstrap on all lesions, scored against the 'tumtype' labels.
y_name = 'tumtype'
bootstrap_discovery_allfeat_all_tumtype = bootstrap(df=tsne_discovery_allfeat, Y_param_col_nam=y_name)
bootstrap_discovery_allfeat_all_tumtype

In [None]:
# Save the per-iteration results; 'LOCAL PATH' is a placeholder for the output file path.
bootstrap_discovery_allfeat_all_tumtype.to_excel('LOCAL PATH')

BS Discovery - metastatic lesions - all features

In [None]:
# Keep only rows whose 'prim_met_status' is 'Metastatic' or 'Recurrence',
# and renumber the index from 0.
tsne_discovery_allfeat_met = tsne_discovery_allfeat[tsne_discovery_allfeat['prim_met_status'].isin(['Metastatic', 'Recurrence'])].reset_index(drop=True)
tsne_discovery_allfeat_met

In [None]:
# Bootstrap the clustering of the metastatic subset, scored against the 'environment' labels.
y_name = 'environment'
bootstrap_discovery_allfeat_met_env = bootstrap(df=tsne_discovery_allfeat_met, Y_param_col_nam=y_name)
bootstrap_discovery_allfeat_met_env

In [None]:
# Save the per-iteration results; 'LOCAL PATH' is a placeholder for the output file path.
bootstrap_discovery_allfeat_met_env.to_excel('LOCAL PATH')

In [None]:
# Same bootstrap on the metastatic subset, scored against the 'tumtype' labels.
y_name = 'tumtype'
bootstrap_discovery_allfeat_met_tumtype = bootstrap(df=tsne_discovery_allfeat_met, Y_param_col_nam=y_name)
bootstrap_discovery_allfeat_met_tumtype

In [None]:
# Save the per-iteration results; 'LOCAL PATH' is a placeholder for the output file path.
bootstrap_discovery_allfeat_met_tumtype.to_excel('LOCAL PATH')

BS Discovery - solid metastatic lesions - all features

In [None]:
# Drop rows whose 'lesion_type' is 'Lymph-Node' from the metastatic subset.
# NOTE(review): unlike the metastatic subset, the index is not reset here — confirm this is intended.
tsne_discovery_allfeat_solidmet = tsne_discovery_allfeat_met[tsne_discovery_allfeat_met['lesion_type'] != 'Lymph-Node']
tsne_discovery_allfeat_solidmet

In [None]:
# Bootstrap the clustering of the solid metastatic subset, scored against the 'environment' labels.
y_name = 'environment'
bootstrap_discovery_allfeat_solidmet_env = bootstrap(df=tsne_discovery_allfeat_solidmet, Y_param_col_nam=y_name)
bootstrap_discovery_allfeat_solidmet_env

In [None]:
# Save the per-iteration results; 'LOCAL PATH' is a placeholder for the output file path.
bootstrap_discovery_allfeat_solidmet_env.to_excel(r'LOCAL PATH')

In [None]:
# Same bootstrap on the solid metastatic subset, scored against the 'tumtype' labels.
y_name = 'tumtype'
bootstrap_discovery_allfeat_solidmet_tumtype = bootstrap(df=tsne_discovery_allfeat_solidmet, Y_param_col_nam=y_name)
bootstrap_discovery_allfeat_solidmet_tumtype

In [None]:
# Save the per-iteration results; 'LOCAL PATH' is a placeholder for the output file path.
bootstrap_discovery_allfeat_solidmet_tumtype.to_excel(r'LOCAL PATH')

Repeat the steps above for the eroded and radiomics subsets, as well as for the validation datasets.