In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, adjusted_rand_score
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt
import scipy
import hyperopt

In [None]:
# Suppress warnings: setting this option to None turns off pandas' chained-assignment
# (SettingWithCopyWarning) check for the whole notebook session.
pd.options.mode.chained_assignment = None

In [None]:
def _label_entropy(counts, n_samples):
    """Shannon entropy (natural log) of a partition described by its group sizes."""
    # Empty groups contribute nothing; dropping them avoids 0 * log(0) = nan.
    probs = counts[counts > 0] / n_samples
    return float(-np.sum(probs * np.log(probs)))


def variation_of_information(labels_true, labels_pred):
    """
    Variation of information (VI) between two labelings of the same samples.

    VI = H(true) + H(pred) - 2 * I(true; pred), computed in nats (natural log).
    Because I = H(true) + H(pred) - H(joint), this equals
    2 * H(joint) - H(true) - H(pred), which is the form evaluated here.

    labels_true: array-like of labels (any hashable/sortable values, e.g. ints or strings)
    labels_pred: array-like of labels with the same length as labels_true

    Labels are compared by value and do not need to be integers, start at 0,
    or be contiguous (1-based cluster ids and string classes both work).

    Returns a non-negative float; 0.0 means the two partitions are identical
    up to a renaming of the labels. Empty input returns 0.0.

    Raises ValueError if the two inputs have different lengths.
    """
    labels_true = np.asarray(labels_true).ravel()
    labels_pred = np.asarray(labels_pred).ravel()
    if labels_true.size != labels_pred.size:
        raise ValueError(
            f"labels_true and labels_pred must have the same length, "
            f"got {labels_true.size} and {labels_pred.size}"
        )

    n_samples = labels_true.size
    if n_samples == 0:
        return 0.0

    # Re-index every labeling to 0..k-1 so the counts below have no empty slots
    # caused by gaps or offsets in the original label values.
    true_ids = np.unique(labels_true, return_inverse=True)[1]
    pred_ids = np.unique(labels_pred, return_inverse=True)[1]

    # Encode each (true, pred) pair as one integer to count the joint distribution.
    n_pred = int(pred_ids.max()) + 1
    joint_counts = np.bincount(true_ids * n_pred + pred_ids)

    H_true = _label_entropy(np.bincount(true_ids), n_samples)
    H_pred = _label_entropy(np.bincount(pred_ids), n_samples)
    H_joint = _label_entropy(joint_counts, n_samples)

    # Clamp tiny negative values produced by floating-point round-off.
    return max(0.0, 2.0 * H_joint - H_true - H_pred)

In [None]:
def optimize_clustering(params, data, true_labels, manual_n_clusters=None):
    """
    Hyperopt objective for agglomerative clustering.

    params: dict sampled by hyperopt with the keys 'linkage', 'affinity' and 'n_clusters'
            (supplied through the bayesian_optimization function).
    data: the tsne dataframe
    true_labels: a pandas series of the Y column
    manual_n_clusters: To manually assign the number of clusters and overrule the optimization.

    Fits AgglomerativeClustering with the suggested parameters and scores the result as
    0.5 * silhouette + 0.5 * adjusted Rand index (against true_labels).

    Returns a dict {'loss': -score, 'status': STATUS_OK}; hyperopt minimises 'loss',
    so a higher combined score is better. Partitions for which the silhouette is
    undefined (a single cluster, or one cluster per sample) receive score -1.
    """
    linkage = params['linkage']
    # scikit-learn only accepts the Euclidean metric with Ward linkage,
    # so the sampled affinity is ignored in that case.
    affinity = 'euclidean' if linkage == 'ward' else params['affinity']
    n_clusters = manual_n_clusters if manual_n_clusters else int(params['n_clusters'])

    print(f'Testing hyperparameter combination: [Linkage: {linkage}, Affinity: {affinity}, n_clusters: {n_clusters}]')

    clusterer = AgglomerativeClustering(n_clusters=n_clusters, metric=affinity, linkage=linkage)
    cluster_labels = clusterer.fit_predict(data)

    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1; outside that
    # range it raises, which would abort the whole search instead of penalising the trial.
    n_found = len(np.unique(cluster_labels))
    if 1 < n_found < len(cluster_labels):
        silhouette_avg = silhouette_score(data, cluster_labels)
        ari = adjusted_rand_score(true_labels, cluster_labels)
        score = 0.5 * silhouette_avg + 0.5 * ari
    else:
        # Penalty used for degenerate partitions.
        score = -1

    return {'loss': -score, 'status': STATUS_OK}

In [None]:
def bayesian_optimization(data, true_labels, manual_n_clusters=None, random_number=42, max_evals=200):
    """
    Search agglomerative-clustering hyperparameters with hyperopt's TPE algorithm.

    data: the tsne dataframe from the following cluster function input
    true_labels: a pandas series of the Y column from the following cluster function input
    manual_n_clusters: if given, this number of clusters is used for every trial and in the
                       returned parameters instead of the sampled value
    random_number: seed for the numpy Generator handed to hyperopt (rstate)
    max_evals: number of hyperopt trials to run (default 200)

    Returns a dict {'n_clusters': int, 'linkage': str, 'affinity': str} describing the
    best trial. 'affinity' is always 'euclidean' when the best linkage is 'ward'.
    """
    # Single source of truth for the categorical options: the same lists define the
    # search space and decode the indices that fmin reports for hp.choice parameters.
    # NOTE(review): 'l1'/'manhattan' and 'l2'/'euclidean' look like aliases in
    # scikit-learn, so the space contains duplicates; left unchanged so that seeded
    # runs keep sampling the same sequence.
    linkage_options = ['ward', 'complete', 'average', 'single']
    affinity_options = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']

    space = {
        'n_clusters': hp.quniform('n_clusters', 2, 20, 1),
        'affinity': hp.choice('affinity', affinity_options),
        'linkage': hp.choice('linkage', linkage_options),
    }

    trials = Trials()
    rng = np.random.default_rng(random_number)

    best = fmin(
        fn=lambda params: optimize_clustering(params, data, true_labels, manual_n_clusters),
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials,
        rstate=rng,
    )

    # hp.choice entries come back as positions in the option lists, not as the values.
    best_linkage = linkage_options[best['linkage']]
    best_affinity = 'euclidean' if best_linkage == 'ward' else affinity_options[best['affinity']]

    best_params = {
        'n_clusters': manual_n_clusters if manual_n_clusters else int(best['n_clusters']),
        'linkage': best_linkage,
        'affinity': best_affinity,
    }

    return best_params

In [None]:
def cluster_original(tsne_df, y_param, manual_param=None, random_number=42):
    """
    Cluster the t-SNE components with agglomerative clustering and compare the
    clusters against a ground-truth column.

    tsne_df: a pandas dataframe containing only the two columns of tsne components
    y_param: a pandas dataseries (one column) containing the ground truth labels ['environment'] or ['tumtype']
    manual_param: if all the three parameters are inputted manually: {'n_clusters':int, 'linkage':str, 'affinity':str} #To not repeat the optimization
    random_number: random state for Bayesian-optimization

    Rows of tsne_df and y_param are paired by position, so both must have the same
    length; their index labels do not need to match. tsne_df itself is not modified.

    Returns a dict with:
        'Dataframe': copy of tsne_df plus a 'cluster' column (ids start at 1) and the y_param column
        'ClusteringParams': the parameters used for the final clustering
        'ARI': adjusted Rand index between y_param and the clusters
        'Silhouette': silhouette score of the clusters on the tsne_df features
        'ContingTable': counts of ground truth (rows) vs. cluster (columns)
        'ContingTablePerc': the same table as row percentages
        'ContingTableInvPerc': the same table as column percentages

    Raises ValueError if tsne_df and y_param have different lengths.
    """
    if len(tsne_df) != len(y_param):
        raise ValueError(
            f"tsne_df and y_param must have the same number of rows, "
            f"got {len(tsne_df)} and {len(y_param)}"
        )

    if manual_param is None:
        opt_params = bayesian_optimization(data=tsne_df, true_labels=y_param, random_number=random_number)
        print("Optimized Agglomerative clustering parameters: ", opt_params)
    else:
        opt_params = manual_param
        print("Manual Agglomerative clustering parameters: ", opt_params)

    clusterer = AgglomerativeClustering(n_clusters=opt_params['n_clusters'], metric=opt_params['affinity'], linkage=opt_params['linkage'])

    # Shift by one so the reported cluster ids do not start with cluster 0.
    cluster_labels = clusterer.fit_predict(tsne_df) + 1

    # Positional values: avoids index alignment silently dropping or mispairing rows
    # when y_param does not carry a 0..n-1 index.
    y_values = y_param.to_numpy()

    # Work on a copy so the caller's frame (often a slice of a larger table) is left untouched.
    result_df = tsne_df.copy()
    result_df['cluster'] = cluster_labels
    result_df[y_param.name] = y_values

    le = LabelEncoder()
    encoded_y_param = le.fit_transform(y_param)

    ari = adjusted_rand_score(encoded_y_param, cluster_labels)
    # Score on exactly the features that were clustered.
    sil = silhouette_score(tsne_df, cluster_labels)

    contingency_table = pd.crosstab(y_values, cluster_labels, rownames=['Ground Truth'], colnames=['Cluster'])
    contingency_table_percentage = contingency_table.div(contingency_table.sum(axis=1), axis=0) * 100
    contingency_table_inverse_percentage = contingency_table.div(contingency_table.sum(axis=0), axis=1) * 100

    return {'Dataframe': result_df,
            'ClusteringParams': opt_params,
            'ARI': ari,
            'Silhouette': sil,
            'ContingTable': contingency_table,
            'ContingTablePerc': contingency_table_percentage,
            'ContingTableInvPerc': contingency_table_inverse_percentage,
           }

##################################################################################################################################

### Original Clustering

Discovery - all features

All lesions:

In [None]:
# Load the t-SNE table used by the cells below; 'LOCAL PATH' is a placeholder for the CSV location.
# Later cells read the columns 'tsne 1', 'tsne 2', 'environment', 'tumtype',
# 'prim_met_status' and 'lesion_type' from this frame.
tsne_discovery_allfeat_all = pd.read_csv('LOCAL PATH')
# Bare last expression so the notebook renders the loaded frame.
tsne_discovery_allfeat_all

In [None]:
# All lesions, ground truth = 'environment'.
# Take an independent copy of the two t-SNE columns so that any column assignment
# downstream never writes into a slice of the source table.
tsne_df = tsne_discovery_allfeat_all[['tsne 1', 'tsne 2']].copy()
y_param = tsne_discovery_allfeat_all['environment']

results_discovery_allfeat_all_env = cluster_original(tsne_df=tsne_df, y_param=y_param)

In [None]:
# All lesions, ground truth = 'tumtype'.
# Take an independent copy of the two t-SNE columns so that any column assignment
# downstream never writes into a slice of the source table.
tsne_df = tsne_discovery_allfeat_all[['tsne 1', 'tsne 2']].copy()
y_param = tsne_discovery_allfeat_all['tumtype']

results_discovery_allfeat_all_tumtype = cluster_original(tsne_df=tsne_df, y_param=y_param)

Metastatic lesions:

In [None]:
# Keep the rows whose 'prim_met_status' is either 'Metastatic' or 'Recurrence'.
tsne_discovery_allfeat_met = tsne_discovery_allfeat_all.loc[
    tsne_discovery_allfeat_all['prim_met_status'].isin(['Metastatic', 'Recurrence'])
]
tsne_discovery_allfeat_met

In [None]:
# Metastatic subset, ground truth = 'environment'.
# reset_index(drop=True) gives the filtered rows a fresh 0..n-1 index before clustering.
tsne_df = tsne_discovery_allfeat_met.loc[:, ['tsne 1', 'tsne 2']].reset_index(drop=True)
y_param = tsne_discovery_allfeat_met.loc[:, 'environment'].reset_index(drop=True)

results_discovery_allfeat_met_env = cluster_original(tsne_df, y_param)

In [None]:
# Metastatic subset, ground truth = 'tumtype'.
# reset_index(drop=True) gives the filtered rows a fresh 0..n-1 index before clustering.
tsne_df = tsne_discovery_allfeat_met.loc[:, ['tsne 1', 'tsne 2']].reset_index(drop=True)
y_param = tsne_discovery_allfeat_met.loc[:, 'tumtype'].reset_index(drop=True)

results_discovery_allfeat_met_tumtype = cluster_original(tsne_df, y_param)

Solid metastatic lesions:

In [None]:
# From the metastatic subset, drop the rows whose 'lesion_type' is 'Lymph-Node'.
tsne_discovery_allfeat_solidmet = tsne_discovery_allfeat_met.loc[
    tsne_discovery_allfeat_met['lesion_type'].ne('Lymph-Node')
]
tsne_discovery_allfeat_solidmet

In [None]:
# Solid metastatic subset, ground truth = 'environment'.
# reset_index(drop=True) gives the filtered rows a fresh 0..n-1 index before clustering.
tsne_df = tsne_discovery_allfeat_solidmet.loc[:, ['tsne 1', 'tsne 2']].reset_index(drop=True)
y_param = tsne_discovery_allfeat_solidmet.loc[:, 'environment'].reset_index(drop=True)

results_discovery_allfeat_solidmet_env = cluster_original(tsne_df, y_param)

In [None]:
# Solid metastatic subset, ground truth = 'tumtype'.
# reset_index(drop=True) gives the filtered rows a fresh 0..n-1 index before clustering.
tsne_df = tsne_discovery_allfeat_solidmet.loc[:, ['tsne 1', 'tsne 2']].reset_index(drop=True)
y_param = tsne_discovery_allfeat_solidmet.loc[:, 'tumtype'].reset_index(drop=True)

results_discovery_allfeat_solidmet_tumtype = cluster_original(tsne_df, y_param)

Repeat the steps above for eroded, radiomics subsets, as well as validation datasets!