<a href="https://colab.research.google.com/github/PaoloGerosa/Chemotherapy-Associated-Liver-Injury/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Upload Packages**

In [None]:
import pandas as pd
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt

# **Confusion Matrix** 

In [None]:
# clustering_score computes a DataFrame whose columns are the types of CALI
# (the columns of the outcome dataset) and whose rows are the predicted labels;
# the value of each cell is the number of patients belonging to group i
# that have CALI j.
# To get the confusion matrix of a single type of CALI, use confusion_matrix
# (defined below), which accepts the type of CALI as an optional input.


def clustering_score(outcome, labels, patients, patients_index):
  """Count, for every cluster, how many patients show each outcome column.

  Args:
    outcome: DataFrame indexed by patient id; every column is one outcome
      (truthy value = the patient has it).
    labels: cluster label of each entry of `patients`, in the same order.
      Labels are expected to be integers in 0..max(labels).
    patients: patient ids, aligned with `labels`.
    patients_index: ids of the patients to take into account; ids of
      `patients` that are not listed here are ignored.

  Returns:
    DataFrame with one row per cluster label (0..max(labels)) and one column
    per column of `outcome`; cell (i, j) is the number of selected patients
    of cluster i whose outcome j is truthy.
  """
  selected = set(patients_index)
  counts = pd.DataFrame(0, index=range(max(labels) + 1), columns=outcome.columns)
  for patient_id, label in zip(patients, labels):
    if patient_id not in selected:
      continue
    # Look the patient up directly in `outcome`: the previous version went
    # through the undefined names `outcome_dataset` and `combo`.
    patient_row = outcome.loc[patient_id]
    for col in outcome.columns:
      value = patient_row[col]
      # NaN is truthy in Python, so a missing outcome must be skipped explicitly.
      if pd.notna(value) and value:
        # Single .loc write: chained `df[col][label] += 1` does not update
        # the frame when pandas Copy-on-Write is enabled.
        counts.loc[label, col] += 1

  return counts

In [None]:
# patients_index - dataset
# patients - volumes

# confusion_matrix takes as input the outcome dataset and the predicted labels
# and returns the confusion matrix computed on the CALI column of the outcome
# dataframe (or on the column given as the optional third argument)
def confusion_matrix(*args):
  """Build the 2 x n_clusters confusion matrix of a binary outcome column.

  Accepted call forms:
    confusion_matrix(outcome, labels, patients, patients_index)
      uses the 'CALI' column of `outcome`.
    confusion_matrix(outcome, labels, combo, patients, patients_index)
      uses the column named `combo`.

  Args:
    outcome: DataFrame indexed by patient id holding the 0/1 outcome column.
    labels: cluster label of each entry of `patients`, in the same order.
    combo: name of the outcome column (5-argument form only).
    patients: patient ids, aligned with `labels`.
    patients_index: ids of the patients to take into account.

  Returns:
    DataFrame with rows 0 and 1 (outcome value) and one column per cluster
    label 0..max(labels); each cell counts the selected patients with that
    outcome value in that cluster. Patients whose outcome is missing
    (None or NaN) are not counted.
  """
  if len(args) == 5:
    outcome, labels, combo, patients, patients_index = args
  else:
    outcome, labels, patients, patients_index = args
    combo = 'CALI'
  selected = set(patients_index)
  matrix = pd.DataFrame(0, index=[0, 1], columns=range(max(labels) + 1))
  for patient_id, label in zip(patients, labels):
    if patient_id not in selected:
      continue
    key = outcome.loc[patient_id, combo]
    # pandas stores missing values as NaN, which `is not None` does not catch.
    if pd.isna(key):
      continue
    # Single .loc write: chained `df[label][key] += 1` does not update the
    # frame when pandas Copy-on-Write is enabled.
    matrix.loc[int(key), label] += 1
  return matrix

In [None]:
# function to extract the indices of the true positive and true negative patients

def diagonal_index(outcome, labels, combo, patients, patients_index):
  """Collect the ids of the true-positive and true-negative patients.

  A patient is a true positive when both its value in column `combo` of
  `outcome` and its cluster label equal 1, and a true negative when both
  equal 0. Only ids that also appear in `patients_index` are examined.

  Returns:
    (true_positive, true_negative): two lists of patient ids, in the order
    in which the ids appear in `patients`.
  """
  wanted = set(patients_index)
  positives, negatives = [], []
  for pid, cluster in zip(patients, labels):
    if pid not in wanted:
      continue
    truth = outcome.loc[pid][combo]
    if truth == 1 and cluster == 1:
      positives.append(pid)
      continue
    if truth == 0 and cluster == 0:
      negatives.append(pid)
  return positives, negatives

In [None]:
# Legacy version of confusion_matrix, disabled by wrapping it in a string
# literal: the string is evaluated and discarded, so none of it runs.
# It paired labels with outcome rows by position (a running index) instead
# of by patient id.
'''
def confusion_matrix(*args):
  if len(args) == 2:
    outcome, labels = args
    index = 0
    confusion_matrix = pd.DataFrame(0, index=[0, 1], columns=[i for i in range(max(labels) + 1)])
    for key in outcome["CALI"]:
      if key is not None:
        confusion_matrix[labels[index]][key] += 1
      index += 1
    return confusion_matrix

  elif len(args) == 3:
    outcome, labels, combo = args
    index = 0
    confusion_matrix = pd.DataFrame(0, index=[0, 1], columns=[i for i in range(max(labels) + 1)])
    for key in outcome[combo]:
      if key is not None and key >= 0:
        confusion_matrix[labels[index]][key] += 1
      index += 1
      
    return confusion_matrix
  '''

# **Clustering**

In [None]:
# clust_methods computes the Clustering labels given the distance matrix, the type
# of clustering technique and the number of clusters to be used

def clust_methods(dist, clust_type, n_centr, eps=1300):
  """Cluster a precomputed distance matrix and return the cluster labels.

  Args:
    dist: square matrix of pairwise distances between the samples.
    clust_type: "Agglomerative" (complete linkage), "Kmedoids" or "DBSCAN".
    n_centr: number of clusters for "Agglomerative" and "Kmedoids"; for
      "DBSCAN" it is passed as `min_samples`.
    eps: neighbourhood radius, used by "DBSCAN" only (default 1300, the
      value that was previously hard-coded).

  Returns:
    The label assigned to each sample, in the order of the rows of `dist`.

  Raises:
    ValueError: if `clust_type` is not one of the supported names.
  """
  if clust_type == "Agglomerative":
    try:
      # scikit-learn >= 1.2 names the distance argument `metric`.
      model = AgglomerativeClustering(metric='precomputed', n_clusters=n_centr, linkage='complete')
    except TypeError:
      # Older scikit-learn only knows the former name `affinity`.
      model = AgglomerativeClustering(affinity='precomputed', n_clusters=n_centr, linkage='complete')
    return model.fit(dist).labels_

  if clust_type == "Kmedoids":
    model = KMedoids(n_clusters=n_centr, metric='precomputed').fit(dist)
    return model.predict(dist)

  if clust_type == "DBSCAN":
    model = DBSCAN(eps=eps, min_samples=n_centr, metric='precomputed').fit(dist)
    return model.labels_

  # Previously an unknown name fell through to an UnboundLocalError.
  raise ValueError(
      "Unknown clust_type %r: expected 'Agglomerative', 'Kmedoids' or 'DBSCAN'" % (clust_type,))
    


# **Optimal K**

In [None]:
# Silhouette_Analysis prints the average silhouette score obtained with each
# number of clusters from 2 to 7, given the distance matrix and the type
# of clustering technique to be used

def Silhouette_Analysis(dist, clust_type):
  """Print the average silhouette score for 2 to 7 clusters.

  For every candidate number of clusters the samples are clustered with
  clust_methods(dist, clust_type, k) and the mean silhouette is computed on
  the precomputed distance matrix `dist`. Nothing is returned.
  """
  for n_clusters in range(2, 8):
    labels = clust_methods(dist, clust_type, n_clusters)
    score = silhouette_score(dist, labels, metric='precomputed')
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", score)


In [None]:
def inertia(dist):
  """Plot the KMedoids inertia against the number of clusters (1 to 9).

  A KMedoids model is fitted on the precomputed distance matrix `dist` for
  each number of clusters, and its `inertia_` attribute is drawn in a new
  matplotlib figure. Nothing is returned.
  """
  Ks = range(1, 10)
  values = []
  for k in Ks:
    fitted = KMedoids(k, metric='precomputed').fit(dist)
    values.append(fitted.inertia_)

  plt.figure()
  plt.plot(Ks, values, '-bo')
  plt.xlabel('Number of clusters')
  plt.ylabel('Inertia (within-cluster sum of squares)')
  plt.show()

# Confusion matrix with relative frequencies

In [None]:
def score_relative(outcome_dataset, matrix, num_1, total=125):
  """Turn the first two rows of a count matrix into relative frequencies.

  Args:
    outcome_dataset: DataFrame whose columns define the columns of the result.
    matrix: DataFrame of counts; only its first two rows (by position) are used.
    num_1: number of patients in the group of the second row; the first row
      is divided by `total - num_1`.
    total: overall number of patients (default 125, the value that was
      previously hard-coded).

  Returns:
    DataFrame with rows 0 and 1 holding the relative frequencies. The columns
    are those of `outcome_dataset`, followed by any extra column of `matrix`.
  """
  # DataFrame.append was removed in pandas 2.0, so the frame is built in one go.
  matrix_rel = pd.DataFrame({
      0: matrix.iloc[0] / (total - num_1),
      1: matrix.iloc[1] / num_1,
  }).T
  # Same column layout that appending the rows to an empty frame produced.
  columns = list(outcome_dataset.columns)
  columns += [col for col in matrix.columns if col not in outcome_dataset.columns]
  return matrix_rel.reindex(columns=columns)