In [None]:


## author: Piotr Grabowski, 03.07.2016 for Kaggle; adapted for Python 3 compatibility by Pietro Gavazzi
# https://www.kaggle.com/code/piotrgrabo/kmeans-example

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import metrics
import re
from sklearn.impute import SimpleImputer
from numpy import random
import seaborn as sb
import matplotlib.pyplot as plt 




In [None]:
### Input files (paths are relative to the notebook's working directory)
dataset_path = "./dataset/77_cancer_proteomes_CPTAC_itraq.csv"
clinical_info = "./dataset/clinical_data_breast_cancer.csv"
pam50_proteins = "./dataset/PAM50_proteins.csv"

## Read the three tables; the first CSV row is used as the header in each of them.
## `data` and `clinical_file` additionally use their first column as the row index.
data = pd.read_csv(dataset_path, index_col=0, header=0)
## clinical information about each patient/sample
clinical_file = pd.read_csv(clinical_info, index_col=0, header=0)
pam50 = pd.read_csv(pam50_proteins, header=0)

## The index of `data` is the RefSeq protein ID (each protein has a unique ID in the RefSeq database)
print(data.index.name)
data.head()


In [None]:
## Drop unused information columns.
## errors='ignore' keeps the cell re-runnable: on a second run the columns are already gone.
data = data.drop(columns=['gene_symbol','gene_name'], errors='ignore')


def _to_tcga_id(column_name):
    """Return the clinical-style sample ID ("TCGA-<prefix>") for a proteome column name.

    Column names that contain "TCGA" are cut at the first '_', '|' or '.' and the leading
    piece is prefixed with "TCGA-". Every other name is returned unchanged, and so is a
    name that already starts with "TCGA-", which makes re-running the cell harmless.

    The separator set is exactly what the previous pattern '[_|-|.]' matched: inside a
    character class '|-|' is a one-character range, so '-' was never a separator.
    """
    if "TCGA" not in column_name or column_name.startswith("TCGA-"):
        return column_name
    return "TCGA-%s" % re.split(r'[_|.]', column_name)[0]


## Change the protein data sample names to a format matching the clinical data set
data = data.rename(columns=_to_tcga_id)

data.head()

In [None]:
## KMeans groups rows, and we want to divide patient samples rather than proteins,
## so flip the table: rows of `datat` are the columns of `data`.
datat = data.T
## shape before, then after, the transpose
print(data.shape)
print(datat.shape)

datat.head()


In [None]:
## Number of rows in the clinical table, then in the transposed proteome table
print(clinical_file.shape[0])
print(datat.shape[0])

In [None]:
## Keep only the clinical entries whose ID also appears in the index of the protein data set
shared_ids = [sample_id for sample_id in clinical_file.index if sample_id in datat.index]
clinical = clinical_file.loc[shared_ids]

print(clinical.shape)
clinical.head()

In [None]:
## Attach the clinical meta data to the protein data by matching row indexes,
## note: all numerical features for analysis start with NP_ or XP_
merged = datat.merge(clinical, left_index=True, right_index=True)

## Remove replicated samples (added by Pietro Gavazzi): every sample ID that occurs more
## than once in the merged table is discarded entirely, i.e. none of its rows is kept.
replicated = merged.index.duplicated(keep=False)

## Change name to make it look nicer in the code!
processed = merged.loc[~replicated]


processed.shape



In [None]:

from sklearn.model_selection import train_test_split

## Random train/test partition of the rows of `processed`; the fixed seed makes it repeatable.
## No test_size is given, so the library's default proportion applies.
splits = train_test_split(processed, random_state=42)
X_train, X_test = splits
X_train.head()

In [None]:
## Numerical data for the algorithm: keep the columns whose name contains NP_ or XP_
## (NP_xx/XP_xx are protein identifiers from the RefSeq database)
protein_columns = [col for col in processed.columns if re.search("NP_|XP_", col) is not None]
processed_numerical = processed.loc[:, protein_columns]

## Select only the PAM50 proteins - known panel of genes used for breast cancer subtype prediction
in_pam50 = processed_numerical.columns.isin(pam50['RefSeqProteinID'])
processed_numerical_p50 = processed_numerical.loc[:, in_pam50]
processed_numerical_p50.head()




In [None]:
## Impute missing values (maybe another method would work better?)
## Every missing measurement is replaced by the median of the values measured for that
## same sample (row-wise imputation).

processed_numerical_p50_bis = processed_numerical_p50.copy()

## A single median imputer is enough: fit_transform() refits it on each sample below.
## `imputer` intentionally stays a notebook-level name so later cells can keep using it.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

## DataFrame.iteritems() was removed in pandas 2.0; iterating over the row index gives the
## same sample IDs on every pandas version.
for sample_id in processed_numerical_p50_bis.index:
    ## SimpleImputer works column-wise, so present the sample's proteins as one column
    sample_column = processed_numerical_p50_bis.loc[[sample_id]].T
    processed_numerical_p50_bis.loc[[sample_id]] = imputer.fit_transform(sample_column).T


processed_numerical_p50_bis.head()



In [None]:
## Candidate numbers of clusters to find out which works best: 2 to 8 and 10, plus 20 and
## "number of samples minus one", which are just for fun and comparison.
n_clusters = [*range(2, 9), 10, 20, len(processed_numerical_p50_bis) - 1]

def compare_k_means(k_list,data,labels_true=None,random_state=42):
    """Run KMeans once per k in ``k_list`` and print two scores for each clustering.

    Parameters
    ----------
    k_list : iterable of int
        Numbers of clusters to try.
    data : array-like
        Samples to cluster, one row per sample.
    labels_true : array-like, optional
        Reference class of every sample, in the same row order as ``data``; used for the
        homogeneity score. When omitted, the notebook-level ``processed['PAM50 mRNA']``
        column is used, which is what this function always did before.
    random_state : int or None, optional
        Seed handed to KMeans so repeated runs print the same scores. Pass ``None`` to
        get the previous unseeded behaviour.

    Returns
    -------
    None
        The scores are printed, nothing is returned.
    """
    if labels_true is None:
        labels_true = processed['PAM50 mRNA']

    ## Run clustering with different k and check the metrics
    for k in k_list:
        clusterer = KMeans(n_clusters=k, random_state=random_state)
        clusterer.fit(data)
        labels_pred = clusterer.labels_
        ## The higher (up to 1) the better
        print("Silhouette Coefficient for k == %s: %s" % (
        k, round(metrics.silhouette_score(data, labels_pred), 4)))
        ## The higher (up to 1) the better
        print("Homogeneity score for k == %s: %s" % (
        k, round(metrics.homogeneity_score(labels_true, labels_pred),4)))
        print("------------------------")

## What if we use a random set of proteins as large as the PAM50 panel? Will the clustering be as good?
## Create a random numerical matrix with imputation:
##  - the seeded generator makes the selection repeatable,
##  - replace=False guarantees distinct proteins (choice() samples WITH replacement by default),
##  - the panel size is read from the PAM50 matrix instead of being hard-coded to 43.
rng = random.RandomState(42)
n_random_proteins = processed_numerical_p50.shape[1]
random_columns = rng.choice(processed_numerical.shape[1], size=n_random_proteins, replace=False)
processed_numerical_random = processed_numerical.iloc[:, random_columns]

## Use a dedicated imputer rather than whatever object an earlier cell's loop left behind.
## Here missing values are filled column-wise, i.e. with the median of each protein.
imputer_rnd = SimpleImputer(missing_values=np.nan, strategy='median')
processed_numerical_random = imputer_rnd.fit_transform(processed_numerical_random)




In [None]:
## Try every candidate number of clusters on the PAM50 proteins; there are 4 subtypes of cancer in this data
## (3 samples of healthy patients were dropped at the beginning...)
compare_k_means(k_list=n_clusters, data=processed_numerical_p50_bis)
## Author's reading of the output:
## - k==3 seems to work well: the silhouette score is still high and the homogeneity score jumps ~2-fold;
##   this is what they report in the paper to be the best number of clusters!
## - k == 79 has a homogeneity score of 1.0, no wonder since the algorithm can assign all the points
##   their separate clusters! However, for our application, such clustering would be worthless.




In [None]:
## Same comparison on the randomly chosen proteins.
## The scores should be significantly lower than for the PAM50 proteins!
compare_k_means(k_list=n_clusters, data=processed_numerical_random)

In [None]:


## Visualize data using k==4 (the value passed to KMeans below), show the heatmap of protein
## expression for the used PAM50 proteins (43 available in our data).
## The fixed seed makes the cluster assignment repeatable from run to run.
clusterer_final = KMeans(n_clusters=4, random_state=42)
clusterer_final = clusterer_final.fit(processed_numerical_p50_bis)

## Spread the cluster ids apart (with 4 clusters: 0,1,2,3 -> 0,4,8,12), presumably so that the
## marker column stands out on the heatmap colour scale.
## NOTE: this deliberately overwrites the fitted estimator's labels_ attribute; the comparison
## cell at the end of the notebook reads these spread values.
label_spread = (10 // max(clusterer_final.labels_)) + 1
clusterer_final.labels_ = [label * label_spread for label in clusterer_final.labels_]

## Build the plotting table on an explicit copy: pd.DataFrame(frame) does not copy by default,
## so adding the marker column there could leak into processed_numerical_p50_bis and be used
## as a clustering feature when this cell is run again.
processed_p50_plot = processed_numerical_p50_bis.copy()
processed_p50_plot['KMeans_cluster'] = clusterer_final.labels_
processed_p50_plot = processed_p50_plot.sort_values('KMeans_cluster', axis=0)

## Look at the heatmap of protein expression in all patients and look at their assigned cluster
## Proteins can either be more expressed (more is produced, less degraded), not changed or lower expressed than the used reference
## Since each protein has a distinct function in the cell, their levels describe the functional/signaling state the cell is in.


In [None]:
## Name the row axis, draw the heatmap and store the figure as an image.
## The x-axis are the PAM50 proteins we used and the right-most column is the cluster marker.
processed_p50_plot.index.name = 'Patient'
heatmap_axes = sb.heatmap(data=processed_p50_plot)
heatmap_axes.figure.savefig('cluster.png')
## Looks like the clustering works quite decently here!

## Each cluster means a different molecular signature for each patient. Such patients have
## different treatment options available to them!


In [None]:
# Part added by Pietro Gavazzi

In [None]:
# https://www.docteur-eric-sebban.fr/cancer-du-sein/diagnostic-cancer-sein/stades-et-types-de-cancer-du-sein/ 


# "T" stands for the size and infiltration of the tumour, ranging from Tx (tumour cannot be assessed) to T4.
# "N" stands for the degree of lymph-node involvement, ranging from Nx (lymph-node involvement cannot be assessed) to N3.


# Compare the KMeans clusters with the "Integrated Clusters (with PAM50)" clinical annotation.

# One indicator column per KMeans label (labels are turned into strings first).
dataf = pd.DataFrame({ "label":clusterer_final.labels_}, index=processed.index, dtype=str)
dataf = pd.get_dummies(dataf)

# One indicator column per integrated cluster. Only that single column is encoded instead of
# running get_dummies over the whole table; the resulting column names are the same.
integrated_col = "Integrated Clusters (with PAM50)"
tbm = pd.get_dummies(processed[integrated_col], prefix=integrated_col)
tbm = tbm[["Integrated Clusters (with PAM50)_1", "Integrated Clusters (with PAM50)_2", "Integrated Clusters (with PAM50)_3", "Integrated Clusters (with PAM50)_4"]]
dataf = dataf.merge(tbm, left_index=True,right_index=True)

# Cast to float before multiplying: get_dummies returns booleans on pandas >= 2.0, and the dot
# product of two boolean arrays is a logical "any", not a count.
indicators = dataf.astype(float).to_numpy()

# matrix[i, j] = number of samples that belong to both group i and group j
matrix = indicators.T @ indicators

# Divide every row by its diagonal entry (the size of group i): row i then holds the fraction
# of group i's samples that also belong to each group j.
matrix = matrix / np.diag(matrix)[:, np.newaxis]

sb.heatmap(matrix, xticklabels=dataf.columns, yticklabels=dataf.columns, annot=True)