In [1]:
!pip install lifelines
!pip install pyckmeans
!pip install wandb -qU

Collecting lifelines
  Downloading lifelines-0.28.0-py3-none-any.whl (349 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/349.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m194.6/349.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.2/349.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.0.1-py3-none-any.whl (94 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.2/94.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: auto

In [2]:
import numpy as np
import pandas as pd
from pyckmeans import CKmeans
import matplotlib.pyplot as plt

from lifelines.fitters.coxph_fitter import CoxPHFitter

from google.colab import drive

import wandb
import os

In [3]:
# Authenticate this session with Weights & Biases so that results can be logged later.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# --- Load raw BRCA data ---
drive.mount('/content/gdrive')
!unzip gdrive/MyDrive/TCGA-BRCA_1079.zip

Mounted at /content/gdrive
Archive:  gdrive/MyDrive/TCGA-BRCA_1079.zip
  inflating: TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv  


In [5]:
def drop_elements_from_cluster(cluster, consensus_matrix):
  """Keep only the members whose consensus with the first member equals 1.

  Args:
    cluster: sequence of indices; its first entry is used as the reference.
    consensus_matrix: 2-D structure addressed as ``consensus_matrix[a][b]``.

  Returns:
    A new list holding, in their original order, the members ``m`` of
    ``cluster`` for which ``consensus_matrix[cluster[0]][m] == 1``.
    An empty ``cluster`` yields an empty list.
  """
  return [
    member
    for member in cluster
    if consensus_matrix[cluster[0]][member] == 1
  ]

In [6]:
def convert_indices_to_binary_list(number_elements, cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5):
  """Turn six lists of member indices into six 0/1 membership columns.

  Args:
    number_elements: length of every returned column; positions
      ``0 .. number_elements - 1`` are considered.
    cluster_0 .. cluster_5: iterables of (hashable) element indices belonging
      to the respective cluster. Indices outside ``0 .. number_elements - 1``
      are ignored.

  Returns:
    A list of six lists, one per cluster in argument order. Column ``c`` holds
    1 at position ``k`` if ``k`` belongs to cluster ``c`` and 0 otherwise.
    If an index is listed in several clusters, only the first of them (in
    argument order) gets the 1. Positions that belong to no cluster are 0 in
    every column.
  """
  # Sets give constant-time membership tests; the original list lookups made
  # the loop quadratic in the number of elements.
  member_sets = [
    set(members)
    for members in (cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5)
  ]
  columns = [[] for _ in member_sets]

  for k in range(number_elements):
    # First cluster containing k wins; None means k is in no cluster.
    owner = next(
      (position for position, members in enumerate(member_sets) if k in members),
      None,
    )
    for position, column in enumerate(columns):
      column.append(1 if position == owner else 0)

  return columns

In [7]:
def fit_cox_model(cluster, data_frame):
  """Fit a Cox proportional-hazards model that includes a cluster-membership covariate.

  The membership column is attached to a copy of ``data_frame`` under the name
  ``'cluster'`` (replacing any existing column of that name), rows containing
  any null value are removed, and a ``CoxPHFitter`` is fitted with
  ``'OS.time'`` as the duration column and ``'OS'`` as the event column.
  The caller's ``data_frame`` is not modified.

  Args:
    cluster: sequence of membership values, one per row of ``data_frame``,
      in row order.
    data_frame: pandas DataFrame that contains at least the columns
      ``'OS.time'`` and ``'OS'``.

  Returns:
    Tuple ``(p_value, hazard_ratio)`` of the ``'cluster'`` covariate, both as
    Python floats.

  Raises:
    ValueError: if ``cluster`` and ``data_frame`` differ in length.
  """
  print("Original dataframe shape: ", data_frame.shape)

  # Fail loudly instead of letting a length mismatch turn into NaN rows that
  # would then be silently discarded by the null filter below.
  if len(cluster) != len(data_frame):
    raise ValueError(
      "cluster has {n_cluster} entries but data_frame has {n_rows} rows".format(
        n_cluster=len(cluster), n_rows=len(data_frame)))

  # Drop a 'cluster' column that could have been created previously, then attach
  # the membership by row position (pd.concat would align on index labels instead).
  model_frame = data_frame.drop(columns='cluster', errors='ignore').assign(cluster=list(cluster))

  # Get rows with null entry
  null_mask = model_frame.isnull().any(axis=1)
  null_rows = model_frame[null_mask]
  print(null_rows)

  # Remove rows with null entry
  model_frame = model_frame.dropna() # remove all rows with any null value
  print("Final dataframe shape: ", model_frame.shape)

  # Fit Cox model
  cph = CoxPHFitter()
  cph.fit(model_frame, duration_col = 'OS.time', event_col = 'OS')
  cph.print_summary()

  # exp(coef) <=> hazard ratio. It is an ESTIMATE of the true hazard ratio and has a standard error associated with it.
  # A one unit increase in the covariate multiplies the hazard by the hazard ratio.

  # Every coefficient comes with a p-value.
  # The p-value is the probability of observing a coefficient at least this extreme in a sample if the null hypothesis was true.
  # The null hypothesis states that the coefficient=0, meaning that the predictor variable does not influence the hazard rate (occurrence of the event).

  # The CoxPHFitter computes p-values using the chi-squared test.
  # The reference is in "Survival Analysis by John P. Klein and Melvin L. Moeschberger, Second Edition", page 256

  # Use the public summary table and look the covariate up by name, so the result
  # does not depend on a private method or on 'cluster' being the fifth covariate.
  p_values = cph.summary['p']
  hazard_ratios = cph.hazard_ratios_
  coefficients_ci = cph.confidence_intervals_
  print(p_values.to_numpy())
  print(hazard_ratios.tolist())
  print(coefficients_ci)

  cluster_p_value = float(p_values['cluster'])
  cluster_hazard_ratio = float(hazard_ratios['cluster'])
  print("p-value:", cluster_p_value)
  print("hazard-ratio:", cluster_hazard_ratio)

  return cluster_p_value, cluster_hazard_ratio

In [10]:
for i in range (1, 2):
  # Consensus-cluster the embedding, reduce every cluster to the members that always
  # co-clustered with its first member, and test each sufficiently large reduced
  # cluster as a binary covariate in a Cox model. Results are logged to Weights & Biases.

  # Clinical covariates that enter the Cox model next to the cluster indicator.
  data_frame = pd.read_csv("TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv", sep="\t")
  data_frame = data_frame[['age', 'OS', 'OS.time', 'stage_2', 'stage_3', 'stage_4']]

  #name_embedding = "VAE_Z_EM_Mean_{i}".format(i=i)
  name_embedding = "UMAP_EM"
  embeddings = np.loadtxt(name_embedding+".txt")
  print("Embeddings shape: ", embeddings.shape)
  print("------------- i =", i, "--------------------")
  number_of_clusters = 6
  min_cluster_size = 10      # only clusters with more members than this are analysed
  p_value_threshold = 0.001  # membership columns below this p-value are saved to disk

  # NOTE(review): no random seed is set before clustering, so labels may differ
  # between runs - confirm whether reproducible clusters are required.
  ckm = CKmeans(k=number_of_clusters, n_rep=20, p_samp=1, p_feat=1)
  ckm.fit(embeddings)
  ckm_results = ckm.predict(embeddings, linkage_type='average')
  print(ckm_results.cl)
  consensus_matrix = ckm_results.cmatrix
  print(consensus_matrix)

  # Collect the sample indices of every cluster label 0 .. number_of_clusters - 1.
  clusters = [
    [j for j, assigned_label in enumerate(ckm_results.cl) if assigned_label == label]
    for label in range(number_of_clusters)
  ]
  print("Cluster sizes: ", " ,  ".join(str(len(members)) for members in clusters))
  print("Sum of cluster sizes: ", sum(len(members) for members in clusters))

  clusters = [drop_elements_from_cluster(members, consensus_matrix) for members in clusters]
  print("Cluster sizes after dropping:", " ,  ".join(str(len(members)) for members in clusters))

  # The individual names are kept so that code relying on them keeps working.
  cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5 = clusters

  cluster_column_list = convert_indices_to_binary_list(embeddings.shape[0], cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5)
  print("Column_lengths: ",  len(cluster_column_list[0]))

  for j in range(0, number_of_clusters):
    cluster_size = cluster_column_list[j].count(1)
    if cluster_size <= min_cluster_size:
      continue

    print("Cluster size in column form:", cluster_size)
    p_value, hazard_ratio = fit_cox_model(cluster_column_list[j], data_frame)
    print("p-value:", p_value)
    print("hazard ratio:", hazard_ratio)
    cluster_name = "{base}_{j}".format(base=name_embedding, j=j)

    # NOTE(review): the project name is spelled "ANALAYSIS"; it is left unchanged
    # so that new runs keep landing in the same W&B project as earlier ones.
    run = wandb.init(project="SURVIVAL_ANALAYSIS_k=6", # Set the project where this run will be logged
                     name=cluster_name
    )
    try:
      run.log({
        "p_value": p_value,
        "hazard_ratio": hazard_ratio,
        "cluster_size": cluster_size
      })
    finally:
      # Close the run explicitly; otherwise it stays open until the next
      # wandb.init() call and the last run of the loop is never finished.
      run.finish()

    if p_value < p_value_threshold:
      file_name = cluster_name + ".txt"
      np.savetxt(file_name, cluster_column_list[j])



Embeddings shape:  (1079, 10)
------------- i = 1 --------------------
[3 1 1 ... 1 2 2]
[[1.   0.   0.   ... 0.   0.   0.15]
 [0.   1.   0.85 ... 0.8  0.25 0.4 ]
 [0.   0.85 1.   ... 0.65 0.4  0.55]
 ...
 [0.   0.8  0.65 ... 1.   0.15 0.3 ]
 [0.   0.25 0.4  ... 0.15 1.   0.5 ]
 [0.15 0.4  0.55 ... 0.3  0.5  1.  ]]
Cluster sizes:  176 ,  134 ,  110 ,  312 ,  111 ,  236
Sum of cluster sizes:  1079
Cluster sizes after dropping: 167 ,  8 ,  1 ,  22 ,  1 ,  22
Column_lengths:  1079
Cluster size in column form: 167
Original dataframe shape:  (1079, 6)
       age   OS  OS.time  stage_2  stage_3  stage_4  cluster
0     55.0  0.0   4047.0      NaN      NaN      NaN        0
180   81.0  0.0    608.0      NaN      NaN      NaN        0
213   76.0  0.0   1217.0      NaN      NaN      NaN        0
222   76.0  0.0    304.0      NaN      NaN      NaN        0
223   40.0  0.0    304.0      NaN      NaN      NaN        0
225   69.0  0.0     31.0      NaN      NaN      NaN        0
235   68.0  0.0    5

0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-749.03
time fit was run,2024-03-07 19:47:16 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.36,<0.005,23.54
stage_2,0.6,1.83,0.28,0.05,1.15,1.05,3.17,0.0,2.15,0.03,4.98
stage_3,1.33,3.78,0.3,0.75,1.91,2.12,6.76,0.0,4.49,<0.005,17.12
stage_4,2.44,11.48,0.37,1.72,3.16,5.58,23.63,0.0,6.63,<0.005,34.78
cluster,0.14,1.15,0.25,-0.35,0.63,0.71,1.87,0.0,0.56,0.57,0.81

0,1
Concordance,0.76
Partial AIC,1508.05
log-likelihood ratio test,75.87 on 5 df
-log2(p) of ll-ratio test,47.22


[34m[1mwandb[0m: Currently logged in as: [33mcosybio-compsysmed[0m. Use [1m`wandb login --relogin`[0m to force relogin


[8.20868421e-08 3.17267157e-02 7.04378675e-06 3.39043480e-11
 5.72318198e-01]
[1.0372816524052657, 1.828850735211504, 3.7832788076862527, 11.480635583786384, 1.1503572818404937]
           95% lower-bound  95% upper-bound
covariate                                  
age               0.023225         0.049982
stage_2           0.052805         1.154570
stage_3           0.750070         1.911112
stage_4           1.718992         3.162331
cluster          -0.346143         0.626288
p-value: 0.5723181979821359
hazard-ratio: 1.1503572818404937
p-value: 0.5723181979821359
hazard ratio: 1.1503572818404937


Cluster size in column form: 22
Original dataframe shape:  (1079, 6)
       age   OS  OS.time  stage_2  stage_3  stage_4  cluster
0     55.0  0.0   4047.0      NaN      NaN      NaN        1
180   81.0  0.0    608.0      NaN      NaN      NaN        0
213   76.0  0.0   1217.0      NaN      NaN      NaN        0
222   76.0  0.0    304.0      NaN      NaN      NaN        0
223   40.0  0.0    304.0      NaN      NaN      NaN        0
225   69.0  0.0     31.0      NaN      NaN      NaN        0
235   68.0  0.0    579.0      NaN      NaN      NaN        0
397   43.0  1.0   3262.0      NaN      NaN      NaN        0
459   46.0  1.0    749.0      NaN      NaN      NaN        0
463   90.0  1.0   1542.0      NaN      NaN      NaN        1
470   45.0  1.0   2573.0      NaN      NaN      NaN        0
474   61.0  0.0   7777.0      NaN      NaN      NaN        0
479   57.0  1.0   2373.0      NaN      NaN      NaN        0
482   73.0  1.0   3126.0      NaN      NaN      NaN        0
488   58.0  1.0 

0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-748.98
time fit was run,2024-03-07 19:47:19 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.34,<0.005,23.37
stage_2,0.61,1.84,0.28,0.06,1.16,1.06,3.2,0.0,2.17,0.03,5.07
stage_3,1.31,3.71,0.3,0.73,1.89,2.08,6.63,0.0,4.44,<0.005,16.75
stage_4,2.42,11.23,0.37,1.7,3.14,5.47,23.07,0.0,6.59,<0.005,34.38
cluster,-0.58,0.56,1.01,-2.55,1.4,0.08,4.05,0.0,-0.57,0.57,0.82

0,1
Concordance,0.76
Partial AIC,1507.97
log-likelihood ratio test,75.96 on 5 df
-log2(p) of ll-ratio test,47.28


[9.24179615e-08 2.97553713e-02 9.07228806e-06 4.47537195e-11
 5.67865641e-01]
[1.0368050017015507, 1.8422830860012573, 3.7136571170009938, 11.233081116648682, 0.5623255084499026]
           95% lower-bound  95% upper-bound
covariate                                  
age               0.022880         0.049407
stage_2           0.059986         1.162026
stage_3           0.732611         1.891423
stage_4           1.699175         3.138551
cluster          -2.550999         1.399651
p-value: 0.567865641314389
hazard-ratio: 0.5623255084499026
p-value: 0.567865641314389
hazard ratio: 0.5623255084499026


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
cluster_size,▁
hazard_ratio,▁
p_value,▁

0,1
cluster_size,167.0
hazard_ratio,1.15036
p_value,0.57232


Cluster size in column form: 22
Original dataframe shape:  (1079, 6)
       age   OS  OS.time  stage_2  stage_3  stage_4  cluster
0     55.0  0.0   4047.0      NaN      NaN      NaN        0
180   81.0  0.0    608.0      NaN      NaN      NaN        0
213   76.0  0.0   1217.0      NaN      NaN      NaN        0
222   76.0  0.0    304.0      NaN      NaN      NaN        0
223   40.0  0.0    304.0      NaN      NaN      NaN        0
225   69.0  0.0     31.0      NaN      NaN      NaN        0
235   68.0  0.0    579.0      NaN      NaN      NaN        0
397   43.0  1.0   3262.0      NaN      NaN      NaN        0
459   46.0  1.0    749.0      NaN      NaN      NaN        0
463   90.0  1.0   1542.0      NaN      NaN      NaN        0
470   45.0  1.0   2573.0      NaN      NaN      NaN        0
474   61.0  0.0   7777.0      NaN      NaN      NaN        0
479   57.0  1.0   2373.0      NaN      NaN      NaN        0
482   73.0  1.0   3126.0      NaN      NaN      NaN        0
488   58.0  1.0 

0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-749.13
time fit was run,2024-03-07 19:47:26 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.33,<0.005,23.31
stage_2,0.61,1.84,0.28,0.06,1.16,1.06,3.19,0.0,2.17,0.03,5.05
stage_3,1.33,3.77,0.3,0.75,1.91,2.11,6.75,0.0,4.47,<0.005,17.0
stage_4,2.43,11.32,0.37,1.71,3.15,5.51,23.25,0.0,6.61,<0.005,34.61
cluster,-0.17,0.85,0.51,-1.17,0.84,0.31,2.31,0.0,-0.33,0.74,0.43

0,1
Concordance,0.76
Partial AIC,1508.25
log-likelihood ratio test,75.68 on 5 df
-log2(p) of ll-ratio test,47.08


[9.63259827e-08 3.02348046e-02 7.64511854e-06 3.82111156e-11
 7.43476933e-01]
[1.0369129948375997, 1.8386852519280719, 3.77235198872583, 11.32254518430868, 0.8454734939518178]
           95% lower-bound  95% upper-bound
covariate                                  
age               0.022928         0.049568
stage_2           0.058189         1.159912
stage_3           0.746178         1.909220
stage_4           1.707307         3.146284
cluster          -1.173188         0.837471
p-value: 0.743476932709636
hazard-ratio: 0.8454734939518178
p-value: 0.743476932709636
hazard ratio: 0.8454734939518178


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
cluster_size,▁
hazard_ratio,▁
p_value,▁

0,1
cluster_size,22.0
hazard_ratio,0.56233
p_value,0.56787
