In [1]:
!pip install lifelines
!pip install pyckmeans
!pip install wandb -qU

Collecting lifelines
  Downloading lifelines-0.28.0-py3-none-any.whl (349 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.2/349.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.0.1-py3-none-any.whl (94 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.2/94.2 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4031 sha256=4bdffa6af1e7fbebb24f40409e3dbffd17577c7954b72e5a3c2d7c92de7eca7f
  Stored in

In [2]:
import numpy as np
import pandas as pd
from pyckmeans import CKmeans
import matplotlib.pyplot as plt

from lifelines.fitters.coxph_fitter import CoxPHFitter

from google.colab import drive

import wandb
import os

In [3]:
# Authenticate this session with Weights & Biases before any run is created below.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# --- Load raw BRCA data ---
drive.mount('/content/gdrive')
!unzip gdrive/MyDrive/TCGA-BRCA_1079.zip

Mounted at /content/gdrive
Archive:  gdrive/MyDrive/TCGA-BRCA_1079.zip
  inflating: TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv  


In [5]:
def drop_elements_from_cluster(cluster, consensus_matrix):
  """Keep only the members whose consensus with the cluster's first member is exactly 1.

  Args:
    cluster: sequence of sample indices belonging to one cluster.
    consensus_matrix: 2-D structure indexable as ``consensus_matrix[a][b]``.

  Returns:
    A new list with the members of ``cluster`` (original order preserved) for
    which ``consensus_matrix[cluster[0]][member] == 1``. An empty cluster
    yields an empty list.
  """
  if len(cluster) == 0:
    return []
  # Row of the reference member: every other member is compared against it.
  anchor_row = consensus_matrix[cluster[0]]
  return [member for member in cluster if anchor_row[member] == 1]

In [6]:
def convert_indices_to_binary_list(number_elements, cluster_0, cluster_1, cluster_2, cluster_3):
  """Turn four index collections into four 0/1 membership columns.

  Args:
    number_elements: length of every returned column (indices ``0 .. number_elements-1``).
    cluster_0, cluster_1, cluster_2, cluster_3: iterables of sample indices.

  Returns:
    ``[column_0, column_1, column_2, column_3]`` where ``column_c[k]`` is 1 if
    index ``k`` belongs to cluster ``c`` and 0 otherwise. An index listed in
    several clusters is attributed to the lowest-numbered one only; an index
    listed in none gets 0 in every column.
  """
  # Sets give constant-time membership tests; the clusters are scanned once
  # here instead of once per sample index.
  member_sets = [set(members) for members in (cluster_0, cluster_1, cluster_2, cluster_3)]
  columns = [[0] * number_elements for _ in member_sets]
  for k in range(number_elements):
    for column, members in zip(columns, member_sets):
      if k in members:
        column[k] = 1
        break  # first matching cluster wins
  return columns

In [7]:
def fit_cox_model(cluster, data_frame):
  """Fit a Cox proportional-hazards model with a cluster-membership covariate.

  The membership column is appended to ``data_frame`` as ``'cluster'``, rows
  containing any null value are removed, and a ``CoxPHFitter`` is fitted with
  ``'OS.time'`` as duration and ``'OS'`` as event indicator. The caller's
  frame is not modified.

  Args:
    cluster: sequence of membership values (one per row of ``data_frame``).
    data_frame: covariate table; must contain ``'OS.time'`` and ``'OS'``.

  Returns:
    ``(p_value, hazard_ratio)`` of the ``'cluster'`` covariate.
  """
  print("Original dataframe shape: ", data_frame.shape)
  cluster_frame = pd.DataFrame({'cluster': cluster})
  # Concatenate cluster membership to data table.
  # NOTE: concat(axis=1) aligns on index labels, and cluster_frame has a default
  # 0..n-1 index, so data_frame is expected to carry the same default index.
  if 'cluster' in data_frame.columns.values:
    data_frame = data_frame.drop('cluster', axis = 1) # Drop a column that could have been created previously
  data_frame = pd.concat([data_frame, cluster_frame], axis = 1)

  # Show the rows that are about to be discarded because of a null entry
  null_mask = data_frame.isnull().any(axis=1)
  null_rows = data_frame[null_mask]
  print(null_rows)

  # Remove all rows with any null value (reassigned rather than mutated in place)
  data_frame = data_frame.dropna()
  print("Final dataframe shape: ", data_frame.shape)

  # Fit Cox model
  cph = CoxPHFitter()
  cph.fit(data_frame, duration_col = 'OS.time', event_col = 'OS')
  cph.print_summary()

  # exp(coef) <=> hazard ratio. It is an estimate of the true hazard ratio and has a standard error associated with it.
  # A one unit increase in the covariate multiplies the hazard by the hazard ratio.

  # Every coefficient comes with a p-value.
  # The p-value represents the probability of observing this coefficient in a sample if the null hypothesis was true.
  # The null hypothesis states that the coefficient=0, meaning that the predictor variable does not influence the hazard rate (occurrence of the event).

  # The CoxPHFitter computes p-values using the chi-squared test.
  # The reference is in "Survival Analysis by John P. Klein and Melvin L. Moeschberger, Second Edition", page 256

  # Use the public, covariate-indexed results so that the 'cluster' entry is
  # looked up by name instead of by a position that depends on column order.
  p_values = cph.summary['p']
  hazard_ratios = cph.hazard_ratios_
  coefficients_ci = cph.confidence_intervals_
  print(p_values.to_numpy())
  print(hazard_ratios.tolist())
  print(coefficients_ci)

  cluster_p_value = p_values['cluster']
  cluster_hazard_ratio = float(hazard_ratios['cluster'])
  print("p-value:", cluster_p_value)
  print("hazard-ratio:", cluster_hazard_ratio)

  return cluster_p_value, cluster_hazard_ratio

In [10]:
# Consensus-cluster the embedding, keep only the stable core of each cluster,
# and test each core's association with overall survival via a Cox model.
# Every tested cluster is logged as its own Weights & Biases run.
for i in range (1, 2):
  # Clinical covariates used by the Cox model (reloaded on every iteration so
  # each pass starts from the unmodified table).
  data_frame = pd.read_csv("TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv", sep="\t")
  data_frame = data_frame[['age', 'OS', 'OS.time', 'stage_2', 'stage_3', 'stage_4']]

  #name_embedding = "VAE_Z_EM_Mean_{i}".format(i=i)
  name_embedding = "UMAP_EM"
  embeddings = np.loadtxt(name_embedding+".txt")
  print("Embeddings shape: ", embeddings.shape)
  print("------------- i =", i, "--------------------")
  number_of_clusters = 4
  ckm = CKmeans(k=number_of_clusters, n_rep=20, p_samp=1, p_feat=1)
  ckm.fit(embeddings)
  ckm_results = ckm.predict(embeddings, linkage_type='average')
  print(ckm_results.cl)
  consensus_matrix = ckm_results.cmatrix
  print(consensus_matrix)

  # Collect the sample indices assigned to each of the four cluster labels.
  cluster_0 = []
  cluster_1 = []
  cluster_2 = []
  cluster_3 = []
  for j in range(0, ckm_results.cl.shape[0]):
    if ckm_results.cl[j] == 1:
      cluster_1.append(j)
    elif ckm_results.cl[j] == 2:
      cluster_2.append(j)
    elif ckm_results.cl[j] == 3:
      cluster_3.append(j)
    elif ckm_results.cl[j] == 0:
      cluster_0.append(j)
  print("Cluster sizes: ", len(cluster_0), ", ", len(cluster_1), ", ", len(cluster_2), ", ", len(cluster_3))
  print("Sum of cluster sizes: ", len(cluster_0)+len(cluster_1)+len(cluster_2)+len(cluster_3))

  # Keep only members whose consensus with the cluster's first member is 1.
  cluster_0 = drop_elements_from_cluster(cluster_0, consensus_matrix)
  cluster_1 = drop_elements_from_cluster(cluster_1, consensus_matrix)
  cluster_2 = drop_elements_from_cluster(cluster_2, consensus_matrix)
  cluster_3 = drop_elements_from_cluster(cluster_3, consensus_matrix)
  print("Cluster sizes after dropping:", len(cluster_0), ", ", len(cluster_1), ", ", len(cluster_2), ", ", len(cluster_3),)

  cluster_column_list = convert_indices_to_binary_list(embeddings.shape[0], cluster_0, cluster_1, cluster_2, cluster_3)
  print("Column_lengths: ",  len(cluster_column_list[0]))

  for j in range(0, number_of_clusters):
    cluster_size = cluster_column_list[j].count(1)
    # Clusters with 10 or fewer remaining members are not tested.
    if cluster_size > 10:
      print("Cluster size in column form:", cluster_size)
      p_value, hazard_ratio = fit_cox_model(cluster_column_list[j], data_frame)
      print("p-value:", p_value)
      print("hazard ratio:", hazard_ratio)
      # NOTE(review): the run-name prefix says "k_5" while number_of_clusters is 4
      # and the project is named "...k=4" - confirm which one is intended.
      cluster_name = "k_5_{base}_{j}".format(base=name_embedding, j=j)
      # The `with` block finishes the run as soon as its metrics are logged, so
      # each run is closed before the next one starts and the last one is not
      # left open.
      with wandb.init(project="SURVIVAL_ANALAYSIS_k=4", # Set the project where this run will be logged
                      name=cluster_name
      ) as run:
        run.log({
                  "p_value": p_value,
                  "hazard_ratio": hazard_ratio,
                  "cluster_size": cluster_size
              })

      # Persist the membership column of strongly significant clusters.
      if p_value < 0.001:
        file_name = cluster_name + ".txt"
        np.savetxt(file_name, cluster_column_list[j])



Embeddings shape:  (1079, 10)
------------- i = 1 --------------------
[3 1 1 ... 1 1 1]
[[1.   0.   0.   ... 0.   0.   0.05]
 [0.   1.   1.   ... 1.   1.   0.85]
 [0.   1.   1.   ... 1.   1.   0.85]
 ...
 [0.   1.   1.   ... 1.   1.   0.85]
 [0.   1.   1.   ... 1.   1.   0.85]
 [0.05 0.85 0.85 ... 0.85 0.85 1.  ]]
Cluster sizes:  183 ,  223 ,  332 ,  341
Sum of cluster sizes:  1079
Cluster sizes after dropping: 181 ,  145 ,  79 ,  116
Column_lengths:  1079
Cluster size in column form: 181
Original dataframe shape:  (1079, 6)
       age   OS  OS.time  stage_2  stage_3  stage_4  cluster
0     55.0  0.0   4047.0      NaN      NaN      NaN        0
180   81.0  0.0    608.0      NaN      NaN      NaN        0
213   76.0  0.0   1217.0      NaN      NaN      NaN        0
222   76.0  0.0    304.0      NaN      NaN      NaN        0
223   40.0  0.0    304.0      NaN      NaN      NaN        0
225   69.0  0.0     31.0      NaN      NaN      NaN        0
235   68.0  0.0    579.0      NaN      Na

0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-748.62
time fit was run,2024-03-07 19:35:45 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.43,<0.005,24.09
stage_2,0.6,1.82,0.28,0.05,1.15,1.05,3.16,0.0,2.13,0.03,4.91
stage_3,1.34,3.81,0.3,0.76,1.92,2.14,6.81,0.0,4.52,<0.005,17.33
stage_4,2.45,11.64,0.37,1.73,3.18,5.65,23.96,0.0,6.66,<0.005,35.12
cluster,0.25,1.29,0.23,-0.21,0.71,0.81,2.04,0.0,1.08,0.28,1.84

0,1
Concordance,0.76
Partial AIC,1507.25
log-likelihood ratio test,76.68 on 5 df
-log2(p) of ll-ratio test,47.78


[34m[1mwandb[0m: Currently logged in as: [33mcosybio-compsysmed[0m. Use [1m`wandb login --relogin`[0m to force relogin


[5.61409987e-08 3.32138681e-02 6.08511394e-06 2.66919133e-11
 2.78740574e-01]
[1.0377348756585514, 1.8197070605073922, 3.814634860046608, 11.639785101419072, 1.289267250624694]
           95% lower-bound  95% upper-bound
covariate                                  
age               0.023672         0.050408
stage_2           0.047659         1.149692
stage_3           0.758732         1.918958
stage_4           1.732525         3.176333
cluster          -0.205673         0.713821
p-value: 0.2787405736610419
hazard-ratio: 1.289267250624694
p-value: 0.2787405736610419
hazard ratio: 1.289267250624694


Cluster size in column form: 145
Original dataframe shape:  (1079, 6)
       age   OS  OS.time  stage_2  stage_3  stage_4  cluster
0     55.0  0.0   4047.0      NaN      NaN      NaN        0
180   81.0  0.0    608.0      NaN      NaN      NaN        0
213   76.0  0.0   1217.0      NaN      NaN      NaN        0
222   76.0  0.0    304.0      NaN      NaN      NaN        0
223   40.0  0.0    304.0      NaN      NaN      NaN        0
225   69.0  0.0     31.0      NaN      NaN      NaN        0
235   68.0  0.0    579.0      NaN      NaN      NaN        0
397   43.0  1.0   3262.0      NaN      NaN      NaN        0
459   46.0  1.0    749.0      NaN      NaN      NaN        0
463   90.0  1.0   1542.0      NaN      NaN      NaN        0
470   45.0  1.0   2573.0      NaN      NaN      NaN        1
474   61.0  0.0   7777.0      NaN      NaN      NaN        0
479   57.0  1.0   2373.0      NaN      NaN      NaN        0
482   73.0  1.0   3126.0      NaN      NaN      NaN        0
488   58.0  1.0

0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-746.98
time fit was run,2024-03-07 19:35:46 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.45,<0.005,24.28
stage_2,0.63,1.87,0.28,0.07,1.18,1.08,3.25,0.0,2.22,0.03,5.26
stage_3,1.37,3.94,0.3,0.79,1.95,2.2,7.04,0.0,4.63,<0.005,18.04
stage_4,2.4,11.07,0.37,1.68,3.12,5.39,22.73,0.0,6.55,<0.005,33.97
cluster,-0.57,0.56,0.29,-1.15,0.0,0.32,1.0,0.0,-1.95,0.05,4.27

0,1
Concordance,0.76
Partial AIC,1503.97
log-likelihood ratio test,79.96 on 5 df
-log2(p) of ll-ratio test,50.05


[4.91537135e-08 2.61476044e-02 3.71756468e-06 5.94891426e-11
 5.17650595e-02]
[1.0376612990486516, 1.869689434779396, 3.940046081199539, 11.065361482935982, 0.5649574118357005]
           95% lower-bound  95% upper-bound
covariate                                  
age               0.023685         0.050254
stage_2           0.074296         1.177249
stage_3           0.790312         1.952073
stage_4           1.683974         3.123665
cluster          -1.146379         0.004369
p-value: 0.051765059470079124
hazard-ratio: 0.5649574118357005
p-value: 0.051765059470079124
hazard ratio: 0.5649574118357005


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
cluster_size,▁
hazard_ratio,▁
p_value,▁

0,1
cluster_size,181.0
hazard_ratio,1.28927
p_value,0.27874


Cluster size in column form: 79
Original dataframe shape:  (1079, 6)
       age   OS  OS.time  stage_2  stage_3  stage_4  cluster
0     55.0  0.0   4047.0      NaN      NaN      NaN        0
180   81.0  0.0    608.0      NaN      NaN      NaN        1
213   76.0  0.0   1217.0      NaN      NaN      NaN        1
222   76.0  0.0    304.0      NaN      NaN      NaN        0
223   40.0  0.0    304.0      NaN      NaN      NaN        0
225   69.0  0.0     31.0      NaN      NaN      NaN        0
235   68.0  0.0    579.0      NaN      NaN      NaN        0
397   43.0  1.0   3262.0      NaN      NaN      NaN        0
459   46.0  1.0    749.0      NaN      NaN      NaN        0
463   90.0  1.0   1542.0      NaN      NaN      NaN        0
470   45.0  1.0   2573.0      NaN      NaN      NaN        0
474   61.0  0.0   7777.0      NaN      NaN      NaN        0
479   57.0  1.0   2373.0      NaN      NaN      NaN        0
482   73.0  1.0   3126.0      NaN      NaN      NaN        0
488   58.0  1.0 

0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-748.37
time fit was run,2024-03-07 19:35:52 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.36,<0.005,23.55
stage_2,0.61,1.84,0.28,0.06,1.16,1.06,3.2,0.0,2.18,0.03,5.09
stage_3,1.31,3.72,0.3,0.74,1.89,2.09,6.63,0.0,4.45,<0.005,16.85
stage_4,2.44,11.53,0.37,1.72,3.17,5.61,23.69,0.0,6.65,<0.005,35.02
cluster,0.38,1.46,0.28,-0.18,0.94,0.84,2.55,0.0,1.34,0.18,2.46

0,1
Concordance,0.75
Partial AIC,1506.75
log-likelihood ratio test,77.18 on 5 df
-log2(p) of ll-ratio test,48.12


[8.11862472e-08 2.94505108e-02 8.48888458e-06 2.87738855e-11
 1.81811414e-01]
[1.0368479969837543, 1.843660071697506, 3.7201545039294035, 11.529789304849638, 1.4613142208512526]
           95% lower-bound  95% upper-bound
covariate                                  
age               0.022965         0.049406
stage_2           0.061091         1.162415
stage_3           0.735449         1.892081
stage_4           1.724629         3.165239
cluster          -0.177499         0.936171
p-value: 0.18181141416794727
hazard-ratio: 1.4613142208512526
p-value: 0.18181141416794727
hazard ratio: 1.4613142208512526


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
cluster_size,▁
hazard_ratio,▁
p_value,▁

0,1
cluster_size,145.0
hazard_ratio,0.56496
p_value,0.05177


Cluster size in column form: 116
Original dataframe shape:  (1079, 6)
       age   OS  OS.time  stage_2  stage_3  stage_4  cluster
0     55.0  0.0   4047.0      NaN      NaN      NaN        1
180   81.0  0.0    608.0      NaN      NaN      NaN        0
213   76.0  0.0   1217.0      NaN      NaN      NaN        0
222   76.0  0.0    304.0      NaN      NaN      NaN        1
223   40.0  0.0    304.0      NaN      NaN      NaN        0
225   69.0  0.0     31.0      NaN      NaN      NaN        0
235   68.0  0.0    579.0      NaN      NaN      NaN        0
397   43.0  1.0   3262.0      NaN      NaN      NaN        0
459   46.0  1.0    749.0      NaN      NaN      NaN        1
463   90.0  1.0   1542.0      NaN      NaN      NaN        1
470   45.0  1.0   2573.0      NaN      NaN      NaN        0
474   61.0  0.0   7777.0      NaN      NaN      NaN        0
479   57.0  1.0   2373.0      NaN      NaN      NaN        0
482   73.0  1.0   3126.0      NaN      NaN      NaN        0
488   58.0  1.0

0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-747.87
time fit was run,2024-03-07 19:35:58 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.53,<0.005,24.92
stage_2,0.59,1.8,0.28,0.04,1.14,1.04,3.12,0.0,2.09,0.04,4.78
stage_3,1.31,3.72,0.3,0.73,1.89,2.08,6.63,0.0,4.44,<0.005,16.79
stage_4,2.43,11.33,0.37,1.71,3.15,5.51,23.28,0.0,6.61,<0.005,34.56
cluster,-0.45,0.64,0.3,-1.03,0.13,0.36,1.14,0.0,-1.53,0.13,2.98

0,1
Concordance,0.75
Partial AIC,1505.75
log-likelihood ratio test,78.18 on 5 df
-log2(p) of ll-ratio test,48.82


[3.14261335e-08 3.64805565e-02 8.83598492e-06 3.94319528e-11
 1.26584883e-01]
[1.0385315843490837, 1.7995442123065928, 3.7160943217021876, 11.329483861939893, 0.635875864109376]
           95% lower-bound  95% upper-bound
covariate                                  
age               0.024416         0.051200
stage_2           0.036958         1.138108
stage_3           0.733719         1.891628
stage_4           1.707231         3.147586
cluster          -1.033607         0.128103
p-value: 0.12658488349699898
hazard-ratio: 0.635875864109376
p-value: 0.12658488349699898
hazard ratio: 0.635875864109376


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
cluster_size,▁
hazard_ratio,▁
p_value,▁

0,1
cluster_size,79.0
hazard_ratio,1.46131
p_value,0.18181
