In [None]:
# Install the third-party packages this notebook imports: lifelines, pyckmeans and wandb.
# NOTE(review): no versions are pinned, so a later re-run may resolve different releases
# than the ones that produced the outputs stored in this notebook — consider pinning.
!pip install lifelines
!pip install pyckmeans
# -q keeps pip's log short, -U upgrades wandb if an older copy is already installed.
!pip install wandb -qU

Collecting lifelines
  Downloading lifelines-0.28.0-py3-none-any.whl (349 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/349.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/349.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.2/349.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.0.1-py3-none-any.whl (94 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.2/94.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autog

In [None]:
import numpy as np
import pandas as pd
from lifelines.fitters.coxph_fitter import CoxPHFitter
from pyckmeans import CKmeans
import matplotlib.pyplot as plt

from google.colab import drive

import os
import wandb

In [None]:
# --- Load raw BRCA data ---
drive.mount('/content/gdrive')
!unzip gdrive/MyDrive/TCGA-BRCA_1079.zip

Mounted at /content/gdrive
Archive:  gdrive/MyDrive/TCGA-BRCA_1079.zip
  inflating: TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv  


In [None]:
# Clinical annotation table: keep the survival endpoint (OS.time / OS) and the
# covariates that enter the model (age plus the stage_2 .. stage_4 columns).
data_frame = pd.read_csv("TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv", sep="\t")
print("Original dataframe shape:", data_frame.shape)
data_frame = data_frame[['age', 'OS', 'OS.time', 'stage_2', 'stage_3', 'stage_4']]

# One cluster value per sample.
# NOTE(review): this file is not among the files extracted from the archive, so it must
# already exist in the working directory. Its rows are presumably in the same sample
# order as the annotation table — confirm.
cluster_column = np.loadtxt("VAE_Z_EM_Mean_4_0.txt")
print("Cluster column shape:", cluster_column.shape)

# The column-wise concat below aligns on the row index. A length mismatch would be padded
# with NaN and the affected rows would then disappear in dropna(), so fail loudly instead.
if cluster_column.shape != (len(data_frame),):
  raise ValueError(
      f"Expected one cluster value per sample ({len(data_frame)} rows), "
      f"got an array of shape {cluster_column.shape}"
  )
cluster_frame = pd.DataFrame({'cluster': cluster_column})

# Concatenate cluster membership to data table
data_frame = pd.concat([data_frame, cluster_frame], axis = 1)

# Get rows with null entry (kept around for inspection)
null_mask = data_frame.isnull().any(axis=1)
null_rows = data_frame[null_mask]

# Remove rows with null entry; the shapes before and after show how many were dropped
print(data_frame.shape)
data_frame = data_frame.dropna()
print(data_frame.shape)

# Fit Cox model: every column other than the duration and event columns is a covariate
cph = CoxPHFitter()
cph.fit(data_frame, duration_col = 'OS.time', event_col = 'OS')
cph.print_summary()

# Read the p-values from the public summary table instead of a private helper, and look
# the cluster covariate up by name rather than by its position among the covariates.
p_values = cph.summary["p"].to_numpy()

print("p-value cluster: ", cph.summary.loc["cluster", "p"])

Original dataframe shape: (1079, 18)
Cluster column shape: (1079,)
(1079, 7)
(1054, 7)


0,1
model,lifelines.CoxPHFitter
duration col,'OS.time'
event col,'OS'
baseline estimation,breslow
number of observations,1054
number of events observed,139
partial log-likelihood,-745.33
time fit was run,2024-03-15 10:40:47 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age,0.04,1.04,0.01,0.02,0.05,1.02,1.05,0.0,5.4,<0.005,23.87
stage_2,0.59,1.81,0.28,0.04,1.15,1.04,3.14,0.0,2.12,0.03,4.86
stage_3,1.35,3.86,0.3,0.77,1.93,2.16,6.9,0.0,4.57,<0.005,17.62
stage_4,2.46,11.7,0.37,1.74,3.18,5.69,24.04,0.0,6.69,<0.005,35.39
cluster,1.3,3.68,0.4,0.53,2.08,1.69,7.98,0.0,3.29,<0.005,9.98

0,1
Concordance,0.76
Partial AIC,1500.66
log-likelihood ratio test,83.26 on 5 df
-log2(p) of ll-ratio test,52.35


p-value cluster:  0.0009905756738502187
Cluster index:  1


In [None]:
# Cluster whose members are broken down by subtype label below.
CLUSTER_INDEX = 1

# Row positions of all samples assigned to the selected cluster.
list_of_indices = [j for j, value in enumerate(cluster_column) if value == CLUSTER_INDEX]
print("List of indices:", list_of_indices)
print("Size of subgroup:", len(list_of_indices))

subtype_data_frame = pd.read_csv("TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv", sep="\t")
# Subtype label of every sample, read positionally from the second column of the table.
# NOTE(review): this assumes the table's rows are in the same sample order as
# cluster_column — confirm.
subtype_labels = subtype_data_frame.values[:,1]
print(subtype_labels)

# Labels of the cluster members only; the counts below are taken over this list.
subgroup_labels = [subtype_labels[j] for j in list_of_indices]
num_lumA = subgroup_labels.count('LumA')
num_lumB = subgroup_labels.count('LumB')
num_basal = subgroup_labels.count('Basal')
num_her2 = subgroup_labels.count('Her2')
num_normal = subgroup_labels.count('Normal')

print("---------")
print("LumA: ", num_lumA)
print("LumB: ", num_lumB)
print("Basal: ", num_basal)
print("Her2: ", num_her2)
print("Normal: ", num_normal)
# Total over the five known subtypes; it is smaller than the subgroup size when some
# members carry a different or missing label. The name deliberately avoids `sum`, which
# would shadow the builtin for every later cell.
subgroup_total = num_lumA + num_lumB + num_basal + num_her2 + num_normal
print("Summe: ", subgroup_total)

List of indices: [8, 22, 113, 115, 133, 138, 139, 141, 162, 206, 212, 262, 300, 313, 374, 385, 510, 599, 763, 767, 771, 857, 877, 879, 913, 979, 982, 993, 1014, 1018, 1019, 1020, 1024, 1030, 1069, 1070]
Size of subgroup: 36
['LumB' 'Her2' 'LumB' ... 'LumA' 'LumA' 'LumB']
---------
LumA:  0
LumB:  2
Basal:  24
Her2:  7
Normal:  3
Summe:  36
