In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import seaborn as sn
from lifelines.fitters.kaplan_meier_fitter import KaplanMeierFitter

pd.set_option('display.max_columns', None)


In [None]:
df = pd.read_csv("brca_metabric/brca_metabric_clinical_data.tsv", sep="\t")
df.head()

## Prepare Dataframe

In [None]:
df["Censorship"] = df["Patient's Vital Status"] == "Living"
df.head()

Unnamed: 0,Study ID,Patient ID,Sample ID,Age at Diagnosis,Type of Breast Surgery,Cancer Type,Cancer Type Detailed,Cellularity,Chemotherapy,Pam50 + Claudin-low subtype,Cohort,ER status measured by IHC,ER Status,Neoplasm Histologic Grade,HER2 status measured by SNP6,HER2 Status,Tumor Other Histologic Subtype,Hormone Therapy,Inferred Menopausal State,Integrative Cluster,Primary Tumor Laterality,Lymph nodes examined positive,Mutation Count,Nottingham prognostic index,Oncotree Code,Overall Survival (Months),Overall Survival Status,PR Status,Radio Therapy,Relapse Free Status (Months),Relapse Free Status,Number of Samples Per Patient,Sample Type,Sex,3-Gene classifier subtype,TMB (nonsynonymous),Tumor Size,Tumor Stage,Patient's Vital Status,Censorship
0,brca_metabric,MB-0000,MB-0000,75.65,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,,NO,claudin-low,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Ductal/NST,YES,Post,4ER+,Right,10.0,,6.044,IDC,140.5,0:LIVING,Negative,YES,138.65,0:Not Recurred,1,Primary,Female,ER-/HER2-,0.0,22.0,2.0,Living,True
1,brca_metabric,MB-0002,MB-0002,43.19,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,NO,LumA,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Ductal/NST,YES,Pre,4ER+,Right,0.0,2.0,4.02,IDC,84.633333,0:LIVING,Positive,YES,83.52,0:Not Recurred,1,Primary,Female,ER+/HER2- High Prolif,2.615035,10.0,1.0,Living,True
2,brca_metabric,MB-0005,MB-0005,48.87,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,YES,LumB,1.0,Positve,Positive,2.0,NEUTRAL,Negative,Ductal/NST,YES,Pre,3,Right,1.0,2.0,4.03,IDC,163.7,1:DECEASED,Positive,NO,151.28,1:Recurred,1,Primary,Female,,2.615035,15.0,2.0,Died of Disease,False
3,brca_metabric,MB-0006,MB-0006,47.68,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,YES,LumB,1.0,Positve,Positive,2.0,NEUTRAL,Negative,Mixed,YES,Pre,9,Right,3.0,1.0,4.05,MDLC,164.933333,0:LIVING,Positive,YES,162.76,0:Not Recurred,1,Primary,Female,,1.307518,25.0,2.0,Living,True
4,brca_metabric,MB-0008,MB-0008,76.97,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,YES,LumB,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Mixed,YES,Post,9,Right,8.0,2.0,6.08,MDLC,41.366667,1:DECEASED,Positive,YES,18.55,1:Recurred,1,Primary,Female,ER+/HER2- High Prolif,2.615035,40.0,2.0,Died of Disease,False


## Test set

In [None]:
# drop rows with nan values
df_valid = df.dropna()
# split into test and train set 
df_train = df_valid.sample(frac=0.8, random_state=0)
df_test = df_valid.drop(df_train.index)

In [None]:
y_train = np.array(list(zip(df_train["Censorship"].values, df_train["Overall Survival (Months)"].values)), dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
X_train = df_train.drop(["Study ID", "Patient ID", "Sample ID","Overall Survival Status", "Patient's Vital Status", "Cancer Type", "Number of Samples Per Patient", "Sex", "Sample Type"
, "Cancer Type Detailed", "HER2 Status", "HER2 status measured by SNP6", "Tumor Other Histologic Subtype", "Oncotree Code", "Relapse Free Status", "Relapse Free Status (Months)"], axis = 1)
X_train = pd.get_dummies(X_train)
X_train.head()

Unnamed: 0,Age at Diagnosis,Cohort,Neoplasm Histologic Grade,Lymph nodes examined positive,Mutation Count,Nottingham prognostic index,Overall Survival (Months),TMB (nonsynonymous),Tumor Size,Tumor Stage,Censorship,Type of Breast Surgery_BREAST CONSERVING,Type of Breast Surgery_MASTECTOMY,Cellularity_High,Cellularity_Low,Cellularity_Moderate,Chemotherapy_NO,Chemotherapy_YES,Pam50 + Claudin-low subtype_Basal,Pam50 + Claudin-low subtype_Her2,Pam50 + Claudin-low subtype_LumA,Pam50 + Claudin-low subtype_LumB,Pam50 + Claudin-low subtype_Normal,Pam50 + Claudin-low subtype_claudin-low,ER status measured by IHC_Negative,ER status measured by IHC_Positve,ER Status_Negative,ER Status_Positive,Hormone Therapy_NO,Hormone Therapy_YES,Inferred Menopausal State_Post,Inferred Menopausal State_Pre,Integrative Cluster_1,Integrative Cluster_10,Integrative Cluster_2,Integrative Cluster_3,Integrative Cluster_4ER+,Integrative Cluster_4ER-,Integrative Cluster_5,Integrative Cluster_6,Integrative Cluster_7,Integrative Cluster_8,Integrative Cluster_9,Primary Tumor Laterality_Left,Primary Tumor Laterality_Right,PR Status_Negative,PR Status_Positive,Radio Therapy_NO,Radio Therapy_YES,3-Gene classifier subtype_ER+/HER2- High Prolif,3-Gene classifier subtype_ER+/HER2- Low Prolif,3-Gene classifier subtype_ER-/HER2-,3-Gene classifier subtype_HER2+
1184,70.59,3.0,3.0,0.0,5.0,4.05,221.6,6.537589,25.0,2.0,False,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0
1135,76.37,3.0,2.0,0.0,7.0,3.04,221.766667,9.152624,20.0,1.0,False,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0
1141,75.27,3.0,3.0,6.0,11.0,6.064,65.566667,13.075177,32.0,2.0,False,0,1,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0
1462,59.02,3.0,2.0,0.0,5.0,3.07,110.466667,6.537589,35.0,2.0,True,0,1,1,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0
710,50.21,2.0,1.0,0.0,9.0,2.036,123.9,11.767659,18.0,1.0,True,1,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0


In [None]:
from lifelines import CoxPHFitter
cph = CoxPHFitter(penalizer=0.001)
cph.fit(X_train, duration_col = "Overall Survival (Months)", event_col = "Censorship")
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'Overall Survival (Months)'
event col,'Censorship'
penalizer,0.001
l1 ratio,0.0
baseline estimation,breslow
number of observations,874
number of events observed,379
partial log-likelihood,-1923.08
time fit was run,2022-12-24 08:46:20 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
Age at Diagnosis,-0.0,1.0,0.01,-0.02,0.01,0.98,1.01,0.0,-0.28,0.78,0.36
Cohort,-0.63,0.53,0.09,-0.8,-0.47,0.45,0.63,0.0,-7.4,<0.005,42.69
Neoplasm Histologic Grade,0.52,1.68,0.18,0.17,0.87,1.19,2.39,0.0,2.93,<0.005,8.19
Lymph nodes examined positive,0.05,1.05,0.03,-0.01,0.1,0.99,1.11,0.0,1.77,0.08,3.72
Mutation Count,-0.07,0.94,0.17,-0.41,0.28,0.67,1.32,0.0,-0.38,0.71,0.5
Nottingham prognostic index,-0.43,0.65,0.15,-0.73,-0.12,0.48,0.88,0.0,-2.77,0.01,7.46
TMB (nonsynonymous),0.01,1.02,0.13,-0.25,0.28,0.78,1.32,0.0,0.11,0.91,0.13
Tumor Size,-0.0,1.0,0.01,-0.01,0.01,0.99,1.01,0.0,-0.21,0.84,0.26
Tumor Stage,0.23,1.26,0.14,-0.04,0.5,0.96,1.65,0.0,1.69,0.09,3.47
Type of Breast Surgery_BREAST CONSERVING,0.06,1.06,1.53,-2.95,3.06,0.05,21.43,0.0,0.04,0.97,0.04

0,1
Concordance,0.72
Partial AIC,3948.17
log-likelihood ratio test,196.47 on 51 df
-log2(p) of ll-ratio test,60.52
