# 1

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Read the raw table from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='synthetic_data_lung_cancer.csv')

# Summary statistics of the numeric columns, transposed so that each
# column of `data` becomes one row of the displayed table.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SUBJECT_ID,560971.0,478.783759,274.017435,1.0,242.0,471.0,701.0,984.0
TIME,560971.0,4.16022,2.372683,1e-06,2.166878,4.220659,6.15983,16.866712


In [2]:
# Reproducible sample (seed 1) of 50 distinct DEFINITION_ID values,
# shown as a plain list to get a feel for the identifier format.
sample_definition_ids = (
    data['DEFINITION_ID']
    .drop_duplicates()
    .sample(n=50, random_state=1)
)
sample_definition_ids.to_list()

['procedure_235',
 'observation_151',
 'condition_1203',
 'measurement_1024',
 'condition_2200',
 'condition_1057',
 'condition_1870',
 'condition_915',
 'procedure_462',
 'measurement_1220',
 'observation_162',
 'condition_204',
 'drug_379',
 'measurement_1160',
 'measurement_1005',
 'condition_1231',
 'condition_840',
 'measurement_1159',
 'observation_122',
 'measurement_411',
 'condition_744',
 'condition_1462',
 'measurement_90',
 'condition_455',
 'condition_2179',
 'measurement_478',
 'measurement_64',
 'observation_32',
 'observation_7',
 'condition_359',
 'condition_2184',
 'condition_322',
 'procedure_372',
 'condition_2291',
 'procedure_254',
 'measurement_667',
 'condition_1052',
 'procedure_137',
 'measurement_896',
 'drug_156',
 'procedure_404',
 'measurement_1034',
 'measurement_962',
 'condition_1966',
 'condition_623',
 'condition_1885',
 'drug_277',
 'procedure_84',
 'condition_1815',
 'condition_2345']

In [3]:
# Category = the text before the first underscore of DEFINITION_ID
# (e.g. 'condition_1203' -> 'condition'). The vectorised .str accessor
# replaces a per-row Python lambda and leaves a missing ID as NaN
# instead of raising AttributeError.
data['CATEGORY'] = data['DEFINITION_ID'].str.split('_', n=1).str[0]

# One row per subject, one column per category; each cell is the number of
# records of that category for that subject, with absent pairs filled by 0.
category_counts = data.groupby(['SUBJECT_ID', 'CATEGORY']).size().unstack(fill_value=0)
category_counts.head()

CATEGORY,condition,death,drug,measurement,observation,procedure
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,261,0,195,33,48,38
2,95,1,45,88,78,57
3,144,0,59,561,69,95
4,30,1,11,385,22,50
6,30,0,8,131,34,26


In [4]:
# Build the per-subject survival frame: category counts + duration + event flag.

# Time of each subject's last record; this is the censoring time for
# subjects without a death record.
last_record_time = data.groupby('SUBJECT_ID')['TIME'].max()

# Time of each subject's (first) death record. Using this rather than the
# overall maximum keeps any record stamped after the death record from
# inflating the observed survival time.
# NOTE(review): identical to the previous result whenever the death record
# is the subject's last record -- confirm against the data generator.
death_time = (
    data.loc[data['CATEGORY'] == 'death']
    .groupby('SUBJECT_ID')['TIME']
    .min()
)

# Death time where one exists, last-record time otherwise. Reindexing onto
# last_record_time's index keeps one entry per subject and the index name
# that the merge below relies on.
time_to_event = death_time.reindex(last_record_time.index).fillna(last_record_time)

cox_data = category_counts.merge(time_to_event, on='SUBJECT_ID')

# event = 1 when the subject has at least one death record, else 0. The raw
# death count is then dropped so it is not used as a covariate.
cox_data['event'] = (cox_data['death'] > 0).astype(int)
cox_data = cox_data.drop(columns=['death'])
cox_data.head()

Unnamed: 0_level_0,condition,drug,measurement,observation,procedure,TIME,event
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,261,195,33,48,38,4.932455,0
2,95,45,88,78,57,2.518113,1
3,144,59,561,69,95,3.994337,0
4,30,11,385,22,50,4.002734,1
6,30,8,131,34,26,5.894177,0


In [5]:
pip install lifelines

Note: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn.model_selection import train_test_split
from lifelines import CoxPHFitter

# Fit a Cox proportional-hazards model with TIME as the duration and `event`
# as the event indicator; every other column is treated as a covariate.
#
# A later cell writes the model's own predictions into
# cox_data['risk_scores']. Drop that column when it is present so that
# re-running this cell never fits the model on its own output.
cox_features = cox_data.drop(columns=['risk_scores'], errors='ignore')

cox_model = CoxPHFitter()
cox_model.fit(cox_features, duration_col='TIME', event_col='event')

# Keep the coefficient table under its own name and display it.
cox_model_summary = cox_model.summary
cox_model_summary

Unnamed: 0_level_0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
condition,0.004508,1.004518,0.001907,0.000771,0.008245,1.000771,1.00828,0.0,2.364195,0.01806929,5.790317
drug,-0.006004,0.994014,0.002804,-0.011499,-0.000509,0.988566,0.999491,0.0,-2.141628,0.03222341,4.955747
measurement,-0.003044,0.996961,0.000541,-0.004104,-0.001983,0.995904,0.998019,0.0,-5.62531,1.851749e-08,25.686536
observation,-0.012518,0.98756,0.002761,-0.017929,-0.007107,0.98223,0.992918,0.0,-4.534143,5.783777e-06,17.399557
procedure,-0.002372,0.997631,0.003959,-0.010132,0.005388,0.989919,1.005403,0.0,-0.59907,0.5491265,0.864789


In [7]:
from lifelines.utils import concordance_index

# Store the partial hazard that cox_model predicts for every row of cox_data.
cox_data['risk_scores'] = cox_model.predict_partial_hazard(cox_data)

# The hazard is negated before scoring: a higher hazard should pair with a
# shorter TIME, whereas the score passed here is compared in the same
# direction as the event times.
# NOTE(review): the scored rows are the rows of cox_data itself; if the model
# was fitted on this same frame, this is an in-sample estimate.
c_index = concordance_index(
    event_times=cox_data['TIME'],
    predicted_scores=-cox_data['risk_scores'],
    event_observed=cox_data['event'],
)
print(f"Concordance index: {c_index}")


Concordance index: 0.8473339870164999


In [8]:
# Preview the first five rows, now including the risk_scores column.
cox_data.head(n=5)

Unnamed: 0_level_0,condition,drug,measurement,observation,procedure,TIME,event,risk_scores
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,261,195,33,48,38,4.932455,0,3.982037
2,95,45,88,78,57,2.518113,1,2.575569
3,144,59,561,69,95,3.994337,0,0.715811
4,30,11,385,22,50,4.002734,1,1.955731
6,30,8,131,34,26,5.894177,0,3.930013


In [9]:
# Replace the 'event' indicator with an identical column named 'death'.
# The new column is appended, so 'death' ends up as the last column.
cox_data = (
    cox_data
    .assign(death=cox_data['event'])
    .drop(columns=['event'])
)

In [10]:
# Show cox_data after the 'event' column was replaced by 'death'.
cox_data

Unnamed: 0_level_0,condition,drug,measurement,observation,procedure,TIME,risk_scores,death
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,261,195,33,48,38,4.932455,3.982037,0
2,95,45,88,78,57,2.518113,2.575569,1
3,144,59,561,69,95,3.994337,0.715811,0
4,30,11,385,22,50,4.002734,1.955731,1
6,30,8,131,34,26,5.894177,3.930013,0
...,...,...,...,...,...,...,...,...
975,13,0,22,6,9,0.288566,7.866851,0
976,11,0,63,14,29,0.273975,5.937313,0
979,2,0,62,1,0,0.094755,7.208420,0
981,4,0,1,6,7,3.149692,8.090989,1
