In [None]:
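# Install packages (one-time setup: uncomment and run the lines below if they are missing;
# note that the `icd-mappings` distribution is imported as `icdmappings`)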

# pip install comorbidipy
# pip install icd-mappings

In [44]:
# Import packages

import pandas as pd
import numpy as np
from comorbidipy import comorbidity
from icdmappings import Mapper

In [45]:
# Load the existing training data (cox_timevarying_train_v2.csv) and preview it

data = pd.read_csv(filepath_or_buffer='../data/MIMIC-ED/cox_timevarying_train_v2.csv')
print(data.shape)  # (rows, columns)
data.head(n=5)

(22714, 32)


Unnamed: 0,stay_id,start,stop,event,temperature,heartrate,resprate,o2sat,sbp,dbp,...,arrival_transport_UNKNOWN,arrival_transport_WALK IN,lactate,wbc,time_since_adm,gsn_16599.0,gsn_43952.0,gsn_4490.0,gsn_66419.0,gsn_61716.0
0,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,False,False,1.6,7.8,0.0,1,0,0,0,0
1,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,False,False,1.6,7.8,0.066667,0,0,0,1,0
2,30005196,0.15,0.4,0,97.8,85.0,29.0,95.0,109.0,50.0,...,False,False,1.6,7.8,0.15,0,0,0,1,0
3,30005196,0.4,0.6,0,97.8,84.0,28.0,95.0,122.0,60.0,...,False,False,1.6,7.8,0.4,0,0,0,1,0
4,30005196,0.6,0.933333,0,97.9,83.0,22.0,95.0,107.0,52.0,...,False,False,1.4,7.8,0.6,0,0,0,1,0


In [46]:
# Read in diagnosis data
#
# ICD codes are identifiers, not numbers: read `icd_code` explicitly as text so
# leading zeros (e.g. '07070') are always preserved and the column's dtype does
# not depend on pandas' type inference over the file.

diagnoses = pd.read_csv('../data/MIMIC-ED/ed/diagnosis.csv', dtype={'icd_code': str})
print(diagnoses.shape)
diagnoses.head()

(899050, 6)


Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,10000032,32952584,1,4589,9,HYPOTENSION NOS
1,10000032,32952584,2,07070,9,UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...
2,10000032,32952584,3,V08,9,ASYMPTOMATIC HIV INFECTION
3,10000032,33258284,1,5728,9,"OTH SEQUELA, CHR LIV DIS"
4,10000032,33258284,2,78959,9,OTHER ASCITES


In [47]:
# Map diagnosis data ICD codes so everything is ICD 10

mapper = Mapper()


def icd_map(val):
    """Translate one ICD-9 code to ICD-10 with the shared `mapper`.

    Args:
        val: A single ICD-9 code.

    Returns:
        Whatever `mapper.map` returns for `val` with source='icd9' and
        target='icd10'.
    """
    return mapper.map(val, source='icd9', target='icd10')


# Rows flagged as version 10 keep their code unchanged; every other row is
# treated as ICD-9 and translated.
is_icd10 = diagnoses['icd_version'] == 10

# Translate each distinct ICD-9 code once instead of calling the mapper for
# every row (including ICD-10 rows, whose translated value would be discarded).
# Missing codes are skipped and simply stay missing in `icd_10`.
icd9_codes = diagnoses.loc[~is_icd10, 'icd_code'].dropna().unique()
icd9_to_icd10 = {code: icd_map(code) for code in icd9_codes}

diagnoses['icd_10'] = diagnoses['icd_code'].where(
    is_icd10,
    diagnoses['icd_code'].map(icd9_to_icd10)
)

print(diagnoses.shape)
diagnoses.head()

(899050, 7)


Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title,icd_10
0,10000032,32952584,1,4589,9,HYPOTENSION NOS,I959
1,10000032,32952584,2,07070,9,UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...,B1920
2,10000032,32952584,3,V08,9,ASYMPTOMATIC HIV INFECTION,Z21
3,10000032,33258284,1,5728,9,"OTH SEQUELA, CHR LIV DIS",K7290
4,10000032,33258284,2,78959,9,OTHER ASCITES,R188


In [48]:
# Attach each stay's ICD-10 codes to the training rows (left join on stay_id).
# A stay with several diagnoses yields one output row per
# (training row, diagnosis) pair, so the result has more rows than `data`.

data_with_diagnoses = data.merge(
    diagnoses[['stay_id', 'icd_10']],
    on='stay_id',
    how='left',
)
print(data_with_diagnoses.shape)
data_with_diagnoses.head()

(64579, 33)


Unnamed: 0,stay_id,start,stop,event,temperature,heartrate,resprate,o2sat,sbp,dbp,...,arrival_transport_WALK IN,lactate,wbc,time_since_adm,gsn_16599.0,gsn_43952.0,gsn_4490.0,gsn_66419.0,gsn_61716.0,icd_10
0,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,False,1.6,7.8,0.0,1,0,0,0,0,A419
1,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,False,1.6,7.8,0.0,1,0,0,0,0,N390
2,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,False,1.6,7.8,0.066667,0,0,0,1,0,A419
3,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,False,1.6,7.8,0.066667,0,0,0,1,0,N390
4,30005196,0.15,0.4,0,97.8,85.0,29.0,95.0,109.0,50.0,...,False,1.6,7.8,0.15,0,0,0,1,0,A419


In [49]:
# Create comorbidity dataframe for each stay

# `data_with_diagnoses` repeats every diagnosis once per training row of its
# stay. The comorbidity columns are per-stay 0/1 indicators, so the repeated
# (stay_id, icd_10) pairs add no information: collapse them to distinct pairs
# before scoring. This also hands `comorbidity` a standalone frame rather than
# a column selection of `data_with_diagnoses`.
stay_codes = (
    data_with_diagnoses[['stay_id', 'icd_10']]
    .drop_duplicates()
    .reset_index(drop=True)
)

cmb_df = comorbidity(stay_codes,
                     id='stay_id',
                     code='icd_10',
                     age=None,
                     score='charlson',
                     icd='icd10')

# Comorbidity columns (flags plus the overall score) that get joined onto the training data
cmb_cols = ['aids','ami','canc','cevd','chf','copd','dementia','diab','diabwc','hp','metacanc','mld','msld','pud','pvd','rend','rheumd','comorbidity_score']

cmb_df.head()

Unnamed: 0,stay_id,aids,ami,canc,cevd,chf,copd,dementia,diab,diabwc,hp,metacanc,mld,msld,pud,pvd,rend,rheumd,comorbidity_score
0,30005196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30008310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30009370,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,30010477,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,30013802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Join the per-stay comorbidity columns back onto the original training rows

data_with_cmb = data.merge(cmb_df, on='stay_id', how='left')

# Any comorbidity value still missing after the left join is set to 0.0
data_with_cmb[cmb_cols] = data_with_cmb[cmb_cols].fillna(0.0)

print(data_with_cmb.shape)
data_with_cmb.head()

(22714, 50)


Unnamed: 0,stay_id,start,stop,event,temperature,heartrate,resprate,o2sat,sbp,dbp,...,diabwc,hp,metacanc,mld,msld,pud,pvd,rend,rheumd,comorbidity_score
0,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30005196,0.15,0.4,0,97.8,85.0,29.0,95.0,109.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30005196,0.4,0.6,0,97.8,84.0,28.0,95.0,122.0,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30005196,0.6,0.933333,0,97.9,83.0,22.0,95.0,107.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Sanity check: adding the comorbidity columns must not add or drop training rows

assert len(data.index) == len(data_with_cmb.index), (
    'Number of rows does not match between training data coming in and training data coming out'
)

In [53]:
# Persist the enriched training data as v3 (the DataFrame index is not written)

data_with_cmb.to_csv(
    path_or_buf='../data/MIMIC-ED/cox_timevarying_train_v3.csv',
    index=False,
)