In [None]:
# Install packages (uncomment and run once if they are missing from the kernel environment)

# pip install comorbidipy
# pip install icd-mappings

In [43]:
# Import packages

import pandas as pd
import numpy as np
from comorbidipy import comorbidity
from icdmappings import Mapper

In [44]:
# Load the existing training table from CSV, then report its dimensions
# and preview the first rows.

data = pd.read_csv(
    filepath_or_buffer='../data/MIMIC-ED/cox_timevarying_with_all_labs_train_v2.csv',
)
print(data.shape)
data.head()

(22714, 41)


Unnamed: 0,stay_id,start,stop,event,temperature,heartrate,resprate,o2sat,sbp,dbp,...,gsn_61716.0,lab_Creatinine,lab_Platelet Count,lab_Absolute Neutrophil Count,lab_C-Reactive Protein,lab_INR(PT),lab_PTT,lab_Fibrinogen,"lab_Bilirubin, Total",lab_D-Dimer
0,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,
1,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,
2,30005196,0.15,0.4,0,97.8,85.0,29.0,95.0,109.0,50.0,...,0,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,
3,30005196,0.4,0.6,0,97.8,84.0,28.0,95.0,122.0,60.0,...,0,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,
4,30005196,0.6,0.933333,0,97.9,83.0,22.0,95.0,107.0,52.0,...,0,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,


In [45]:
# Read in diagnosis data

# icd_code is pinned to string instead of being left to type inference:
# ICD-9 codes such as "07070" carry leading zeros, and read_csv's default
# chunked inference can yield mixed types (numeric-looking chunks parsed as
# integers), which would silently drop those zeros and break the code lookup.
diagnoses = pd.read_csv(
    '../data/MIMIC-ED/ed/diagnosis.csv',
    dtype={'icd_code': str},
)
print(diagnoses.shape)
diagnoses.head()

(899050, 6)


Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,10000032,32952584,1,4589,9,HYPOTENSION NOS
1,10000032,32952584,2,07070,9,UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...
2,10000032,32952584,3,V08,9,ASYMPTOMATIC HIV INFECTION
3,10000032,33258284,1,5728,9,"OTH SEQUELA, CHR LIV DIS"
4,10000032,33258284,2,78959,9,OTHER ASCITES


In [46]:
# Map diagnosis data ICD codes so everything is ICD 10

mapper = Mapper()


def icd_map(val):
    """Translate a single ICD-9 code to ICD-10 using the shared `mapper`.

    Whatever `mapper.map` returns for `val` is passed through unchanged
    (including any null it may return for a code it cannot translate --
    TODO confirm against the icd-mappings documentation).
    """
    return mapper.map(val, source='icd9', target='icd10')   # ICD 9 to ICD 10


# Rows already coded as ICD-10 keep their code as-is.
is_icd10 = diagnoses['icd_version'] == 10

# Translate each distinct non-ICD-10 code exactly once instead of calling the
# mapper for every row (including ICD-10 rows whose result would be discarded).
icd9_lookup = {
    code: icd_map(code)
    for code in diagnoses.loc[~is_icd10, 'icd_code'].unique()
}

diagnoses['icd_10'] = np.where(
    is_icd10,
    diagnoses['icd_code'],
    diagnoses['icd_code'].map(icd9_lookup)
)

print(diagnoses.shape)
diagnoses.head()

(899050, 7)


Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title,icd_10
0,10000032,32952584,1,4589,9,HYPOTENSION NOS,I959
1,10000032,32952584,2,07070,9,UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...,B1920
2,10000032,32952584,3,V08,9,ASYMPTOMATIC HIV INFECTION,Z21
3,10000032,33258284,1,5728,9,"OTH SEQUELA, CHR LIV DIS",K7290
4,10000032,33258284,2,78959,9,OTHER ASCITES,R188


In [47]:
# Attach the ICD-10 diagnosis codes to the training rows.
# Left join on stay_id: every training row is kept, and it is repeated once
# for each diagnosis row that shares its stay_id.

data_with_diagnoses = data.merge(
    diagnoses[['stay_id', 'icd_10']],
    on='stay_id',
    how='left',
)
print(data_with_diagnoses.shape)
data_with_diagnoses.head()

(64579, 42)


Unnamed: 0,stay_id,start,stop,event,temperature,heartrate,resprate,o2sat,sbp,dbp,...,lab_Creatinine,lab_Platelet Count,lab_Absolute Neutrophil Count,lab_C-Reactive Protein,lab_INR(PT),lab_PTT,lab_Fibrinogen,"lab_Bilirubin, Total",lab_D-Dimer,icd_10
0,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,,A419
1,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,,N390
2,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,,A419
3,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,,N390
4,30005196,0.15,0.4,0,97.8,85.0,29.0,95.0,109.0,50.0,...,1.454545,230.269231,6.096364,,1.093333,33.36,,1.75,,A419


In [48]:
# Compute Charlson comorbidity columns from the (stay_id, icd_10) pairs

# Comorbidity columns that later cells fill with 0.0 where no value was joined.
cmb_cols = [
    'aids', 'ami', 'canc', 'cevd', 'chf', 'copd', 'dementia', 'diab', 'diabwc',
    'hp', 'metacanc', 'mld', 'msld', 'pud', 'pvd', 'rend', 'rheumd',
    'comorbidity_score',
]

# No age column is supplied (age=None); codes are interpreted as ICD-10.
cmb_df = comorbidity(
    data_with_diagnoses.loc[:, ['stay_id', 'icd_10']],
    id='stay_id',
    code='icd_10',
    age=None,
    score='charlson',
    icd='icd10',
)

print(cmb_df.shape)
cmb_df.head()

(4566, 19)


Unnamed: 0,stay_id,aids,ami,canc,cevd,chf,copd,dementia,diab,diabwc,hp,metacanc,mld,msld,pud,pvd,rend,rheumd,comorbidity_score
0,30005196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30008310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30009370,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,30010477,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,30013802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Join the comorbidity columns onto the original training rows (left join on
# stay_id), then replace missing comorbidity values with 0.0 so rows whose
# stay_id has no match in cmb_df still carry numeric values.

data_with_cmb = (
    data
    .merge(cmb_df, on='stay_id', how='left')
    .fillna(value=dict.fromkeys(cmb_cols, 0.0))
)

print(data_with_cmb.shape)
data_with_cmb.head()

(22714, 59)


Unnamed: 0,stay_id,start,stop,event,temperature,heartrate,resprate,o2sat,sbp,dbp,...,diabwc,hp,metacanc,mld,msld,pud,pvd,rend,rheumd,comorbidity_score
0,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30005196,0.15,0.4,0,97.8,85.0,29.0,95.0,109.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30005196,0.4,0.6,0,97.8,84.0,28.0,95.0,122.0,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30005196,0.6,0.933333,0,97.9,83.0,22.0,95.0,107.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Adding ClinicalBERT data

cbert_data = pd.read_parquet('../data/MIMIC-ED/chiefcomplaint_with_embeddings.parquet')
cbert_data = cbert_data[['stay_id','clinicalbert_emb']]

# Size the zero-filled placeholder from the first non-null embedding, selected
# by position. A label lookup (`[0]`) depends on the parquet file having a
# default 0..n-1 index and on the first row's embedding being present.
non_null_embeddings = cbert_data['clinicalbert_emb'].dropna()
if non_null_embeddings.empty:
    raise ValueError('No non-null clinicalbert_emb values found; cannot infer embedding size')
num_features = len(non_null_embeddings.iloc[0])
empty_array = np.zeros(num_features)


def _embedding_is_missing(emb):
    """Return True if `emb` is None, a float NaN, or an ndarray containing any NaN."""
    if emb is None:
        return True
    if isinstance(emb, float):
        return bool(pd.isna(emb))
    if isinstance(emb, np.ndarray):
        return bool(pd.isna(emb).any())
    return False


# Keeping only one entry per stay_id and cbert pair
cbert_data_grouped = cbert_data.groupby('stay_id').first().reset_index()

# Left join to existing data and fill na values
data_with_cmb_cbert = data_with_cmb.merge(cbert_data_grouped, how='left', on='stay_id')
# Each missing embedding gets its own zero vector (a copy, so rows never share
# one mutable array).
data_with_cmb_cbert["clinicalbert_emb"] = (
    data_with_cmb_cbert["clinicalbert_emb"]
    .apply(lambda emb: empty_array.copy() if _embedding_is_missing(emb) else emb)
)

print(data_with_cmb_cbert.shape)
data_with_cmb_cbert.head()

(22714, 60)


Unnamed: 0,stay_id,start,stop,event,temperature,heartrate,resprate,o2sat,sbp,dbp,...,hp,metacanc,mld,msld,pud,pvd,rend,rheumd,comorbidity_score,clinicalbert_emb
0,30005196,0.0,0.066667,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,30005196,0.066667,0.15,0,97.8,86.0,16.0,97.0,130.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,30005196,0.15,0.4,0,97.8,85.0,29.0,95.0,109.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,30005196,0.4,0.6,0,97.8,84.0,28.0,95.0,122.0,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,30005196,0.6,0.933333,0,97.9,83.0,22.0,95.0,107.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [81]:
# Sanity check: the joins above must neither add nor drop training rows

assert len(data_with_cmb_cbert) == len(data), 'Number of rows does not match between training data coming in and training data coming out'

In [82]:
# Persist the enriched training data as a parquet file
# (index=False: the DataFrame index is not written to the file)

data_with_cmb_cbert.to_parquet(
    path='../data/MIMIC-ED/cox_timevarying_with_labs_cmorbid_cbert.parquet',
    index=False,
)