In [1]:
import pandas as pd
import numpy as np

In [2]:
# Root folder of the MIMIC-IV tables, given as a path relative to the working
# directory. The data-loading cell reads CSVs from its `core/` and `hosp/` subfolders.
mimiciv_dir = 'data/mimic-iv-1.0'

In [3]:
# Load every input table used by this notebook.
hosp_dir = mimiciv_dir + '/hosp'

# Cohort of admissions with outcome labels (produced outside this notebook).
cohort = pd.read_csv('../phase1_teamB/final_cohort_with_outcome_labels.csv')

# MIMIC-IV tables: patient info, per-admission ICD diagnoses, ICD code dictionary.
df_patient_info = pd.read_csv(mimiciv_dir + '/core/patients.csv', low_memory=False)
df_icds = pd.read_csv(hosp_dir + '/diagnoses_icd.csv', low_memory=False)
df_codes = pd.read_csv(hosp_dir + '/d_icd_diagnoses.csv', low_memory=False)

# ICD code -> Elixhauser comorbidity lookup; the codes in this table come from
# https://cran.r-project.org/web/packages/comorbidity/vignettes/comorbidityscores.html
elixhauser_table = pd.read_csv('data/elixhauser_table.csv', low_memory=False)

In [4]:
# Step 1: left-join patient info onto the cohort so every cohort row is kept.
cohort_with_patients = cohort.merge(df_patient_info, on=['subject_id'], how='left')

# Step 2: build the diagnosis -> comorbidity table. The ICD dictionary is first
# extended with the Elixhauser category, then joined onto the recorded diagnoses.
icd_keys = ['icd_code', 'icd_version']
code_lookup = df_codes.merge(elixhauser_table, on=icd_keys, how='left')
codes_with_comorbidity = df_icds.merge(code_lookup, on=icd_keys, how='left')

# Step 3: combine both, giving one row per (cohort admission, diagnosis) pair.
merged = cohort_with_patients.merge(
    codes_with_comorbidity,
    on=['subject_id', 'hadm_id'],
    how='left',
)
merged.head()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,...,gender,anchor_age,anchor_year,anchor_year_group,dod,seq_num,icd_code,icd_version,long_title,comorbidity
0,0,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,F,73.0,2186.0,2008 - 2010,,16.0,44021,9.0,Atherosclerosis of native arteries of the extr...,Peripheral vascular disorders
1,0,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,F,73.0,2186.0,2008 - 2010,,2.0,5854,9.0,"Chronic kidney disease, Stage IV (severe)",Renal failure
2,0,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,F,73.0,2186.0,2008 - 2010,,5.0,27800,9.0,"Obesity, unspecified",Obesity
3,0,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,F,73.0,2186.0,2008 - 2010,,20.0,V8532,9.0,"Body Mass Index 32.0-32.9, adult",
4,0,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,F,73.0,2186.0,2008 - 2010,,1.0,42823,9.0,Acute on chronic systolic heart failure,Congestive heart failure


In [5]:
# Build 0/1 indicator columns for gender, ethnicity, and insurance.
# Each spec is (new column, source column, value that maps to 1). The list order
# is the order in which the columns are appended to `merged`. A row whose value
# matches none of the listed levels gets 0 in every indicator of that group.
indicator_specs = [
    ('gender_female', 'gender', 'F'),
    ('gender_male', 'gender', 'M'),
    ('eth_amin', 'ethnicity', 'AMERICAN INDIAN/ALASKA NATIVE'),
    ('eth_asian', 'ethnicity', 'ASIAN'),
    ('eth_black', 'ethnicity', 'BLACK/AFRICAN AMERICAN'),
    ('eth_hispanic', 'ethnicity', 'HISPANIC/LATINO'),
    ('eth_other', 'ethnicity', 'OTHER'),
    ('eth_white', 'ethnicity', 'WHITE'),
    ('eth_unknown', 'ethnicity', 'UNKNOWN'),
    ('insurance_medicaid', 'insurance', 'Medicaid'),
    ('insurance_medicare', 'insurance', 'Medicare'),
    ('insurance_other', 'insurance', 'Other'),
]
for indicator_name, source_column, level in indicator_specs:
    merged[indicator_name] = np.where(merged[source_column] == level, 1, 0)

In [6]:
# Discard the columns that are not carried forward as features. The categorical
# source columns (gender, ethnicity, insurance) go too, since the indicator
# columns created from them are kept.
columns_to_drop = [
    'Unnamed: 0',
    'deathtime',
    'admission_type',
    'admission_location',
    'discharge_location',
    'insurance',
    'language',
    'marital_status',
    'ethnicity',
    'edregtime',
    'edouttime',
    'hospital_expire_flag',
    'chronic_dialysis',
    'heart_failure',
    'readmission_num',
    '48h_hf',
    '14d_hf',
    '30d_hf',
    'er_hf',
    '48h',
    '14d',
    '30d',
    'er',
    'gender',
    'anchor_year',
    'anchor_year_group',
    'dod',
    'seq_num',
    'icd_code',
    'icd_version',
    'long_title',
]
merged = merged.drop(labels=columns_to_drop, axis='columns')
merged.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,anchor_age,comorbidity,gender_female,gender_male,eth_amin,eth_asian,eth_black,eth_hispanic,eth_other,eth_white,eth_unknown,insurance_medicaid,insurance_medicare,insurance_other
0,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,73.0,Peripheral vascular disorders,1,0,0,0,1,0,0,0,0,0,1,0
1,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,73.0,Renal failure,1,0,0,0,1,0,0,0,0,0,1,0
2,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,73.0,Obesity,1,0,0,0,1,0,0,0,0,0,1,0
3,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,73.0,,1,0,0,0,1,0,0,0,0,0,1,0
4,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,73.0,Congestive heart failure,1,0,0,0,1,0,0,0,0,0,1,0


In [7]:
# comorbidity calculations
#
# For every admission (hadm_id) three Elixhauser comorbidity measures are computed:
#   eci_count          - number of distinct comorbidity categories recorded
#   ahrq_score         - sum of the AHRQ weights of those categories
#   vanWalraven_score  - sum of the van Walraven weights of those categories

# Points per comorbidity category as (AHRQ weight, van Walraven weight).
# Keys must match the labels in the `comorbidity` column exactly; a label that is
# not listed here contributes 0 to both scores but is still counted in eci_count.
ECI_WEIGHTS = {
    'Congestive heart failure': (9, 7),
    'Cardiac arrhythmias': (0, 5),
    'Valvular disease': (0, -1),
    'Pulmonary circulation disorders': (6, 4),
    'Peripheral vascular disorders': (3, 2),
    'Hypertension, uncomplicated': (-1, 0),  # also covers the complicated form, see below
    'Paralysis': (5, 7),
    'Other neurological disorders': (5, 6),
    'Chronic pulmonary disease': (3, 3),
    'Diabetes, uncomplicated': (0, 0),
    'Diabetes, complicated': (-3, 0),
    'Hypothyroidism': (0, 0),
    'Renal failure': (6, 5),
    'Liver disease': (4, 11),
    'Peptic ulcer disease, excluding bleeding': (0, 0),
    'AIDS/HIV': (0, 0),
    'Lymphoma': (6, 9),
    'Metastatic cancer': (14, 12),
    'Solid tumour without metastasis': (7, 4),
    'Rheumatoid arthritis/collagen vascular diseases': (0, 0),
    'Coagulopathy': (11, 3),
    'Obesity': (-5, -4),
    'Weight loss': (9, 6),
    'Fluid and electrolyte disorders': (11, 5),
    'Blood loss anaemia': (-3, -2),
    'Deficiency anaemia': (-2, -2),
    'Alcohol abuse': (-1, 0),
    'Drug abuse': (-7, -7),
    'Psychoses': (-5, 0),
    'Depression': (-5, -3),
}

# Both hypertension labels form ONE scoring category: the weight is applied once
# even when an admission carries both. (They remain two labels in eci_count.)
ECI_SCORING_ALIASES = {'Hypertension, complicated': 'Hypertension, uncomplicated'}

hadms = merged.hadm_id.unique()

# Collect the distinct comorbidity labels of every admission in a single pass.
# Missing labels (diagnoses with no Elixhauser category, or admissions without any
# diagnosis row) are removed up front, so the count below never relies on a NaN
# being present. Subtracting 1 unconditionally under-counted admissions whose
# diagnoses were all mapped to a category.
labels_by_hadm = (
    merged.dropna(subset=['comorbidity'])
          .groupby('hadm_id')['comorbidity']
          .unique()
          .to_dict()
)

comorbidity_counts = []  # basic calculation, count of number of comorbidities
ahrq_scores = []  # using ahrq weighting
vanWalraven_scores = []  # using van Walraven weighting
for hadm in hadms:
    # Admissions with no mapped comorbidity are absent from the lookup -> empty set.
    comorbidities = set(labels_by_hadm.get(hadm, ()))
    comorbidity_counts.append(len(comorbidities))

    # Collapse aliased labels so each scoring category is weighted at most once.
    scored_categories = {ECI_SCORING_ALIASES.get(label, label) for label in comorbidities}
    ahrq_scores.append(sum(ECI_WEIGHTS.get(label, (0, 0))[0] for label in scored_categories))
    vanWalraven_scores.append(sum(ECI_WEIGHTS.get(label, (0, 0))[1] for label in scored_categories))

# create df of hadms to the 3 measures of Elixhauser comorbidity index
df_eci = pd.DataFrame({
    'hadm_id': hadms,
    'eci_count': comorbidity_counts,
    'ahrq_score': ahrq_scores,
    'vanWalraven_score': vanWalraven_scores,
})
    

In [8]:
# Attach the three ECI measures to every row by admission id, then remove the
# per-diagnosis `comorbidity` label and the `subject_id` column.
merged = (
    merged
    .merge(df_eci, on=['hadm_id'], how='left')
    .drop(columns=['comorbidity', 'subject_id'])
)

# Collapse to a single row per admission, keeping the first row seen for each
# hadm_id (the surviving rows retain their original index labels).
demographic_comorbidity_features = merged.drop_duplicates(subset=['hadm_id'], keep='first')
demographic_comorbidity_features.head()

Unnamed: 0,hadm_id,admittime,dischtime,anchor_age,gender_female,gender_male,eth_amin,eth_asian,eth_black,eth_hispanic,eth_other,eth_white,eth_unknown,insurance_medicaid,insurance_medicare,insurance_other,eci_count,ahrq_score,vanWalraven_score
0,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,73.0,1,0,0,0,1,0,0,0,0,0,1,0,6,12,10
21,25911675,2191-05-23 15:33:00,2191-05-24 17:14:00,73.0,1,0,0,0,1,0,0,0,0,0,1,0,6,12,12
43,29659838,2191-07-16 14:21:00,2191-07-19 13:03:00,73.0,1,0,0,0,1,0,0,0,0,0,1,0,4,12,11
59,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,73.0,1,0,0,0,1,0,0,0,0,0,0,1,6,17,13
81,24065018,2128-03-17 14:53:00,2128-03-19 16:25:00,87.0,1,0,0,0,0,0,0,1,0,0,1,0,5,28,23


In [9]:
# Write the per-admission feature table to the working directory.
# The DataFrame index is written as the first (unnamed) CSV column.
output_csv = 'demographic_comorbidity_features.csv'
demographic_comorbidity_features.to_csv(output_csv)