In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
mimiciv_dir = 'C:/Users/suzie/Dropbox (MIT)/Spring 2021/6.871 Machine Learning for Healthcare/mimic-iv-1.0/hosp'

In [3]:
mimiciv_dir = 'data'

## Part I: read data

In [4]:
df_event = pd.read_csv(f'{mimiciv_dir}/diagnoses_icd.csv', low_memory=False)

In [5]:
df_code = pd.read_csv(f'{mimiciv_dir}/d_icd_diagnoses.csv', low_memory=False)

In [6]:

#merge tables:
df_sub = pd.merge(df_event, df_code, on = ['icd_code','icd_version'], how='left')

#convert to lower case:
df_sub['long_title'] = df_sub['long_title'].str.lower()

df_sub = df_sub.dropna(subset=['icd_code','icd_version','hadm_id','subject_id','long_title'], how='any')

df_sub.head(5)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,long_title
0,15734973,20475282,3,2825,9,sickle-cell trait
1,15734973,20475282,2,V0251,9,carrier or suspected carrier of group b strept...
2,15734973,20475282,5,V270,9,"outcome of delivery, single liveborn"
3,15734973,20475282,1,64891,9,other current conditions classifiable elsewher...
4,15734973,20475282,4,66481,9,"other specified trauma to perineum and vulva, ..."


## Part II: ICD Code Cohorting

In [7]:
#Filter to heart failure: 
#Filter 1: long title contains "heart failure" AND long title doesnt contain without heart failure
# this paper: https://academic.oup.com/aje/article/183/5/462/2462157?login=true used almost all of these for ADHF
df_f1 = df_sub[df_sub['long_title'].str.contains('heart failure')]
df_f1 = df_f1[~df_f1['long_title'].str.contains('without heart failure')]
# df_f1.head(5)

In [8]:
df_f1_hadm = df_f1[['subject_id','hadm_id','long_title', 'icd_code','icd_version']].drop_duplicates(subset=['hadm_id'])
print(np.shape(df_f1_hadm)[0])
n1 = np.shape(df_f1_hadm)[0]
# df_f1_hadm.head(5)

64686


In [9]:
# Filter 2: Filter 1 + exclude "XX disease with heart failure" format (only keep HF as main target in title)
df_f2 = df_f1[~df_f1['long_title'].str.contains('disease')]
df_f2_hadm = df_f2[['subject_id','hadm_id','long_title', 'icd_code','icd_version']].drop_duplicates(subset=['hadm_id'])
print(np.shape(df_f2_hadm)[0])
hf2_icd = df_f2[['long_title', 'icd_code','icd_version']].drop_duplicates(subset=['icd_code']) #.sort_values('icd_code')
hf2_icd
n2 = np.shape(df_f2_hadm)[0]

64566


In [10]:
# Filter 5: Filter 1+2+ Use I50+ and 428+
# source: https://icd.codes/icd10cm/I503 "The ICD code I50 is used to code Acute decompensated heart failure"
df_f5 = df_f2[df_f2['icd_code'].str.contains('428') | df_f2['icd_code'].str.contains('I50')]
df_f5_hadm = df_f5[['subject_id','hadm_id','long_title', 'icd_code','icd_version']].drop_duplicates(subset=['hadm_id'])
n5=(np.shape(df_f5_hadm)[0])
n5

64409

In [11]:
# Filter 6: Filter 1+2+5+ exclude acute only, exclude chronic only, keep acute on chronic or specified
# source: NA_CARE_Prov_ICD_10_CM_Documentation_and_Coding_Best_Practices_Heart_Failure_v3_ENG_2019 pdf
df_f6 = df_f5[~df_f5['long_title'].str.contains('acute')  | df_f5['long_title'].str.contains('acute on chronic') ]
df_f6 = df_f6[~df_f6['long_title'].str.contains('chronic')  | df_f6['long_title'].str.contains('acute on chronic') ]
df_f6_hadm = df_f6[['subject_id','hadm_id','long_title', 'icd_code','icd_version']].drop_duplicates(subset=['hadm_id'])
n6 = (np.shape(df_f6_hadm)[0])
n6

51330

In [12]:
# Filter 7: Filter 1+2+5+6 exclude end stage
# It is important to distinguish between AdHF and end-stage HF
# source https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6963179/
df_f7 = df_f6[~df_f6['long_title'].str.contains('end stage')]
df_f7_hadm = df_f7[['subject_id','hadm_id','long_title', 'icd_code','icd_version']].drop_duplicates(subset=['hadm_id'])
n7 = (np.shape(df_f7_hadm)[0])
n7

51303

In [13]:
df_f7_hadm

Unnamed: 0,subject_id,hadm_id,long_title,icd_code,icd_version
1161,16477997,25359811,acute on chronic systolic heart failure,42823,9
1223,14959277,28740655,"congestive heart failure, unspecified",4280,9
1435,18894258,28336555,"congestive heart failure, unspecified",4280,9
1662,14974219,20480369,"congestive heart failure, unspecified",4280,9
1677,12849817,27891695,"congestive heart failure, unspecified",4280,9
1787,11508679,25781770,"congestive heart failure, unspecified",4280,9
1840,18151621,25727457,"congestive heart failure, unspecified",4280,9
2520,12711129,26466704,acute on chronic diastolic heart failure,42833,9
2622,18982551,28769191,"congestive heart failure, unspecified",4280,9
2716,13427502,20353250,"congestive heart failure, unspecified",4280,9


## Part III: DRG codes

In [14]:
drgcodes = pd.read_csv(f'{mimiciv_dir}/drgcodes.csv', low_memory=False)

In [15]:
# merge tables
df_ids_icds = pd.merge(df_event, df_code, on = ['icd_code','icd_version'], how='left')
df_merged = pd.merge(df_ids_icds, drgcodes, on = ['hadm_id', 'subject_id'], how='left')
df_merged = df_merged.drop(columns=['drg_severity', 'drg_mortality'])

In [16]:
# drop any rows that don't have a drg_code
df_merged = df_merged.dropna(subset=['drg_code'], how='any')

In [17]:
# select rows where DRG code contains "hf" or "heart failure"
df_sub = df_merged[df_merged['description'].str.contains('hf|heart failure', case=False)]
# there are some drg codes that specify "w/o hf" or "w/o heart failure" - don't include those rows
df_sub = df_sub[~df_sub['description'].str.contains('W/O AMI/HF/SHOCK|W/O AMI, HEART FAILURE OR SHOCK', case=False)]

In [18]:
# remove the hadm_ids that have a long title with heart failure in them, since the ICD codes already account for them
no_hf_icd = []
for hadm_id in df_sub.hadm_id.unique():
    df_hadm_id = df_sub[df_sub['hadm_id'] == hadm_id]
    if len(df_hadm_id[df_hadm_id['long_title'].str.contains('heart failure', case=False)]) == 0:
        no_hf_icd.append(hadm_id)
df_no_hf_icd = df_sub[df_sub['hadm_id'].isin(no_hf_icd)]

In [19]:
# only keep rows where ICD codes include:
#" cardiogenic shock", "left ventricular failure, unspecified", "shock, unspecified", or pleural effusion or fluid overload
df_no_hf_icd = df_no_hf_icd[df_no_hf_icd['long_title'].str.contains('cardiogenic shock|left ventricular failure, unspecified|shock, unspecified|Pleural effusion, not elsewhere classified|Fluid overload, unspecified', case=False)]
df_no_hf_icd_hadm = df_no_hf_icd[['subject_id','hadm_id','long_title', 'icd_code','icd_version', 'description']].drop_duplicates(subset=['hadm_id'])

## Part IV: IV Diuretics

In [20]:
chf_hadm = df_f7_hadm.hadm_id.append(df_no_hf_icd_hadm.hadm_id).unique() # 51322 hadm
len(chf_hadm)

51322

In [21]:
prescriptions = pd.read_csv(f'{mimiciv_dir}/prescriptions.csv', low_memory=False)

In [22]:
df_diuretics = pd.DataFrame({'furosemide': (prescriptions['drug'].str.contains('furosemide', case=False, na=False) | prescriptions['drug'].str.contains('lasix', case=False, na=False)) & ~prescriptions['drug'].str.contains('Desensitization', case=False, na=False), 
                   'bumetanide': (prescriptions['drug'].str.contains('bumetanide', case=False, na=False) | prescriptions['drug'].str.contains('bumex', case=False, na=False)) & ~prescriptions['drug'].str.contains('Desensitization', case=False, na=False), 
                   'chlorothiazide': (prescriptions['drug'].str.contains('chlorothiazide', case=False, na=False) | prescriptions['drug'].str.contains('diuril', case=False, na=False)) & ~prescriptions['drug'].str.contains('Desensitization', case=False, na=False) & ~prescriptions['drug'].str.contains('hydrochlorothiazide', case=False, na=False)})

In [23]:
# prescriptions for diuretics
given_diuretic = df_diuretics.any(axis=1)

In [24]:
# prescriptions with hadm_id associated w heart failure ICD
in_icd = np.isin(prescriptions.hadm_id, chf_hadm)

In [25]:
# prescriptions given via IV
route_iv = prescriptions.route.str.contains('IV', case=False, na=False)

In [26]:
# final cohort with all prescription fields
cohort = prescriptions[given_diuretic & in_icd & route_iv]

In [27]:
# unique drug names to confirm no unwanted drugs are included
cohort.drug.unique()

array(['Furosemide', 'Chlorothiazide Sodium', 'Furosemide-Heart Failure',
       'Chlorothiazide', 'Bumetanide', 'Furosemide (Latex Free)', 'LaSIX',
       'bumex'], dtype=object)

In [28]:
cohort.hadm_id.unique().shape[0] #25691 unique HADM_IDs

25691

In [41]:
cohort.subject_id.unique().shape[0] #14757 unique SUBJECT_IDs

14757

In [30]:
cohort_hadm = cohort.hadm_id.unique()

## Part V: Removing Dialysis Patients 

In [31]:
dialysis_codes = ["Z992", "N186", "5856", "V4511"]

In [32]:
df_event["chronic_dialysis"] = df_event["icd_code"].str.strip().isin(dialysis_codes)

In [33]:
df_event

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,chronic_dialysis
0,15734973,20475282,3,2825,9,False
1,15734973,20475282,2,V0251,9,False
2,15734973,20475282,5,V270,9,False
3,15734973,20475282,1,64891,9,False
4,15734973,20475282,4,66481,9,False
5,11442057,21518990,1,65971,9,False
6,11442057,21518990,2,64231,9,False
7,11442057,21518990,6,V270,9,False
8,11442057,21518990,3,64511,9,False
9,11442057,21518990,5,65961,9,False


## Part VI: Final Cohort

In [34]:
final_cohort = df_event.drop(columns=['seq_num', 'icd_code', 'icd_version'])
final_cohort.head()

Unnamed: 0,subject_id,hadm_id,chronic_dialysis
0,15734973,20475282,False
1,15734973,20475282,False
2,15734973,20475282,False
3,15734973,20475282,False
4,15734973,20475282,False


In [35]:
final_cohort['CHF exacerbation'] = final_cohort['hadm_id'].isin(cohort_hadm)

In [36]:
final_cohort = final_cohort.drop_duplicates()

In [37]:
final_cohort.head()

Unnamed: 0,subject_id,hadm_id,chronic_dialysis,CHF exacerbation
0,15734973,20475282,False,False
5,11442057,21518990,False,False
11,10072949,20817034,False,False
16,13538405,24822466,False,False
21,10287061,27485182,False,False


In [38]:
len(final_cohort)

536165

In [39]:
final_cohort.to_csv('final_cohort.csv')

In [56]:
sum(final_cohort['chronic_dialysis']==False)

521111

In [64]:
final_cohort_selected = final_cohort[(final_cohort['chronic_dialysis']==False) & (final_cohort['CHF exacerbation']==True)]
final_cohort_selected.head()
final_cohort_selected.to_csv('final_cohort_selected.csv')

In [65]:
len(final_cohort_selected)

25691