# Data Loading and Exploration

In [89]:
import pandas as pd
import datetime
from dateutil import parser
import numpy as np

In [90]:
mimiciv_dir = '../../mimic_iv/core/'
hosp_dir = '../../mimic_iv/hosp/'

In [91]:
admissions = pd.read_csv(f'{mimiciv_dir}admissions.csv', low_memory=False)
admissions[:10]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,12427812,21593330,2184-01-06 11:51:00,2184-01-10 11:45:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,,UNKNOWN,,,0
1,14029832,22059088,2120-01-18 01:28:00,2120-01-20 16:13:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,,OTHER,,,0
2,14495017,22484010,2175-01-28 15:41:00,2175-01-29 16:00:00,,DIRECT EMER.,PHYSICIAN REFERRAL,HOME,Other,?,,WHITE,,,0
3,13676048,23865469,2193-01-19 05:27:00,2193-01-24 18:59:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,?,MARRIED,WHITE,,,0
4,13831972,27763544,2131-01-27 04:03:00,2131-01-27 05:39:00,,EU OBSERVATION,EMERGENCY ROOM,,Medicaid,ENGLISH,SINGLE,WHITE,2131-01-26 22:19:00,2131-01-27 05:39:00,0
5,18523038,25414328,2142-08-26 17:14:00,2142-08-27 10:00:00,,DIRECT OBSERVATION,PROCEDURE SITE,,Other,ENGLISH,SINGLE,WHITE,,,0
6,16705931,20580522,2174-10-24 11:30:00,2174-10-24 18:45:00,,DIRECT OBSERVATION,PHYSICIAN REFERRAL,,Other,ENGLISH,MARRIED,WHITE,,,0
7,19963742,24951506,2171-07-04 15:58:00,2171-07-05 13:37:00,,AMBULATORY OBSERVATION,PACU,,Other,ENGLISH,SINGLE,UNKNOWN,,,0
8,10903424,22568585,2181-01-31 13:09:00,2181-01-31 13:42:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0
9,15030422,21975876,2149-09-21 23:54:00,2149-09-22 03:59:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0


In [92]:
patients = pd.read_csv(f'{mimiciv_dir}patients.csv', low_memory=False)
patients[:10]

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10002723,F,0,2128,2017 - 2019,
1,10003939,M,0,2184,2008 - 2010,
2,10004222,M,0,2161,2014 - 2016,
3,10005325,F,0,2154,2011 - 2013,
4,10007338,F,0,2153,2017 - 2019,
5,10008101,M,0,2142,2008 - 2010,
6,10009872,F,0,2168,2014 - 2016,
7,10011333,F,0,2132,2014 - 2016,
8,10011879,M,0,2158,2014 - 2016,
9,10012663,F,0,2171,2011 - 2013,


In [93]:
admissions[admissions.subject_id == 12427812].head(40)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,12427812,21593330,2184-01-06 11:51:00,2184-01-10 11:45:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,,UNKNOWN,,,0
10253,12427812,23948770,2185-01-20 00:08:00,2185-01-21 11:45:00,,EU OBSERVATION,PHYSICIAN REFERRAL,,Other,ENGLISH,MARRIED,WHITE,2185-01-19 18:58:00,2185-01-20 01:34:00,0


In [96]:
d_icd_diagnoses = pd.read_csv(f'{hosp_dir}d_icd_diagnoses.csv', low_memory=False)
d_icd_diagnoses[:10]

Unnamed: 0,icd_code,icd_version,long_title
0,10,9,Cholera due to vibrio cholerae
1,11,9,Cholera due to vibrio cholerae el tor
2,19,9,"Cholera, unspecified"
3,20,9,Typhoid fever
4,21,9,Paratyphoid fever A
5,22,9,Paratyphoid fever B
6,23,9,Paratyphoid fever C
7,29,9,"Paratyphoid fever, unspecified"
8,30,9,Salmonella gastroenteritis
9,31,9,Salmonella septicemia


# Adding Readmission Label

In [97]:
# Sort by subject then time (assumes all admittime, dischtime ranges are exclusive)
admissions = admissions.sort_values(by=["subject_id", "admittime"])

# Convert date strings to datetime objects
admissions['admittime'] = admissions['admittime'].map(parser.parse)
admissions['dischtime'] = admissions['dischtime'].map(parser.parse)

# For temporary ease of understanding, append the array to itself, shifted up one on the right half
admissions_combined = pd.concat([admissions, admissions.shift(-1).rename(lambda x: str(x) + "_shifted", axis="columns")], axis=1)

# Whether or not that visit was before a readmission of that patient (note the last visit is false)
admissions_combined['was_readmitted'] = (admissions_combined['subject_id'] == admissions_combined['subject_id_shifted']) & (admissions_combined['hadm_id']!=admissions_combined['hadm_id_shifted'])

# This creates a new column that's the difference in dates between admissions, regardless of if its the same patient
admissions_combined['readmission_hours'] = (admissions_combined['admittime_shifted'] - admissions_combined['dischtime']).fillna(pd.Timedelta(seconds=-1))
admissions_combined['readmission_hours'] = admissions_combined['readmission_hours'].dt.total_seconds()/60
admissions_combined['readmission_hours'] = admissions_combined['was_readmitted'] * admissions_combined['readmission_hours']
admissions['readmission_hours'] = admissions_combined['readmission_hours']
admissions['was_readmitted'] = admissions_combined['was_readmitted']

# All hadm_ids belonging to people who were readmitted, fitler to last 10 for brevity of printing
admissions[(admissions['was_readmitted']==True) | (admissions.shift(-1)['was_readmitted']==True)][-10:]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,readmission_hours,was_readmitted
187950,19999784,27192150,2120-01-26 00:00:00,2120-02-01 12:15:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,77618.0,True
186117,19999784,27319264,2120-03-26 09:53:00,2120-04-01 13:00:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,81847.0,True
187424,19999784,29324445,2120-05-28 09:07:00,2120-06-02 08:55:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,77783.0,True
168474,19999784,23406899,2120-07-26 09:18:00,2120-07-31 18:15:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,123328.0,True
178803,19999784,29889147,2120-10-25 09:43:00,2120-10-31 09:00:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,131940.0,True
179787,19999784,29956342,2121-01-31 00:00:00,2121-02-05 12:44:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,163958.0,True
187505,19999784,24755486,2121-05-30 09:22:00,2121-06-05 08:48:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,0.0,False
172700,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,,EW EMER.,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,WHITE,2147-07-17 17:18:00,2147-07-18 17:34:00,0,753034.0,True
175560,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,,EW EMER.,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0,0.0,False
387927,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,,EW EMER.,EMERGENCY ROOM,HOME,Other,ENGLISH,WIDOWED,WHITE,2164-07-24 21:16:00,2164-07-25 01:20:00,0,63452.0,True


In [98]:
# adding in an admission number because maybe we only want to look at first readmission
admissions['counter'] = 1
admissions = admissions.sort_values(by=["subject_id", "admittime"])
admissions['readmission_num'] = admissions.groupby(['subject_id']).counter.cumsum() 

In [105]:
# sanity checks to make sure that readmission_num flag is correct
admissions[admissions.subject_id == 12427812][['readmission_num']][-10:]
admissions[admissions.readmission_num>3]
admissions[admissions.subject_id == 10000032].head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,...,ethnicity,edregtime,edouttime,hospital_expire_flag,readmission_hours,was_readmitted,counter,readmission_num,admit_to_emergency,48h_hf
446266,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,...,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0,72072.0,True,1,1,0,0
451986,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,...,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0,37066.0,True,1,2,1,0
423408,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,...,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0,16189.0,True,1,3,1,0
455511,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,...,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0,-0.0,False,1,4,1,0


In [106]:
admissions.readmission_hours.value_counts()

# if readmission_hours is 0, that means they were not readmitted

0.0          259223
1.0            3599
2.0            1078
3.0             360
4.0             196
              ...  
17404.0           1
1113840.0         1
146642.0          1
69613.0           1
130815.0          1
Name: readmission_hours, Length: 188653, dtype: int64

# Adding in Heart Failure Information

In [107]:
##inner join admissions with diagnoses_icd
diagnosis_icd = pd.read_csv(f'{hosp_dir}diagnoses_icd.csv', low_memory=False)
# merge admissions with diagnosis
admissions = pd.merge(left = admissions, right = diagnosis_icd, how = 'inner', on = ['subject_id','hadm_id'], suffixes =('_adm', '_diag'))

#from Group A
chf_icd = pd.read_csv('CHF_ICD.csv', low_memory=False)

# for comparison
chf_icd['icd_code'] = chf_icd['icd_code'].apply(str)
admissions = admissions.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
chf_icd = chf_icd.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
admissions = pd.merge(left = admissions, right = chf_icd, how = 'left', on = ['icd_code', 'icd_version'], indicator = "heart_failure", suffixes= ('_adm', '_diag'))


In [108]:
admissions['heart_failure'] = [1 if x == 'both' else 0 for x in admissions['heart_failure']]
print(admissions['heart_failure'].value_counts())

0    4624521
1      53403
Name: heart_failure, dtype: int64


# Adding in Emergency Department Information

In [109]:
admissions.admission_location.value_counts()

EMERGENCY ROOM                            2216126
PHYSICIAN REFERRAL                        1157531
TRANSFER FROM HOSPITAL                     545127
WALK-IN/SELF REFERRAL                      201537
CLINIC REFERRAL                            117237
TRANSFER FROM SKILLED NURSING FACILITY      74938
PROCEDURE SITE                              66193
PACU                                        46364
INTERNAL TRANSFER TO OR FROM PSYCH          40586
INFORMATION NOT AVAILABLE                    5399
AMBULATORY SURGERY TRANSFER                  2909
Name: admission_location, dtype: int64

In [110]:
admissions['admit_to_emergency'] = np.where(admissions.admission_location.notnull() & admissions.admission_location.str.contains('EMERGENCY'), 1, 0)
admissions[['subject_id', 'admit_to_emergency', 'admission_location']].head()

Unnamed: 0,subject_id,admit_to_emergency,admission_location
0,10000032,0,TRANSFER FROM HOSPITAL
1,10000032,0,TRANSFER FROM HOSPITAL
2,10000032,0,TRANSFER FROM HOSPITAL
3,10000032,0,TRANSFER FROM HOSPITAL
4,10000032,0,TRANSFER FROM HOSPITAL


In [111]:
# sanity check to make sure that admit_to_emergency is only true when admit location is ER
admissions[admissions.admit_to_emergency == 1].admission_location.value_counts(dropna=False)

EMERGENCY ROOM    2216126
Name: admission_location, dtype: int64

In [112]:
admissions.admit_to_emergency.value_counts(dropna=False)

0    2461798
1    2216126
Name: admit_to_emergency, dtype: int64

# Expanding to 8 Output Channels

Output Channels to Add

1) 48h readmission due to HF exacerbation

2) 14-day readmission due to HF exacerbation

3) 30-day readmission due to HF exacerbation

4) ED visit due to HF exacerbation

5) 48h readmission due to any reason

6) 14-day readmission due to any reason

7) 30-day readmission due to any reason

8) ED visit due to any reason


In [113]:
admissions['48h_hf'] = np.where((admissions.readmission_hours <=48) & (admissions.readmission_hours >0), 1, 0)
admissions['48h_hf'] = np.where((admissions.heart_failure == 1) & (admissions['48h_hf'] == 1), 1, 0)
admissions['48h_hf'].value_counts()

0    4677859
1         65
Name: 48h_hf, dtype: int64

In [115]:
admissions['14d_hf'] = np.where((admissions.readmission_hours <=14*24) & (admissions.readmission_hours >0), 1, 0)
admissions['14d_hf'] = np.where((admissions.heart_failure == 1) & (admissions['14d_hf'] == 1), 1, 0)
admissions['14d_hf'].value_counts()

0    4677816
1        108
Name: 14d_hf, dtype: int64

In [116]:
admissions['30d_hf'] = np.where((admissions.readmission_hours <=30*24) & (admissions.readmission_hours >0), 1, 0)
admissions['30d_hf'] = np.where((admissions.heart_failure == 1) & (admissions['30d_hf'] == 1), 1, 0)
admissions['30d_hf'].value_counts()

0    4677683
1        241
Name: 30d_hf, dtype: int64

In [117]:
admissions['er_hf'] = np.where((admissions.heart_failure == 1) & (admissions.admit_to_emergency == 1), 1, 0)
admissions['er_hf'].value_counts()

0    4645412
1      32512
Name: er_hf, dtype: int64

In [118]:
admissions['48h'] = np.where((admissions.readmission_hours <=48) & (admissions.readmission_hours >0), 1, 0)
admissions['48h'].value_counts()

0    4642615
1      35309
Name: 48h, dtype: int64

In [119]:
admissions['14d'] = np.where((admissions.readmission_hours <=14*24)&(admissions.readmission_hours >0), 1, 0)
admissions['14d'].value_counts()

0    4638138
1      39786
Name: 14d, dtype: int64

In [120]:
admissions['30d'] = np.where((admissions.readmission_hours <=30*24)&(admissions.readmission_hours >0), 1, 0)
admissions['30d'].value_counts()

0    4626618
1      51306
Name: 30d, dtype: int64

In [121]:
admissions['er'] = np.where(admissions.admit_to_emergency == 1, 1, 0)
admissions['er'].value_counts()

0    2461798
1    2216126
Name: er, dtype: int64

In [122]:
#dropping unnecessary variables
admissions.drop(['heart_failure', 'admit_to_emergency', 'readmission_hours'], inplace = True, axis = 1)

In [129]:
def print_stats(df, var_name, extra_info = False):
    print('--------------------Current cohort %s --------------------' %var_name)
    print(sum(admissions[var_name]))
    print(len(admissions[var_name]))
    print('Number of patients: %f' %(sum(admissions[var_name])/float(len(admissions[var_name]))))
    if extra_info:
        cohort = df[df[var_name] == 1]
        print('Gender Breakdown: \n%s' %str(cohort.gender.value_counts(normalize=True))+'\n')
        print('Insurance Breakdown: \n%s' %str(cohort.insurance.value_counts(normalize=True))+'\n')
        print('Ethnicity Breakdown: \n%s' %str(cohort.ethnicity.value_counts(normalize=True))+'\n')

In [130]:
#printing final statistics
patients = patients[['subject_id', 'gender']]
stats = admissions.merge(patients, on = ['subject_id'], how = 'left')
print("Total Number of patients %d: " %stats.shape[0])
for var in ['48h_hf', '14d_hf', '30d_hf', 'er_hf', '48h', '14d', '30d', 'er']:
    print_stats(stats, var, extra_info = True)

Total Number of patients 4677924: 
--------------------Current cohort 48h_hf --------------------
65
4677924
Number of patients: 0.000014
Gender Breakdown: 
M    0.500614
F    0.499386
Name: gender, dtype: float64

Insurance Breakdown: 
Other       0.485436
Medicare    0.437801
Medicaid    0.076763
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                            0.673742
BLACK/AFRICAN AMERICAN           0.152221
HISPANIC/LATINO                  0.050064
OTHER                            0.046508
UNKNOWN                          0.037493
ASIAN                            0.033278
UNABLE TO OBTAIN                 0.003654
AMERICAN INDIAN/ALASKA NATIVE    0.003040
Name: ethnicity, dtype: float64

--------------------Current cohort 14d_hf --------------------
108
4677924
Number of patients: 0.000023
Gender Breakdown: 
M    0.500614
F    0.499386
Name: gender, dtype: float64

Insurance Breakdown: 
Other       0.485436
Medicare    0.437801
Medicaid    0.076763
Name: insu

In [133]:
#printing stats for total population
print('Gender Breakdown: \n%s' %str(stats.gender.value_counts(normalize=True))+'\n')
print('Insurance Breakdown: \n%s' %str(stats.insurance.value_counts(normalize=True))+'\n')
print('Ethnicity Breakdown: \n%s' %str(stats.ethnicity.value_counts(normalize=True))+'\n')

Gender Breakdown: 
M    0.500614
F    0.499386
Name: gender, dtype: float64

Insurance Breakdown: 
Other       0.485436
Medicare    0.437801
Medicaid    0.076763
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                            0.673742
BLACK/AFRICAN AMERICAN           0.152221
HISPANIC/LATINO                  0.050064
OTHER                            0.046508
UNKNOWN                          0.037493
ASIAN                            0.033278
UNABLE TO OBTAIN                 0.003654
AMERICAN INDIAN/ALASKA NATIVE    0.003040
Name: ethnicity, dtype: float64

