# Data Loading and Exploration

In [224]:
import pandas as pd
import datetime
from dateutil import parser
import numpy as np

In [225]:
mimiciv_dir = '../../mimic_iv/core/'
hosp_dir = '../../mimic_iv/hosp/'

In [226]:
all_admissions = pd.read_csv(f'{mimiciv_dir}admissions.csv', low_memory=False)
all_admissions[:10]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,12427812,21593330,2184-01-06 11:51:00,2184-01-10 11:45:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,,UNKNOWN,,,0
1,14029832,22059088,2120-01-18 01:28:00,2120-01-20 16:13:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,,OTHER,,,0
2,14495017,22484010,2175-01-28 15:41:00,2175-01-29 16:00:00,,DIRECT EMER.,PHYSICIAN REFERRAL,HOME,Other,?,,WHITE,,,0
3,13676048,23865469,2193-01-19 05:27:00,2193-01-24 18:59:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,?,MARRIED,WHITE,,,0
4,13831972,27763544,2131-01-27 04:03:00,2131-01-27 05:39:00,,EU OBSERVATION,EMERGENCY ROOM,,Medicaid,ENGLISH,SINGLE,WHITE,2131-01-26 22:19:00,2131-01-27 05:39:00,0
5,18523038,25414328,2142-08-26 17:14:00,2142-08-27 10:00:00,,DIRECT OBSERVATION,PROCEDURE SITE,,Other,ENGLISH,SINGLE,WHITE,,,0
6,16705931,20580522,2174-10-24 11:30:00,2174-10-24 18:45:00,,DIRECT OBSERVATION,PHYSICIAN REFERRAL,,Other,ENGLISH,MARRIED,WHITE,,,0
7,19963742,24951506,2171-07-04 15:58:00,2171-07-05 13:37:00,,AMBULATORY OBSERVATION,PACU,,Other,ENGLISH,SINGLE,UNKNOWN,,,0
8,10903424,22568585,2181-01-31 13:09:00,2181-01-31 13:42:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0
9,15030422,21975876,2149-09-21 23:54:00,2149-09-22 03:59:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0


In [227]:
all_admissions.shape

(524520, 15)

In [228]:
patients = pd.read_csv(f'{mimiciv_dir}patients.csv', low_memory=False)
patients[:10]

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10002723,F,0,2128,2017 - 2019,
1,10003939,M,0,2184,2008 - 2010,
2,10004222,M,0,2161,2014 - 2016,
3,10005325,F,0,2154,2011 - 2013,
4,10007338,F,0,2153,2017 - 2019,
5,10008101,M,0,2142,2008 - 2010,
6,10009872,F,0,2168,2014 - 2016,
7,10011333,F,0,2132,2014 - 2016,
8,10011879,M,0,2158,2014 - 2016,
9,10012663,F,0,2171,2011 - 2013,


# Adding in Heart Failure Information

In [229]:
hf_info = pd.read_csv(f'../phase1_teamA/final_cohort.csv')
hf_info.rename(columns = {'CHF exacerbation': 'heart_failure'}, inplace = True)
hf_info.heart_failure.value_counts(dropna=False)

False    495420
True      25691
Name: heart_failure, dtype: int64

In [230]:
admissions = all_admissions.merge(hf_info, on=['hadm_id', 'subject_id'], how = 'left')

In [231]:
admissions['heart_failure'] = np.where(admissions['heart_failure'] == True, 1, 0)
admissions.heart_failure.value_counts()

0    498858
1     25662
Name: heart_failure, dtype: int64

In [232]:
admissions.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admission_location', 'discharge_location',
       'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime',
       'edouttime', 'hospital_expire_flag', 'Unnamed: 0', 'chronic_dialysis',
       'heart_failure'],
      dtype='object')

# Adding Readmission Label

In [233]:
# Sort by subject then time (assumes all admittime, dischtime ranges are exclusive)
admissions = admissions.sort_values(by=["subject_id", "admittime"])

# Convert date strings to datetime objects
admissions['admittime'] = admissions['admittime'].map(parser.parse)
admissions['dischtime'] = admissions['dischtime'].map(parser.parse)

# For temporary ease of understanding, append the array to itself, shifted up one on the right half
admissions_combined = pd.concat([admissions, admissions.shift(-1).rename(lambda x: str(x) + "_shifted", axis="columns")], axis=1)

# Whether or not that visit was before a readmission of that patient (note the last visit is false)
admissions_combined['was_readmitted'] = (admissions_combined['subject_id'] == admissions_combined['subject_id_shifted']) & (admissions_combined['hadm_id']!=admissions_combined['hadm_id_shifted'])

# This creates a new column that's the difference in dates between admissions, regardless of if its the same patient
admissions_combined['readmission_hours'] = (admissions_combined['admittime_shifted'] - admissions_combined['dischtime']).fillna(pd.Timedelta(seconds=-1))
admissions_combined['readmission_hours'] = admissions_combined['readmission_hours'].dt.total_seconds()/3600
admissions_combined['readmission_hours'] = admissions_combined['was_readmitted'] * admissions_combined['readmission_hours']


In [234]:
# adding in an admission number because maybe we only want to look at first readmission
admissions_combined['counter'] = 1
admissions_combined = admissions_combined.sort_values(by=["subject_id", "admittime"])
admissions_combined['readmission_num'] = admissions_combined.groupby(['subject_id']).counter.cumsum() 

In [235]:
#only limit to patients who had heart failure on the current admission by combining with admissions cohort of patients with hf on first visit
admissions = admissions[admissions['heart_failure'] == 1]
admissions = admissions.merge(admissions_combined[['readmission_num','readmission_hours', 'was_readmitted', 'heart_failure_shifted', 'hadm_id']], on='hadm_id', how = 'left')
admissions.rename(columns={'heart_failure_shifted':'readmitted_hf'}, inplace = True)

# All hadm_ids belonging to people who were readmitted, fitler to last 10 for brevity of printing
admissions[(admissions['was_readmitted']==True) | (admissions.shift(-1)['was_readmitted']==True)][-10:]

Unnamed: 0.1,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,...,edregtime,edouttime,hospital_expire_flag,Unnamed: 0,chronic_dialysis,heart_failure,readmission_num,readmission_hours,was_readmitted,readmitted_hf
25647,19997293,20208898,2123-10-12 13:46:00,2123-10-30 15:13:00,,URGENT,TRANSFER FROM HOSPITAL,SKILLED NURSING FACILITY,Medicare,ENGLISH,...,,,0,1600176.0,False,1,1,56.216667,True,0.0
25648,19997293,28847872,2123-12-29 01:47:00,2124-01-10 16:30:00,,URGENT,TRANSFER FROM HOSPITAL,CHRONIC/LONG TERM ACUTE CARE,Medicare,ENGLISH,...,,,0,1688670.0,False,1,4,0.0,False,0.0
25649,19997367,21508795,2127-04-02 01:03:00,2127-04-17 15:05:00,,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,ENGLISH,...,2127-04-01 21:05:00,2127-04-02 02:16:00,0,2411997.0,False,1,5,419.583333,True,0.0
25650,19997367,22967208,2127-05-24 18:33:00,2127-05-27 15:30:00,,URGENT,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Medicare,ENGLISH,...,2127-05-24 08:47:00,2127-05-24 19:45:00,0,2485044.0,False,1,7,2000.833333,True,0.0
25651,19997367,29933340,2128-01-17 21:25:00,2128-01-21 17:00:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2128-01-17 16:44:00,2128-01-17 23:03:00,0,2829725.0,False,1,9,123.35,True,1.0
25652,19997367,22314636,2128-01-26 20:21:00,2128-01-31 16:00:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2128-01-26 16:08:00,2128-01-27 01:20:00,0,2844186.0,False,1,10,217.55,True,1.0
25653,19997367,21009481,2128-02-09 17:33:00,2128-02-15 15:15:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2128-02-09 00:10:00,2128-02-09 18:22:00,0,2865048.0,False,1,11,98.416667,True,1.0
25657,19997752,29452285,2128-02-28 21:28:00,2128-03-10 14:35:00,,URGENT,TRANSFER FROM HOSPITAL,SKILLED NURSING FACILITY,Other,ENGLISH,...,,,0,3631252.0,False,1,1,0.0,False,0.0
25658,19998330,24492004,2178-10-01 07:28:00,2178-10-08 17:40:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Other,?,...,2178-10-01 05:15:00,2178-10-01 08:51:00,0,1549765.0,False,1,3,45.716667,True,0.0
25659,19998330,21135114,2178-10-21 15:14:00,2178-10-25 17:27:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Other,ENGLISH,...,2178-10-21 14:16:00,2178-10-21 17:20:00,0,1571295.0,False,1,5,796.4,True,1.0


In [236]:
admissions.shape

(25662, 22)

In [237]:
admissions.readmission_hours.value_counts()
# if readmission_hours is 0, that means they were not readmitted

-0.000000       9370
 0.016667          5
 242.483333        4
 125.033333        4
 141.416667        4
                ... 
 1446.016667       1
 1633.983333       1
 11.433333         1
 2819.450000       1
 509.450000        1
Name: readmission_hours, Length: 15584, dtype: int64

# Adding in Emergency Department Information

In [238]:
admissions.admission_location.value_counts()

EMERGENCY ROOM                            14360
PHYSICIAN REFERRAL                         4746
TRANSFER FROM HOSPITAL                     4228
WALK-IN/SELF REFERRAL                       807
CLINIC REFERRAL                             510
TRANSFER FROM SKILLED NURSING FACILITY      491
PROCEDURE SITE                              400
PACU                                         87
INFORMATION NOT AVAILABLE                    17
AMBULATORY SURGERY TRANSFER                   9
INTERNAL TRANSFER TO OR FROM PSYCH            2
Name: admission_location, dtype: int64

In [239]:
#were they readmitted in the ER
#admissions_combined.drop('readmit_to_emergency', inplace = True)
admissions_combined['readmit_to_emergency'] = np.where(admissions_combined['admission_location_shifted'].notnull() & admissions_combined['admission_location_shifted'].str.contains('EMERGENCY'), 1, 0)
print(admissions_combined.readmit_to_emergency.value_counts())
emergency_df = admissions_combined[['hadm_id', 'readmit_to_emergency']]
admissions = admissions.merge(emergency_df, how ='left', on = ['hadm_id'])

admissions[['subject_id', 'readmit_to_emergency', 'admission_location']].head()

0    278279
1    246241
Name: readmit_to_emergency, dtype: int64


Unnamed: 0,subject_id,readmit_to_emergency,admission_location
0,10000980,1,EMERGENCY ROOM
1,10000980,1,EMERGENCY ROOM
2,10000980,1,EMERGENCY ROOM
3,10000980,1,EMERGENCY ROOM
4,10000980,0,EMERGENCY ROOM


In [240]:
admissions.fillna(0, inplace = True)
admissions.head()

Unnamed: 0.1,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,...,edouttime,hospital_expire_flag,Unnamed: 0,chronic_dialysis,heart_failure,readmission_num,readmission_hours,was_readmitted,readmitted_hf,readmit_to_emergency
0,10000980,29654838,2188-01-03 17:41:00,2188-01-05 17:30:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2188-01-03 18:42:00,0,1555741.0,False,1,1,12926.133333,True,1.0,1
1,10000980,26913865,2189-06-27 07:38:00,2189-07-03 03:00:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2189-06-27 08:42:00,0,2160674.0,False,1,2,11801.95,True,1.0,1
2,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2190-11-06 23:16:00,0,2826374.0,False,1,3,3506.833333,True,0.0,1
3,10000980,25911675,2191-05-23 15:33:00,2191-05-24 17:14:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2191-05-23 17:56:00,0,3116560.0,False,1,5,1269.116667,True,1.0,1
4,10000980,29659838,2191-07-16 14:21:00,2191-07-19 13:03:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2191-07-16 16:22:00,0,3186952.0,False,1,6,18179.966667,True,1.0,0


In [241]:
# sanity check to make sure that admit_to_emergency is only true when admit location is ER
admissions[admissions.readmit_to_emergency == 1].admission_location.value_counts(dropna=False)

EMERGENCY ROOM                            9372
TRANSFER FROM HOSPITAL                    1637
PHYSICIAN REFERRAL                        1615
TRANSFER FROM SKILLED NURSING FACILITY     201
CLINIC REFERRAL                            175
PROCEDURE SITE                             146
WALK-IN/SELF REFERRAL                      144
PACU                                        40
INFORMATION NOT AVAILABLE                    6
AMBULATORY SURGERY TRANSFER                  3
0                                            3
Name: admission_location, dtype: int64

In [242]:
admissions.readmit_to_emergency.value_counts(dropna=False)

1    13342
0    12320
Name: readmit_to_emergency, dtype: int64

# Expanding to 8 Output Channels

Output Channels to Add

1) 48h readmission due to HF exacerbation

2) 14-day readmission due to HF exacerbation

3) 30-day readmission due to HF exacerbation

4) ED visit due to HF exacerbation

5) 48h readmission due to any reason

6) 14-day readmission due to any reason

7) 30-day readmission due to any reason

8) ED visit due to any reason


In [243]:
admissions['48h_hf'] = np.where((admissions.readmission_hours <=48) & (admissions.readmission_hours >0), 1, 0)
admissions['48h_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions['48h_hf'] == 1), 1, 0)
admissions['48h_hf'].value_counts()

0    25442
1      220
Name: 48h_hf, dtype: int64

In [244]:
admissions['14d_hf'] = np.where((admissions.readmission_hours <=14*24) & (admissions.readmission_hours >0), 1, 0)
admissions['14d_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions['14d_hf'] == 1), 1, 0)
admissions['14d_hf'].value_counts()

0    23944
1     1718
Name: 14d_hf, dtype: int64

In [245]:
admissions['30d_hf'] = np.where((admissions.readmission_hours <=30*24) & (admissions.readmission_hours >0), 1, 0)
admissions['30d_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions['30d_hf'] == 1), 1, 0)
admissions['30d_hf'].value_counts()

0    22794
1     2868
Name: 30d_hf, dtype: int64

In [246]:
admissions['er_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions.readmit_to_emergency == 1), 1, 0)
admissions['er_hf'].value_counts()

0    21384
1     4278
Name: er_hf, dtype: int64

In [247]:
admissions['48h'] = np.where((admissions.readmission_hours <=48) & (admissions.readmission_hours >0), 1, 0)
admissions['48h'].value_counts()

0    25142
1      520
Name: 48h, dtype: int64

In [248]:
admissions['14d'] = np.where((admissions.readmission_hours <=14*24)&(admissions.readmission_hours >0), 1, 0)
admissions['14d'].value_counts()

0    21928
1     3734
Name: 14d, dtype: int64

In [249]:
admissions['30d'] = np.where((admissions.readmission_hours <=30*24)&(admissions.readmission_hours >0), 1, 0)
admissions['30d'].value_counts()

0    19525
1     6137
Name: 30d, dtype: int64

In [250]:
admissions['er'] = np.where(admissions.readmit_to_emergency == 1, 1, 0)
admissions['er'].value_counts()

1    13342
0    12320
Name: er, dtype: int64

In [251]:
#dropping unnecessary variables
admissions.drop(['readmitted_hf', 'was_readmitted','readmit_to_emergency', 'readmission_hours'], inplace = True, axis = 1)

In [252]:
def print_stats(df, var_name, extra_info = False):
    print('--------------------Current cohort %s --------------------' %var_name)
    print(sum(admissions[var_name]))
    print(len(admissions[var_name]))
    print('Number of patients: %f' %(sum(admissions[var_name])/float(len(admissions[var_name]))))
    if extra_info:
        cohort = df[df[var_name] == 1]
        print('Gender Breakdown: \n%s' %str(cohort.gender.value_counts(normalize=True))+'\n')
        print('Insurance Breakdown: \n%s' %str(cohort.insurance.value_counts(normalize=True))+'\n')
        print('Ethnicity Breakdown: \n%s' %str(cohort.ethnicity.value_counts(normalize=True))+'\n')

In [253]:
#printing final statistics
patients = patients[['subject_id', 'gender']]
stats = admissions.merge(patients, on = ['subject_id'], how = 'left')
print("Total Number of patients %d: " %stats.shape[0])
for var in ['48h_hf', '14d_hf', '30d_hf', 'er_hf', '48h', '14d', '30d', 'er']:
    print_stats(stats, var, extra_info = True)

Total Number of patients 25662: 
--------------------Current cohort 48h_hf --------------------
220
25662
Number of patients: 0.008573
Gender Breakdown: 
M    0.572727
F    0.427273
Name: gender, dtype: float64

Insurance Breakdown: 
Medicare    0.677273
Other       0.268182
Medicaid    0.054545
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                     0.740909
BLACK/AFRICAN AMERICAN    0.122727
HISPANIC/LATINO           0.054545
OTHER                     0.040909
ASIAN                     0.022727
UNABLE TO OBTAIN          0.009091
UNKNOWN                   0.009091
Name: ethnicity, dtype: float64

--------------------Current cohort 14d_hf --------------------
1718
25662
Number of patients: 0.066947
Gender Breakdown: 
M    0.526193
F    0.473807
Name: gender, dtype: float64

Insurance Breakdown: 
Medicare    0.624563
Other       0.324214
Medicaid    0.051222
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                            0.716531
BLACK/AF

In [254]:
#printing stats for total population
print('Gender Breakdown: \n%s' %str(stats.gender.value_counts(normalize=True))+'\n')
print('Insurance Breakdown: \n%s' %str(stats.insurance.value_counts(normalize=True))+'\n')
print('Ethnicity Breakdown: \n%s' %str(stats.ethnicity.value_counts(normalize=True))+'\n')
print(admissions.shape)

Gender Breakdown: 
M    0.518705
F    0.481295
Name: gender, dtype: float64

Insurance Breakdown: 
Medicare    0.621620
Other       0.335126
Medicaid    0.043255
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                            0.715104
BLACK/AFRICAN AMERICAN           0.145936
HISPANIC/LATINO                  0.036747
OTHER                            0.035578
UNKNOWN                          0.035188
ASIAN                            0.023771
UNABLE TO OBTAIN                 0.005300
AMERICAN INDIAN/ALASKA NATIVE    0.002377
Name: ethnicity, dtype: float64

(25662, 27)
