# Data Loading and Exploration

In [1]:
import pandas as pd
import datetime
from dateutil import parser
import numpy as np

In [2]:
mimiciv_dir = '../../mimic_iv/core/'
hosp_dir = '../../mimic_iv/hosp/'

In [3]:
all_admissions = pd.read_csv(f'{mimiciv_dir}admissions.csv', low_memory=False)
all_admissions[:10]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,12427812,21593330,2184-01-06 11:51:00,2184-01-10 11:45:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,,UNKNOWN,,,0
1,14029832,22059088,2120-01-18 01:28:00,2120-01-20 16:13:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,,OTHER,,,0
2,14495017,22484010,2175-01-28 15:41:00,2175-01-29 16:00:00,,DIRECT EMER.,PHYSICIAN REFERRAL,HOME,Other,?,,WHITE,,,0
3,13676048,23865469,2193-01-19 05:27:00,2193-01-24 18:59:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,?,MARRIED,WHITE,,,0
4,13831972,27763544,2131-01-27 04:03:00,2131-01-27 05:39:00,,EU OBSERVATION,EMERGENCY ROOM,,Medicaid,ENGLISH,SINGLE,WHITE,2131-01-26 22:19:00,2131-01-27 05:39:00,0
5,18523038,25414328,2142-08-26 17:14:00,2142-08-27 10:00:00,,DIRECT OBSERVATION,PROCEDURE SITE,,Other,ENGLISH,SINGLE,WHITE,,,0
6,16705931,20580522,2174-10-24 11:30:00,2174-10-24 18:45:00,,DIRECT OBSERVATION,PHYSICIAN REFERRAL,,Other,ENGLISH,MARRIED,WHITE,,,0
7,19963742,24951506,2171-07-04 15:58:00,2171-07-05 13:37:00,,AMBULATORY OBSERVATION,PACU,,Other,ENGLISH,SINGLE,UNKNOWN,,,0
8,10903424,22568585,2181-01-31 13:09:00,2181-01-31 13:42:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0
9,15030422,21975876,2149-09-21 23:54:00,2149-09-22 03:59:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0


In [4]:
all_admissions.shape

(524520, 15)

In [5]:
patients = pd.read_csv(f'{mimiciv_dir}patients.csv', low_memory=False)
patients[:10]

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10002723,F,0,2128,2017 - 2019,
1,10003939,M,0,2184,2008 - 2010,
2,10004222,M,0,2161,2014 - 2016,
3,10005325,F,0,2154,2011 - 2013,
4,10007338,F,0,2153,2017 - 2019,
5,10008101,M,0,2142,2008 - 2010,
6,10009872,F,0,2168,2014 - 2016,
7,10011333,F,0,2132,2014 - 2016,
8,10011879,M,0,2158,2014 - 2016,
9,10012663,F,0,2171,2011 - 2013,


# Adding in Heart Failure Information

In [6]:
hf_info = pd.read_csv(f'../phase1_teamA/final_cohort.csv')
hf_info.rename(columns = {'CHF exacerbation': 'heart_failure'}, inplace = True)
hf_info.heart_failure.value_counts(dropna=False)

False    431453
True      11614
Name: heart_failure, dtype: int64

In [7]:
admissions = all_admissions.merge(hf_info, on=['hadm_id', 'subject_id'], how = 'left')

In [8]:
admissions['heart_failure'] = np.where(admissions['heart_failure'] == True, 1, 0)
admissions.heart_failure.value_counts()

0    512906
1     11614
Name: heart_failure, dtype: int64

In [9]:
admissions.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admission_location', 'discharge_location',
       'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime',
       'edouttime', 'hospital_expire_flag', 'Unnamed: 0', 'chronic_dialysis',
       'heart_failure'],
      dtype='object')

# Adding Readmission Label

In [10]:
# Sort by subject then time (assumes all admittime, dischtime ranges are exclusive)
admissions = admissions.sort_values(by=["subject_id", "admittime"])

# Convert date strings to datetime objects
admissions['admittime'] = admissions['admittime'].map(parser.parse)
admissions['dischtime'] = admissions['dischtime'].map(parser.parse)

# For temporary ease of understanding, append the array to itself, shifted up one on the right half
admissions_combined = pd.concat([admissions, admissions.shift(-1).rename(lambda x: str(x) + "_shifted", axis="columns")], axis=1)

# Whether or not that visit was before a readmission of that patient (note the last visit is false)
admissions_combined['was_readmitted'] = (admissions_combined['subject_id'] == admissions_combined['subject_id_shifted']) & (admissions_combined['hadm_id']!=admissions_combined['hadm_id_shifted'])

# This creates a new column that's the difference in dates between admissions, regardless of if its the same patient
admissions_combined['readmission_hours'] = (admissions_combined['admittime_shifted'] - admissions_combined['dischtime']).fillna(pd.Timedelta(seconds=-1))
admissions_combined['readmission_hours'] = admissions_combined['readmission_hours'].dt.total_seconds()/3600
admissions_combined['readmission_hours'] = admissions_combined['was_readmitted'] * admissions_combined['readmission_hours']


In [11]:
# adding in an admission number because maybe we only want to look at first readmission
admissions_combined['counter'] = 1
admissions_combined = admissions_combined.sort_values(by=["subject_id", "admittime"])
admissions_combined['readmission_num'] = admissions_combined.groupby(['subject_id']).counter.cumsum() 

In [12]:
#only limit to patients who had heart failure on the current admission by combining with admissions cohort of patients with hf on first visit
admissions = admissions[admissions['heart_failure'] == 1]
admissions = admissions.merge(admissions_combined[['readmission_num','readmission_hours', 'was_readmitted', 'heart_failure_shifted', 'hadm_id']], on='hadm_id', how = 'left')
admissions.rename(columns={'heart_failure_shifted':'readmitted_hf'}, inplace = True)

# All hadm_ids belonging to people who were readmitted, fitler to last 10 for brevity of printing
admissions[(admissions['was_readmitted']==True) | (admissions.shift(-1)['was_readmitted']==True)][-10:]

Unnamed: 0.1,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,...,edregtime,edouttime,hospital_expire_flag,Unnamed: 0,chronic_dialysis,heart_failure,readmission_num,readmission_hours,was_readmitted,readmitted_hf
11600,19993776,24123446,2135-07-05 00:08:00,2135-07-07 12:32:00,,EW EMER.,EMERGENCY ROOM,HOME,Medicare,ENGLISH,...,2135-07-04 20:37:00,2135-07-05 01:14:00,0,2252374.0,False,1,3,363.8,True,0.0
11601,19993951,29858732,2141-08-17 15:58:00,2141-08-23 18:54:00,,DIRECT EMER.,CLINIC REFERRAL,HOME,Medicare,ENGLISH,...,,,0,2978118.0,False,1,6,165.35,True,1.0
11602,19993951,24151632,2141-08-30 16:15:00,2141-09-14 16:30:00,,OBSERVATION ADMIT,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2141-08-30 13:11:00,2141-08-30 17:47:00,0,3001251.0,False,1,7,1258.616667,True,1.0
11603,19993951,28863685,2141-11-06 03:07:00,2141-11-16 16:43:00,,OBSERVATION ADMIT,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2141-11-05 20:36:00,2141-11-06 05:01:00,0,3095799.0,False,1,8,0.0,False,0.0
11604,19994379,21694878,2128-04-29 20:06:00,2128-05-10 16:21:00,,URGENT,TRANSFER FROM HOSPITAL,CHRONIC/LONG TERM ACUTE CARE,Medicare,ENGLISH,...,,,0,2065030.0,False,1,1,26158.9,True,0.0
11605,19994379,27334101,2131-05-30 16:54:00,2131-06-21 17:20:00,,OBSERVATION ADMIT,PHYSICIAN REFERRAL,REHAB,Medicare,ENGLISH,...,2131-05-30 12:24:00,2131-05-30 18:03:00,0,3639435.0,False,1,3,213.0,True,1.0
11606,19994379,23099193,2131-06-30 14:20:00,2131-07-22 14:58:00,,OBSERVATION ADMIT,PHYSICIAN REFERRAL,REHAB,Medicare,ENGLISH,...,,,0,3683537.0,False,1,4,0.0,False,0.0
11607,19997367,29933340,2128-01-17 21:25:00,2128-01-21 17:00:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2128-01-17 16:44:00,2128-01-17 23:03:00,0,2249108.0,False,1,9,123.35,True,1.0
11608,19997367,22314636,2128-01-26 20:21:00,2128-01-31 16:00:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2128-01-26 16:08:00,2128-01-27 01:20:00,0,2263484.0,False,1,10,217.55,True,1.0
11609,19997367,21009481,2128-02-09 17:33:00,2128-02-15 15:15:00,,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2128-02-09 00:10:00,2128-02-09 18:22:00,0,2284127.0,False,1,11,98.416667,True,1.0


In [13]:
admissions.shape

(11614, 22)

In [14]:
admissions.readmission_hours.value_counts()
# if readmission_hours is 0, that means they were not readmitted

-0.000000       4701
 0.016667          7
 265.283333        3
 165.950000        3
 242.483333        3
                ... 
 735.950000        1
 337.200000        1
 2228.700000       1
 1297.650000       1
 9236.416667       1
Name: readmission_hours, Length: 6764, dtype: int64

# Adding in Emergency Department Information

In [15]:
admissions.admission_location.value_counts()

EMERGENCY ROOM                            4418
PHYSICIAN REFERRAL                        3082
TRANSFER FROM HOSPITAL                    2434
WALK-IN/SELF REFERRAL                      811
TRANSFER FROM SKILLED NURSING FACILITY     373
CLINIC REFERRAL                            304
PROCEDURE SITE                             132
PACU                                        35
INFORMATION NOT AVAILABLE                   11
AMBULATORY SURGERY TRANSFER                  9
INTERNAL TRANSFER TO OR FROM PSYCH           5
Name: admission_location, dtype: int64

In [16]:
#were they readmitted in the ER
#admissions_combined.drop('readmit_to_emergency', inplace = True)
admissions_combined['readmit_to_emergency'] = np.where(admissions_combined['admission_location_shifted'].notnull() & admissions_combined['admission_location_shifted'].str.contains('EMERGENCY'), 1, 0)
print(admissions_combined.readmit_to_emergency.value_counts())
emergency_df = admissions_combined[['hadm_id', 'readmit_to_emergency']]
admissions = admissions.merge(emergency_df, how ='left', on = ['hadm_id'])

admissions[['subject_id', 'readmit_to_emergency', 'admission_location']].head()

0    278279
1    246241
Name: readmit_to_emergency, dtype: int64


Unnamed: 0,subject_id,readmit_to_emergency,admission_location
0,10000980,1,EMERGENCY ROOM
1,10000980,1,EMERGENCY ROOM
2,10000980,0,EMERGENCY ROOM
3,10000980,1,WALK-IN/SELF REFERRAL
4,10002131,1,EMERGENCY ROOM


In [17]:
admissions.fillna(0, inplace = True)
admissions.head()

Unnamed: 0.1,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,...,edouttime,hospital_expire_flag,Unnamed: 0,chronic_dialysis,heart_failure,readmission_num,readmission_hours,was_readmitted,readmitted_hf,readmit_to_emergency
0,10000980,24947999,2190-11-06 20:57:00,2190-11-08 15:58:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2190-11-06 23:16:00,0,2245805.0,False,1,3,3506.833333,True,0.0,1
1,10000980,25911675,2191-05-23 15:33:00,2191-05-24 17:14:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2191-05-23 17:56:00,0,2534071.0,False,1,5,1269.116667,True,1.0,1
2,10000980,29659838,2191-07-16 14:21:00,2191-07-19 13:03:00,0,EW EMER.,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,...,2191-07-16 16:22:00,0,2603822.0,False,1,6,18179.966667,True,1.0,0
3,10000980,20897796,2193-08-15 01:01:00,2193-08-17 15:07:00,0,OBSERVATION ADMIT,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Other,ENGLISH,...,2193-08-15 02:22:00,0,3658009.0,False,1,7,-0.0,False,0.0,1
4,10002131,24065018,2128-03-17 14:53:00,2128-03-19 16:25:00,0,EW EMER.,EMERGENCY ROOM,HOSPICE,Medicare,ENGLISH,...,2128-03-17 17:05:00,0,2729330.0,False,1,3,0.0,False,0.0,1


In [18]:
# sanity check to make sure that admit_to_emergency is only true when admit location is ER
admissions[admissions.readmit_to_emergency == 1].admission_location.value_counts(dropna=False)

EMERGENCY ROOM                            2547
TRANSFER FROM HOSPITAL                     796
PHYSICIAN REFERRAL                         748
WALK-IN/SELF REFERRAL                      141
TRANSFER FROM SKILLED NURSING FACILITY     122
CLINIC REFERRAL                             61
PROCEDURE SITE                              28
PACU                                        18
INFORMATION NOT AVAILABLE                    5
AMBULATORY SURGERY TRANSFER                  3
INTERNAL TRANSFER TO OR FROM PSYCH           2
Name: admission_location, dtype: int64

In [19]:
admissions.readmit_to_emergency.value_counts(dropna=False)

0    7143
1    4471
Name: readmit_to_emergency, dtype: int64

# Expanding to 8 Output Channels

Output Channels to Add

1) 48h readmission due to HF exacerbation

2) 14-day readmission due to HF exacerbation

3) 30-day readmission due to HF exacerbation

4) ED visit due to HF exacerbation

5) 48h readmission due to any reason

6) 14-day readmission due to any reason

7) 30-day readmission due to any reason

8) ED visit due to any reason


In [20]:
admissions['48h_hf'] = np.where((admissions.readmission_hours <=48) & (admissions.readmission_hours >0), 1, 0)
admissions['48h_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions['48h_hf'] == 1), 1, 0)
admissions['48h_hf'].value_counts()

0    11548
1       66
Name: 48h_hf, dtype: int64

In [21]:
admissions['14d_hf'] = np.where((admissions.readmission_hours <=14*24) & (admissions.readmission_hours >0), 1, 0)
admissions['14d_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions['14d_hf'] == 1), 1, 0)
admissions['14d_hf'].value_counts()

0    10927
1      687
Name: 14d_hf, dtype: int64

In [22]:
admissions['30d_hf'] = np.where((admissions.readmission_hours <=30*24) & (admissions.readmission_hours >0), 1, 0)
admissions['30d_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions['30d_hf'] == 1), 1, 0)
admissions['30d_hf'].value_counts()

0    10368
1     1246
Name: 30d_hf, dtype: int64

In [23]:
admissions['er_hf'] = np.where((admissions.readmitted_hf == 1) & (admissions.readmit_to_emergency == 1), 1, 0)
admissions['er_hf'].value_counts()

0    10489
1     1125
Name: er_hf, dtype: int64

In [24]:
admissions['48h'] = np.where((admissions.readmission_hours <=48) & (admissions.readmission_hours >0), 1, 0)
admissions['48h'].value_counts()

0    11414
1      200
Name: 48h, dtype: int64

In [25]:
admissions['14d'] = np.where((admissions.readmission_hours <=14*24)&(admissions.readmission_hours >0), 1, 0)
admissions['14d'].value_counts()

0    10039
1     1575
Name: 14d, dtype: int64

In [26]:
admissions['30d'] = np.where((admissions.readmission_hours <=30*24)&(admissions.readmission_hours >0), 1, 0)
admissions['30d'].value_counts()

0    8898
1    2716
Name: 30d, dtype: int64

In [27]:
admissions['er'] = np.where(admissions.readmit_to_emergency == 1, 1, 0)
admissions['er'].value_counts()

0    7143
1    4471
Name: er, dtype: int64

In [28]:
admissions.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admission_location', 'discharge_location',
       'insurance', 'language', 'marital_status', 'ethnicity', 'edregtime',
       'edouttime', 'hospital_expire_flag', 'Unnamed: 0', 'chronic_dialysis',
       'heart_failure', 'readmission_num', 'readmission_hours',
       'was_readmitted', 'readmitted_hf', 'readmit_to_emergency', '48h_hf',
       '14d_hf', '30d_hf', 'er_hf', '48h', '14d', '30d', 'er'],
      dtype='object')

In [29]:
#dropping unnecessary variables
admissions.drop(['readmitted_hf', 'was_readmitted','readmit_to_emergency', 'Unnamed: 0', 'readmission_hours'], inplace = True, axis = 1)

In [30]:
def print_stats(df, var_name, extra_info = False):
    print('--------------------Current cohort %s --------------------' %var_name)
    print(sum(admissions[var_name]))
    print(len(admissions[var_name]))
    print('Number of patients: %f' %(sum(admissions[var_name])/float(len(admissions[var_name]))))
    if extra_info:
        cohort = df[df[var_name] == 1]
        print('Gender Breakdown: \n%s' %str(cohort.gender.value_counts(normalize=True))+'\n')
        print('Insurance Breakdown: \n%s' %str(cohort.insurance.value_counts(normalize=True))+'\n')
        print('Ethnicity Breakdown: \n%s' %str(cohort.ethnicity.value_counts(normalize=True))+'\n')

In [31]:
#printing final statistics
patients = patients[['subject_id', 'gender']]
stats = admissions.merge(patients, on = ['subject_id'], how = 'left')
print("Total Number of patients %d: " %stats.shape[0])
for var in ['48h_hf', '14d_hf', '30d_hf', 'er_hf', '48h', '14d', '30d', 'er']:
    print_stats(stats, var, extra_info = True)

Total Number of patients 11614: 
--------------------Current cohort 48h_hf --------------------
66
11614
Number of patients: 0.005683
Gender Breakdown: 
M    0.69697
F    0.30303
Name: gender, dtype: float64

Insurance Breakdown: 
Medicare    0.666667
Other       0.287879
Medicaid    0.045455
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                     0.818182
BLACK/AFRICAN AMERICAN    0.075758
HISPANIC/LATINO           0.060606
OTHER                     0.030303
ASIAN                     0.015152
Name: ethnicity, dtype: float64

--------------------Current cohort 14d_hf --------------------
687
11614
Number of patients: 0.059153
Gender Breakdown: 
M    0.544396
F    0.455604
Name: gender, dtype: float64

Insurance Breakdown: 
Medicare    0.573508
Other       0.379913
Medicaid    0.046579
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                            0.701601
BLACK/AFRICAN AMERICAN           0.183406
HISPANIC/LATINO                  0.04803

In [32]:
#printing stats for total population
print('Gender Breakdown: \n%s' %str(stats.gender.value_counts(normalize=True))+'\n')
print('Insurance Breakdown: \n%s' %str(stats.insurance.value_counts(normalize=True))+'\n')
print('Ethnicity Breakdown: \n%s' %str(stats.ethnicity.value_counts(normalize=True))+'\n')
print(admissions.shape)

Gender Breakdown: 
M    0.521267
F    0.478733
Name: gender, dtype: float64

Insurance Breakdown: 
Medicare    0.592475
Other       0.374548
Medicaid    0.032977
Name: insurance, dtype: float64

Ethnicity Breakdown: 
WHITE                            0.690890
BLACK/AFRICAN AMERICAN           0.161185
UNKNOWN                          0.043654
HISPANIC/LATINO                  0.039952
OTHER                            0.035733
ASIAN                            0.025056
AMERICAN INDIAN/ALASKA NATIVE    0.003530
Name: ethnicity, dtype: float64

(11614, 26)


In [33]:
admissions.to_csv('final_cohort_with_outcome_labels.csv')