# Data Loading and Exploration

In [2]:
import pandas as pd
import datetime
from dateutil import parser
import numpy as np

In [3]:
mimiciv_dir = ''

In [10]:
admissions = pd.read_csv(f'{mimiciv_dir}admissions.csv', low_memory=False)
admissions[:10]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,12427812,21593330,2184-01-06 11:51:00,2184-01-10 11:45:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,,UNKNOWN,,,0
1,14029832,22059088,2120-01-18 01:28:00,2120-01-20 16:13:00,,URGENT,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,,OTHER,,,0
2,14495017,22484010,2175-01-28 15:41:00,2175-01-29 16:00:00,,DIRECT EMER.,PHYSICIAN REFERRAL,HOME,Other,?,,WHITE,,,0
3,13676048,23865469,2193-01-19 05:27:00,2193-01-24 18:59:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,?,MARRIED,WHITE,,,0
4,13831972,27763544,2131-01-27 04:03:00,2131-01-27 05:39:00,,EU OBSERVATION,EMERGENCY ROOM,,Medicaid,ENGLISH,SINGLE,WHITE,2131-01-26 22:19:00,2131-01-27 05:39:00,0
5,18523038,25414328,2142-08-26 17:14:00,2142-08-27 10:00:00,,DIRECT OBSERVATION,PROCEDURE SITE,,Other,ENGLISH,SINGLE,WHITE,,,0
6,16705931,20580522,2174-10-24 11:30:00,2174-10-24 18:45:00,,DIRECT OBSERVATION,PHYSICIAN REFERRAL,,Other,ENGLISH,MARRIED,WHITE,,,0
7,19963742,24951506,2171-07-04 15:58:00,2171-07-05 13:37:00,,AMBULATORY OBSERVATION,PACU,,Other,ENGLISH,SINGLE,UNKNOWN,,,0
8,10903424,22568585,2181-01-31 13:09:00,2181-01-31 13:42:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0
9,15030422,21975876,2149-09-21 23:54:00,2149-09-22 03:59:00,,EU OBSERVATION,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,,,0


In [11]:
patients = pd.read_csv(f'{mimiciv_dir}patients.csv', low_memory=False)
patients[:10]

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10002723,F,0,2128,2017 - 2019,
1,10003939,M,0,2184,2008 - 2010,
2,10004222,M,0,2161,2014 - 2016,
3,10005325,F,0,2154,2011 - 2013,
4,10007338,F,0,2153,2017 - 2019,
5,10008101,M,0,2142,2008 - 2010,
6,10009872,F,0,2168,2014 - 2016,
7,10011333,F,0,2132,2014 - 2016,
8,10011879,M,0,2158,2014 - 2016,
9,10012663,F,0,2171,2011 - 2013,


In [12]:
admissions[admissions.subject_id == 12427812].head(40)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,12427812,21593330,2184-01-06 11:51:00,2184-01-10 11:45:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,,UNKNOWN,,,0
10253,12427812,23948770,2185-01-20 00:08:00,2185-01-21 11:45:00,,EU OBSERVATION,PHYSICIAN REFERRAL,,Other,ENGLISH,MARRIED,WHITE,2185-01-19 18:58:00,2185-01-20 01:34:00,0


In [13]:
d_icd_diagnoses = pd.read_csv(f'{mimiciv_dir}d_icd_diagnoses.csv', low_memory=False)
d_icd_diagnoses[:10]

Unnamed: 0,icd_code,icd_version,long_title
0,10,9,Cholera due to vibrio cholerae
1,11,9,Cholera due to vibrio cholerae el tor
2,19,9,"Cholera, unspecified"
3,20,9,Typhoid fever
4,21,9,Paratyphoid fever A
5,22,9,Paratyphoid fever B
6,23,9,Paratyphoid fever C
7,29,9,"Paratyphoid fever, unspecified"
8,30,9,Salmonella gastroenteritis
9,31,9,Salmonella septicemia


# Adding Readmission Label

In [14]:
# Sort by subject then time (assumes all admittime, dischtime ranges are exclusive)
admissions = admissions.sort_values(by=["subject_id", "admittime"])

# Convert date strings to datetime objects
admissions['admittime'] = admissions['admittime'].map(parser.parse)
admissions['dischtime'] = admissions['dischtime'].map(parser.parse)

# For temporary ease of understanding, append the array to itself, shifted up one on the right half
admissions_combined = pd.concat([admissions, admissions.shift(-1).rename(lambda x: str(x) + "_shifted", axis="columns")], axis=1)

# Whether or not that visit was before a readmission of that patient (note the last visit is false)
admissions_combined['was_readmitted'] = admissions_combined['subject_id'] == admissions_combined['subject_id_shifted']

# This creates a new column that's the difference in dates between admissions, regardless of if its the same patient
admissions_combined['readmission_days'] = (admissions_combined['admittime_shifted'] - admissions_combined['dischtime']).fillna(pd.Timedelta(seconds=-1))
admissions_combined['readmission_days'] = admissions_combined['readmission_days'].dt.days
admissions_combined['readmission_days'] = admissions_combined['was_readmitted'] * admissions_combined['readmission_days']
admissions['readmission_days'] = admissions_combined['readmission_days']
admissions['was_readmitted'] = admissions_combined['was_readmitted']

# All hadm_ids belonging to people who were readmitted, fitler to last 10 for brevity of printing
admissions[(admissions['was_readmitted']==True) | (admissions.shift(-1)['was_readmitted']==True)][-10:]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,readmission_days,was_readmitted
187950,19999784,27192150,2120-01-26 00:00:00,2120-02-01 12:15:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,53,True
186117,19999784,27319264,2120-03-26 09:53:00,2120-04-01 13:00:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,56,True
187424,19999784,29324445,2120-05-28 09:07:00,2120-06-02 08:55:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,54,True
168474,19999784,23406899,2120-07-26 09:18:00,2120-07-31 18:15:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,85,True
178803,19999784,29889147,2120-10-25 09:43:00,2120-10-31 09:00:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,91,True
179787,19999784,29956342,2121-01-31 00:00:00,2121-02-05 12:44:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,113,True
187505,19999784,24755486,2121-05-30 09:22:00,2121-06-05 08:48:00,,ELECTIVE,PHYSICIAN REFERRAL,HOME,Medicaid,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,0,False
172700,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,,EW EMER.,PHYSICIAN REFERRAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,WHITE,2147-07-17 17:18:00,2147-07-18 17:34:00,0,522,True
175560,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,,EW EMER.,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,ENGLISH,SINGLE,WHITE,2149-01-08 09:11:00,2149-01-08 18:12:00,0,0,False
387927,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,,EW EMER.,EMERGENCY ROOM,HOME,Other,ENGLISH,WIDOWED,WHITE,2164-07-24 21:16:00,2164-07-25 01:20:00,0,44,True


In [90]:
# adding in an admission number because maybe we only want to look at first readmission
admissions['counter'] = 1
admissions = admissions.sort_values(by=["subject_id", "admittime"])
admissions['admission_num'] = admissions.groupby(['subject_id']).counter.cumsum() 

In [148]:
# sanity checks to make sure that readmission_num flag is correct
admissions[admissions.subject_id == 12427812][['admission_num']][-10:]
admissions[admissions.readmission_num>3]
admissions[admissions.subject_id == 10000032].head()

AttributeError: 'DataFrame' object has no attribute 'readmission_num'

# Adding in Heart Failure Information

In [24]:
##inner join admissions with diagnoses_icd
diagnosis_icd = pd.read_csv('diagnoses_icd.csv', low_memory=False)
#diagnosis_icd = diagnosis_icd.iloc[0:100]
admissions = pd.merge(left = admissions, right = diagnosis_icd, how = 'inner', on = ['subject_id','hadm_id'], suffixes =('_adm', '_diag'))

#from Group A
chf_icd = pd.read_csv('CHF_ICD.csv', low_memory=False)

# for comparison
chf_icd['icd_code'] = chf_icd['icd_code'].apply(str)
admissions = admissions.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
chf_icd = chf_icd.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
#merge admissions table containing readmission days and icd_codes with appropriate icd_code for heart failure
#admissions_icd = pd.merge(left = admissions_icd, right = chf_icd, how = 'inner', on = ['icd_code','icd_version'], suffixes =('_adm', '_diag'))
admissions['heart_failure'] = admissions['icd_code'].isin(chf_icd['icd_code']).values
print(admissions['heart_failure'])


FINAL TABLE
       subject_id   hadm_id           admittime           dischtime deathtime  \
0        10000980  29654838 2188-01-03 17:41:00 2188-01-05 17:30:00       NaN   
1        10001877  21320596 2150-11-21 23:02:00 2150-11-23 16:46:00       NaN   
2        10003502  20459702 2166-02-15 13:06:00 2166-02-19 16:02:00       NaN   
3        10004401  23920883 2144-04-21 20:29:00 2144-05-01 13:00:00       NaN   
4        10011668  22181970 2131-06-14 15:26:00 2131-06-24 15:10:00       NaN   
...           ...       ...                 ...                 ...       ...   
53940    18711213  24429199 2179-12-14 07:15:00 2179-12-18 16:31:00       NaN   
53941    19363982  24138750 2190-07-08 16:33:00 2190-07-14 13:20:00       NaN   
53942    19389226  20347016 2140-07-16 16:20:00 2140-07-25 14:10:00       NaN   
53943    19815670  22369823 2139-08-24 17:34:00 2139-08-25 17:30:00       NaN   
53944    19855099  27428447 2169-05-29 16:16:00 2169-06-11 15:35:00       NaN   

               

# Adding in Emergency Department Information

In [25]:
admissions.admission_location.value_counts()

EMERGENCY ROOM                            38275
PHYSICIAN REFERRAL                         6483
TRANSFER FROM HOSPITAL                     5621
PROCEDURE SITE                             1473
CLINIC REFERRAL                             730
TRANSFER FROM SKILLED NURSING FACILITY      698
PACU                                        589
INTERNAL TRANSFER TO OR FROM PSYCH           63
INFORMATION NOT AVAILABLE                     2
Name: admission_location, dtype: int64

In [26]:
admissions['admit_to_emergency'] = np.where(admissions.admission_location.notnull() & admissions.admission_location.str.contains('EMERGENCY'), 1, 0)
admissions[['subject_id', 'admit_to_emergency', 'admission_location']].head()

Unnamed: 0,subject_id,admit_to_emergency,admission_location
0,10000980,1,EMERGENCY ROOM
1,10001877,1,EMERGENCY ROOM
2,10003502,1,EMERGENCY ROOM
3,10004401,1,EMERGENCY ROOM
4,10011668,1,EMERGENCY ROOM


In [27]:
# sanity check to make sure that admit_to_emergency is only true when admit location is ER
admissions[admissions.admit_to_emergency == 1].admission_location.value_counts(dropna=False)

EMERGENCY ROOM    38275
Name: admission_location, dtype: int64

In [28]:
admissions.admit_to_emergency.value_counts(dropna=False)

1    38275
0    15670
Name: admit_to_emergency, dtype: int64

# Expanding to 8 Output Channels

Output Channels to Add

1) 48h readmission due to HF exacerbation

2) 14-day readmission due to HF exacerbation

3) 30-day readmission due to HF exacerbation

4) ED visit due to HF exacerbation

5) 48h readmission due to any reason

6) 14-day readmission due to any reason

7) 30-day readmission due to any reason

8) ED visit due to any reason


In [29]:
admissions['48h_hf'] = np.where((admissions.heart_failure == 1) & (admissions.readmission_days <=2), 1, 0)
admissions['48h_hf'].value_counts()

AttributeError: 'DataFrame' object has no attribute 'heart_failure'

In [30]:
admissions['14d_hf'] = np.where((admissions.heart_failure == 1) & (admissions.readmission_days <=14), 1, 0)
admissions['14d_hf'].value_counts()

AttributeError: 'DataFrame' object has no attribute 'heart_failure'

In [31]:
admissions['30d_hf'] = np.where((admissions.heart_failure == 1) & (admissions.readmission_days <=30), 1, 0)
admissions['30d_hf'].value_counts()

AttributeError: 'DataFrame' object has no attribute 'heart_failure'

In [32]:
admissions['er_hf'] = np.where((admissions.heart_failure == 1) & (admissions.admit_to_emergency == 1), 1, 0)
admissions['er_hf'].value_counts()

AttributeError: 'DataFrame' object has no attribute 'heart_failure'

In [108]:
admissions['48h'] = np.where(admissions.readmission_days <=2, 1, 0)
admissions['48h'].value_counts()

1    280119
0    244401
Name: 48h, dtype: int64

In [109]:
admissions['14d'] = np.where(admissions.readmission_days <=14, 1, 0)
admissions['14d'].value_counts()

1    322456
0    202064
Name: 14d, dtype: int64

In [110]:
admissions['30d'] = np.where(admissions.readmission_days <=30, 1, 0)
admissions['30d'].value_counts()

1    351961
0    172559
Name: 30d, dtype: int64

In [111]:
admissions['er'] = np.where(admissions.admit_to_emergency == 1, 1, 0)
admissions['er'].value_counts()

0    278279
1    246241
Name: er, dtype: int64

In [145]:
#dropping unnecessary variables
admissions.drop(['heart_failure', 'admit_to_emergency', 'readmission_days'], inplace = True, axis = 1)

In [143]:
def print_stats(df, var_name, extra_info = False):
    print('--------------------Current cohort %s --------------------' %var_name)
    print('Number of patients: %d' %sum(admissions[var_name]))
    if extra_info:
        cohort = df[df[var_name] == 1]
        print('Gender Breakdown: \n%s' %str(cohort.gender.value_counts())+'\n')
        print('Insurance Breakdown: \n%s' %str(cohort.insurance.value_counts())+'\n')
        print('Ethnicity Breakdown: \n%s' %str(cohort.ethnicity.value_counts())+'\n')

In [147]:
#printing final statistics
patients = patients[['subject_id', 'gender']]
stats = admissions.merge(patients, on = ['subject_id'], how = 'left')
print("Total Number of patients %d: " %stats.shape[0])
for var in ['48h_hf', '14d_hf', '30d_hf', 'er_hf', '48h', '14d', '30d', 'er']:
    print_stats(stats, var, extra_info = True)

Total Number of patients 524520: 
--------------------Current cohort 48h_hf --------------------
Number of patients: 139856
Gender Breakdown: 
F    72189
M    67667
Name: gender, dtype: int64

Insurance Breakdown: 
Other       92254
Medicare    35137
Medicaid    12465
Name: insurance, dtype: int64

Ethnicity Breakdown: 
WHITE                            87527
BLACK/AFRICAN AMERICAN           17906
ASIAN                             8631
OTHER                             8476
UNKNOWN                           8074
HISPANIC/LATINO                   7320
UNABLE TO OBTAIN                  1408
AMERICAN INDIAN/ALASKA NATIVE      514
Name: ethnicity, dtype: int64

--------------------Current cohort 14d_hf --------------------
Number of patients: 161156
Gender Breakdown: 
F    82457
M    78699
Name: gender, dtype: int64

Insurance Breakdown: 
Other       102441
Medicare     43863
Medicaid     14852
Name: insurance, dtype: int64

Ethnicity Breakdown: 
WHITE                            102103
BLAC