In [7]:
import os

import numpy as np
import pandas as pd
from dateutil import parser

## Load data

In [100]:
# Root folder that contains the mimic-cxr-reports, mimic-iv-1.0 and
# ml4h_chf_readmissions sub-folders read by the cells below.
# Set the ML4H_BASE_DIR environment variable to run the notebook on another
# machine; the original local path is kept as the fallback so existing runs
# behave exactly as before.
base_dir = os.environ.get(
    'ML4H_BASE_DIR',
    'C:/Users/suzie/Dropbox (MIT)/Spring 2021/6.871 Machine Learning for Healthcare',
)

In [102]:
# MIMIC-CXR image-level metadata (one row per DICOM image)
metadata_csv = f'{base_dir}/mimic-cxr-reports/mimic-cxr-2.0.0-metadata.csv'
metadata = pd.read_csv(metadata_csv)
metadata.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


In [104]:
# MIMIC-IV hospital admissions table (one row per hadm_id)
admissions_csv = f'{base_dir}/mimic-iv-1.0/core/admissions.csv'
admissions = pd.read_csv(admissions_csv, low_memory=False)
admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag
0,14679932,21038362,2139-09-26 14:16:00,2139-09-28 11:30:00,,ELECTIVE,,HOME,Other,ENGLISH,SINGLE,UNKNOWN,,,0
1,15585972,24941086,2123-10-07 23:56:00,2123-10-12 11:22:00,,ELECTIVE,,HOME,Other,ENGLISH,,WHITE,,,0
2,11989120,21965160,2147-01-14 09:00:00,2147-01-17 14:25:00,,ELECTIVE,,HOME,Other,ENGLISH,,UNKNOWN,,,0
3,17817079,24709883,2165-12-27 17:33:00,2165-12-31 21:18:00,,ELECTIVE,,HOME,Other,ENGLISH,,OTHER,,,0
4,15078341,23272159,2122-08-28 08:48:00,2122-08-30 12:32:00,,ELECTIVE,,HOME,Other,ENGLISH,,BLACK/AFRICAN AMERICAN,,,0


In [118]:
# Phase-1 cohort: subject_id / hadm_id pairs with their label columns
cohort_csv = f'{base_dir}/ml4h_chf_readmissions/phase1_teamA/final_cohort.csv'
cohort = pd.read_csv(cohort_csv)
cohort.head()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,chronic_dialysis,CHF exacerbation
0,0,11603789,25552978,False,False
1,5,16809467,29803749,False,False
2,10,11147319,25941345,False,False
3,14,12156531,29321418,False,False
4,19,14528388,29429446,False,False


## Preprocessing

### Extract cxr timestamps

In [110]:
# convert StudyDate + StudyTime to datetime objects in 'StudyDateTime'
#
# StudyTime is loaded as a number (e.g. 80556.875 for 08:05:56.875), so the
# leading zero of the hour is lost. Split on the decimal point and zero-pad the
# whole-second part back to six digits (hhmmss).
time_parts = metadata['StudyTime'].astype(str).str.partition('.')
hhmmss = time_parts[0].str.zfill(6)
# Values without a fractional part get an explicit ".0" so every row matches
# the single format string used below (the old `t.index('.')` raised on them).
fraction = time_parts[2].where(time_parts[2] != '', '0')
metadata['StudyTime'] = hhmmss + '.' + fraction

# One vectorised parse with an explicit format instead of calling dateutil's
# format-guessing parser once per row.
metadata['StudyDateTime'] = pd.to_datetime(
    metadata['StudyDate'].astype(str) + 'T' + metadata['StudyTime'],
    format='%Y%m%dT%H%M%S.%f',
)

### Extract admissions timestamps 

In [106]:
# Convert date strings to datetime objects
# pd.to_datetime parses the whole column at once and leaves an already-parsed
# column unchanged, so this cell can be re-run safely
# (parser.parse raises TypeError when handed a Timestamp).
for time_col in ('admittime', 'dischtime'):
    admissions[time_col] = pd.to_datetime(admissions[time_col])

### Link CXR studies to hospital admissions (only the last study is kept for each hadm_id)

In [119]:
# isolate cohort admissions
# NOTE(review): the filter is on subject_id, so every admission of a cohort
# subject is kept, not only the hadm_id rows listed in `cohort` — confirm this
# is intended.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
admissions = admissions[admissions['subject_id'].isin(cohort['subject_id'])].copy()

In [121]:
# number of admission rows remaining after the cohort filter
admissions.shape[0]

474917

In [155]:
# Placeholder columns for the linked chest x-ray; None marks "no study found yet".
for new_col in ('last_study_id', 'last_dicom_id', 'last_study_time'):
    admissions[new_col] = None

In [156]:
# add last study_id/dicom_id to corresponding hadm entries
#
# For every admission, pick the most recent image whose StudyDateTime lies
# strictly between admittime and dischtime. This is done with one merge instead
# of a Python loop that re-filtered `admissions` for every metadata row.

# Remember each image's position in `metadata`: when several images share the
# latest timestamp, the one that appears first in `metadata` wins, which is the
# same tie-break the row-by-row loop produced.
cxr = metadata[['subject_id', 'study_id', 'dicom_id', 'StudyDateTime']].copy()
cxr['meta_pos'] = np.arange(len(cxr))

# Carry the admissions index along as a column so results can be written back
# to the right rows by label.
stays = admissions[['subject_id', 'admittime', 'dischtime']].copy()
stays['adm_entry'] = stays.index

# Every (image, admission) pair that belongs to the same subject.
pairs = cxr.merge(stays, on='subject_id', how='inner')

# Keep images taken strictly inside the stay (missing timestamps compare False).
in_stay = pairs[
    (pairs['admittime'] < pairs['StudyDateTime'])
    & (pairs['StudyDateTime'] < pairs['dischtime'])
]

# Latest image per admission: newest StudyDateTime first, earliest metadata row
# first among equal timestamps, then keep the top row of each admission.
last_cxr = (
    in_stay
    .sort_values(['adm_entry', 'StudyDateTime', 'meta_pos'],
                 ascending=[True, False, True])
    .drop_duplicates(subset='adm_entry', keep='first')
    .set_index('adm_entry')
)

# Write back aligned on the admissions index. Admissions without an in-stay
# image stay null, so the later `.isnull()` filter keeps working.
# The nullable Int64 dtype keeps study ids as integers despite the missing rows.
admissions['last_study_id'] = last_cxr['study_id'].astype('Int64').reindex(admissions.index)
admissions['last_dicom_id'] = last_cxr['dicom_id'].reindex(admissions.index)
admissions['last_study_time'] = last_cxr['StudyDateTime'].reindex(admissions.index)
    

In [171]:
# keep only the admissions that were matched to a study/dicom
adm_with_cxr = admissions.dropna(subset=['last_study_id'])

In [175]:
# drop unnecessary columns
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError once the columns are gone.
unused_columns = [
    'admission_type',
    'admission_location',
    'discharge_location',
    'insurance',
    'language',
    'marital_status',
    'ethnicity',
    'hospital_expire_flag',
]
adm_with_cxr = adm_with_cxr.drop(columns=unused_columns, errors='ignore')

In [173]:
# number of admissions that have at least one linked chest x-ray
adm_with_cxr.shape[0]

42191

In [176]:
# preview the first five linked admissions
adm_with_cxr.head(n=5)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,edregtime,edouttime,last_study_id,last_dicom_id,last_study_time
7047,11407769,22195798,2188-06-27 10:31:00,2188-06-28 23:37:00,,,,55011670,0fc1bbed-4e79747c-546c598f-3a3a6fbf-facf88f1,2188-06-28 07:48:16.171000
7617,16434858,21025121,2125-05-07 22:43:00,2125-05-08 17:48:00,,2125-05-07 21:08:00,2125-05-08 17:48:00,57256972,9389b3ec-1093fc25-734a494f-31f75223-b75e6046,2125-05-07 22:44:09.578000
7619,10877472,27957509,2128-12-30 21:49:00,2128-12-31 11:35:00,,2128-12-30 18:04:00,2128-12-30 23:31:00,54371812,512bfb44-bea6b099-932c0ff0-e08b6a9a-171c4681,2128-12-31 09:22:15
7650,12728714,25858882,2183-02-10 04:11:00,2183-02-10 10:15:00,,2183-02-09 23:18:00,2183-02-10 10:15:00,57947040,41109e8c-1d4183df-bd529840-77a484f7-823f2927,2183-02-10 04:57:14.265000
7651,19618308,21917069,2173-10-18 14:22:00,2173-10-22 20:00:00,2173-10-22 04:23:00,2173-10-18 08:55:00,2173-10-18 15:43:00,52949364,596b8b5c-da4e31c8-99ed3290-ac86e7ee-00774e60,2173-10-21 09:10:14.640000
