In [1]:
import pandas as pd
import numpy as np
from dateutil import parser


In [5]:
base_dir = '/data'

In [10]:
metadata = pd.read_csv('data/mimic-cxr-2.0.0-metadata.csv.gz')
print(metadata.head())

                                       dicom_id  subject_id  study_id  \
0  02aa804e-bde0afdd-112c0b34-7bc16630-4e384014    10000032  50414267   
1  174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962    10000032  50414267   
2  2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab    10000032  53189527   
3  e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c    10000032  53189527   
4  68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714    10000032  53911762   

  PerformedProcedureStepDescription ViewPosition  Rows  Columns  StudyDate  \
0                CHEST (PA AND LAT)           PA  3056     2544   21800506   
1                CHEST (PA AND LAT)      LATERAL  3056     2544   21800506   
2                CHEST (PA AND LAT)           PA  3056     2544   21800626   
3                CHEST (PA AND LAT)      LATERAL  3056     2544   21800626   
4               CHEST (PORTABLE AP)           AP  2705     2539   21800723   

    StudyTime ProcedureCodeSequence_CodeMeaning ViewCodeSequence_CodeMeaning  \
0  213014.53

In [11]:


admissions = pd.read_csv('final_cohort_with_outcome_labels.csv')
print(admissions.head())

   Unnamed: 0  subject_id   hadm_id            admittime            dischtime  \
0           0    10000980  24947999  2190-11-06 20:57:00  2190-11-08 15:58:00   
1           1    10000980  25911675  2191-05-23 15:33:00  2191-05-24 17:14:00   
2           2    10000980  29659838  2191-07-16 14:21:00  2191-07-19 13:03:00   
3           3    10000980  20897796  2193-08-15 01:01:00  2193-08-17 15:07:00   
4           4    10002131  24065018  2128-03-17 14:53:00  2128-03-19 16:25:00   

  deathtime     admission_type     admission_location discharge_location  \
0         0           EW EMER.         EMERGENCY ROOM   HOME HEALTH CARE   
1         0           EW EMER.         EMERGENCY ROOM   HOME HEALTH CARE   
2         0           EW EMER.         EMERGENCY ROOM   HOME HEALTH CARE   
3         0  OBSERVATION ADMIT  WALK-IN/SELF REFERRAL   HOME HEALTH CARE   
4         0           EW EMER.         EMERGENCY ROOM            HOSPICE   

  insurance  ... heart_failure readmission_num 48h_hf 14

In [12]:
# convert StudyDate + StudyTime to datetime objects in 'StudyDateTime'
metadata['StudyTime'] = metadata['StudyTime'].astype(str)
metadata['StudyTime'] = metadata['StudyTime'].apply(lambda t: '0'*(6-t.index('.')) + t)

metadata['StudyDateTime'] = metadata['StudyDate'].astype(str) + 'T' + metadata['StudyTime'].astype(str)
metadata['StudyDateTime'] = metadata['StudyDateTime'].apply(lambda date: parser.parse(date))
print("study time modified")

# ### Extract admissions timestamps 

# In[106]:


# Convert date strings to datetime objects
admissions['admittime'] = admissions['admittime'].map(parser.parse)
admissions['dischtime'] = admissions['dischtime'].map(parser.parse)


# ### Link cxr to hadm (only last study captured for each hadm)

# In[119]:


# isolate cohort admissions
#admissions = admissions[admissions['subject_id'].isin(cohort['subject_id'].unique())]


# In[121]:


len(admissions)


# In[155]:


admissions['last_study_id'] = None
admissions['last_dicom_id'] = None
admissions['last_study_time'] = None


# In[156]:


# add last study_id/dicom_id to corresponding hadm entries
print(len(metadata.index))


study time modified
377110


In [13]:
for entry in metadata.index:
    meta_row = metadata.loc[entry]
    subject_id = meta_row['subject_id']
    time = meta_row['StudyDateTime']
    subject_adm = admissions[admissions['subject_id'] == subject_id]
    
    for adm_entry in subject_adm.index:
        adm_row = subject_adm.loc[adm_entry]
        if adm_row['admittime'] < time < adm_row['dischtime']:
            if adm_row['last_study_time'] == None or adm_row['last_study_time'] < time:
                admissions.at[adm_entry,'last_study_id'] = meta_row['study_id']
                admissions.at[adm_entry,'last_dicom_id'] = meta_row['dicom_id']
                admissions.at[adm_entry,'last_study_time'] = time

# drop entries without corresponding dicom/study entries
adm_with_cxr = admissions.drop(admissions[admissions['last_study_id'].isnull()].index)
print(adm_with_cxr.columns)

Index(['Unnamed: 0', 'subject_id', 'hadm_id', 'admittime', 'dischtime',
       'deathtime', 'admission_type', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status',
       'ethnicity', 'edregtime', 'edouttime', 'hospital_expire_flag',
       'chronic_dialysis', 'heart_failure', 'readmission_num', '48h_hf',
       '14d_hf', '30d_hf', 'er_hf', '48h', '14d', '30d', 'er', 'last_study_id',
       'last_dicom_id', 'last_study_time'],
      dtype='object')


In [14]:
# drop unnecessary columns
adm_with_cxr.drop(columns=['admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'ethnicity', 'hospital_expire_flag'], inplace=True)

adm_with_cxr.to_csv('final_cohort_with_imageids.csv')
len(adm_with_cxr)
print(adm_with_cxr.head())


    Unnamed: 0  subject_id   hadm_id           admittime           dischtime  \
7            7    10003400  20214994 2137-02-24 10:00:00 2137-03-19 15:45:00   
8            8    10003502  29011269 2169-08-26 16:14:00 2169-08-28 15:20:00   
16          16    10011938  22624746 2128-01-02 21:04:00 2128-01-11 16:01:00   
17          17    10011938  23501236 2128-02-11 16:11:00 2128-02-26 16:29:00   
18          18    10012853  26369609 2175-04-05 15:36:00 2175-04-10 16:55:00   

   deathtime            edregtime            edouttime  chronic_dialysis  \
7          0                    0                    0             False   
8          0  2169-08-26 12:31:00  2169-08-26 22:11:00             False   
16         0                    0                    0             False   
17         0  2128-02-11 12:35:00  2128-02-11 17:34:00             False   
18         0  2175-04-05 06:22:00  2175-04-05 17:10:00             False   

    heart_failure  ...  14d_hf  30d_hf  er_hf  48h  14d  30d  

In [15]:
pip install Pillow

Note: you may need to restart the kernel to use updated packages.


In [16]:
from PIL import Image

In [17]:
im1 = Image.open(r'02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg')
im1.save(r'02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.png')

In [18]:
class MimicID:
	subject_id = ''
	study_id = ''
	dicom_id = ''

	def __init__(self, subject_id, study_id, dicom_id):
		self.subject_id = str(subject_id)
		self.study_id = str(study_id)
		self.dicom_id = str(dicom_id)

	def __str__(self):
		return f"p{self.subject_id}_s{self.study_id}_{self.dicom_id}"


In [32]:
#adm_with_cxr['subject_id'].iloc[0], adm_with_cxr['study_id'].iloc[0],adm_with_cxr['dicom_id'].iloc[0]

In [30]:
adm_with_cxr['mimic_id'] = adm_with_cxr.apply(lambda row: \
            MimicID(row['subject_id'], row['last_study_id'], row['last_dicom_id']), axis=1)

In [31]:
adm_with_cxr.head()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,edregtime,edouttime,chronic_dialysis,heart_failure,...,30d_hf,er_hf,48h,14d,30d,er,last_study_id,last_dicom_id,last_study_time,mimic_id
7,7,10003400,20214994,2137-02-24 10:00:00,2137-03-19 15:45:00,0,0,0,False,1,...,0,0,0,0,0,0,52437868,2f6a5fc9-40af95f8-d8762332-f56005ea-b5f85cc2,2137-03-09 20:29:58.453000,p10003400_s52437868_2f6a5fc9-40af95f8-d8762332...
8,8,10003502,29011269,2169-08-26 16:14:00,2169-08-28 15:20:00,0,2169-08-26 12:31:00,2169-08-26 22:11:00,False,1,...,0,0,0,0,0,0,50084553,70d7e600-373c1311-929f5ff9-23ee3621-ff551ff9,2169-08-27 08:16:56.250000,p10003502_s50084553_70d7e600-373c1311-929f5ff9...
16,16,10011938,22624746,2128-01-02 21:04:00,2128-01-11 16:01:00,0,0,0,False,1,...,0,1,0,0,0,1,56362279,c6eed867-d6efb38a-438501f9-9d2506e9-a0c958f8,2128-01-03 09:36:34,p10011938_s56362279_c6eed867-d6efb38a-438501f9...
17,17,10011938,23501236,2128-02-11 16:11:00,2128-02-26 16:29:00,0,2128-02-11 12:35:00,2128-02-11 17:34:00,False,1,...,0,0,0,0,0,1,51895247,bf724128-9131f33a-6fd065d5-19041750-9e7f8707,2128-02-16 09:07:57.093000,p10011938_s51895247_bf724128-9131f33a-6fd065d5...
18,18,10012853,26369609,2175-04-05 15:36:00,2175-04-10 16:55:00,0,2175-04-05 06:22:00,2175-04-05 17:10:00,False,1,...,0,0,0,0,0,1,58181999,aac02704-c647b84c-58d5df12-c9857852-e1536bba,2175-04-07 13:18:41.031000,p10012853_s58181999_aac02704-c647b84c-58d5df12...


In [33]:
adm_with_cxr['mimic_id'].iloc[0]

<__main__.MimicID at 0x7fda259c0940>

In [34]:
adm_with_cxr.to_csv('final_cohort_with_mimicids.csv')