In [1]:
import pandas as pd
import os
from tqdm.autonotebook import tqdm



In [2]:
df = pd.read_json("~/datasets/microbioevents/tokenized/microbioevents.json", orient = "records", dtype="object")

In [3]:
df.head()

Unnamed: 0,CHARTTIME,HADM_ID,SUBJECT_ID,microbioevents
0,2156-04-13 14:18:00,170324,96,"[BRONCHOALVEOLAR, LAVAGE, PSEUDOMONAS, AERUGIN..."
1,2156-04-20 13:10:00,170324,96,"[SPUTUM, Negative, Culture]"
2,2156-04-20 16:00:00,170324,96,"[BLOOD, CULTURE, Negative, Culture]"
3,2156-04-21 14:00:00,170324,96,"[SPUTUM, Negative, Culture]"
4,2196-09-27 00:00:00,175533,101,"[BLOOD, CULTURE, Negative, Culture]"


In [4]:
#drop records with no HADM_ID
df = df[df['HADM_ID']!='NaN']

In [5]:
df.info(verbose = True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 589954 entries, 0 to 589953
Data columns (total 4 columns):
CHARTTIME         589954 non-null object
HADM_ID           589954 non-null object
SUBJECT_ID        589954 non-null object
microbioevents    589954 non-null object
dtypes: object(4)
memory usage: 22.5+ MB


In [6]:
df_ICU = pd.read_csv("~/datasets/raw/ICUSTAYS.csv", dtype = "object")

In [7]:
df_ICU.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS
0,365,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,2198-02-18 05:26:11,3.249
1,366,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788
2,367,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939
3,368,271,173727,249196,carevue,MICU,SICU,52,23,2120-08-07 23:12:42,2120-08-10 00:39:04,2.06
4,369,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202


In [8]:
df_ICU.info(verbose = True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61532 entries, 0 to 61531
Data columns (total 12 columns):
ROW_ID            61532 non-null object
SUBJECT_ID        61532 non-null object
HADM_ID           61532 non-null object
ICUSTAY_ID        61532 non-null object
DBSOURCE          61532 non-null object
FIRST_CAREUNIT    61532 non-null object
LAST_CAREUNIT     61532 non-null object
FIRST_WARDID      61532 non-null object
LAST_WARDID       61532 non-null object
INTIME            61532 non-null object
OUTTIME           61522 non-null object
LOS               61522 non-null object
dtypes: object(12)
memory usage: 5.6+ MB


In [9]:
# Selecting list of HADM_ID with single ICUSTAY
df_single_ICU = df_ICU.groupby(by=['HADM_ID'], as_index=False).agg({'ICUSTAY_ID': 'count'})
df_single_ICU  = df_single_ICU[df_single_ICU['ICUSTAY_ID']==1]['HADM_ID']

In [10]:
#number of HADM_ID with single ICUSTAY
len(df_single_ICU)

54526

In [11]:
# Dropping HADM_ID with mutiple ICUSTAYS
df = df[df['HADM_ID'].isin(df_single_ICU)].reset_index(drop = True)

In [12]:
df.info(verbose = True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486697 entries, 0 to 486696
Data columns (total 4 columns):
CHARTTIME         486697 non-null object
HADM_ID           486697 non-null object
SUBJECT_ID        486697 non-null object
microbioevents    486697 non-null object
dtypes: object(4)
memory usage: 14.9+ MB


In [13]:
df = df.merge(df_ICU[['HADM_ID','INTIME']], how='left',left_on='HADM_ID', right_on='HADM_ID')

In [14]:
df = df.rename(columns={'INTIME':'ICU_INTIME'})

In [15]:
df.head()

Unnamed: 0,CHARTTIME,HADM_ID,SUBJECT_ID,microbioevents,ICU_INTIME
0,2156-04-13 14:18:00,170324,96,"[BRONCHOALVEOLAR, LAVAGE, PSEUDOMONAS, AERUGIN...",2156-03-31 16:11:34
1,2156-04-20 13:10:00,170324,96,"[SPUTUM, Negative, Culture]",2156-03-31 16:11:34
2,2156-04-20 16:00:00,170324,96,"[BLOOD, CULTURE, Negative, Culture]",2156-03-31 16:11:34
3,2156-04-21 14:00:00,170324,96,"[SPUTUM, Negative, Culture]",2156-03-31 16:11:34
4,2196-09-27 00:00:00,175533,101,"[BLOOD, CULTURE, Negative, Culture]",2196-09-26 18:37:40


In [16]:
df = df.fillna('NaN')

In [17]:
df = df.rename(columns={'CHARTTIME':'STORETIME'})

In [18]:
df['STORETIME'] = pd.to_datetime(df['STORETIME'], errors='coerce')
df['ICU_INTIME'] = pd.to_datetime(df['ICU_INTIME'], errors='coerce')

In [19]:
%%time
df['icu_tdelta'] = df['STORETIME'] - df['ICU_INTIME']
df['icu_tdelta'] = df['icu_tdelta'].map(lambda x: x.total_seconds())

CPU times: user 5.09 s, sys: 96 ms, total: 5.18 s
Wall time: 5.18 s


In [20]:
df['STORETIME'] = df['STORETIME'].map(lambda x: str(x))

In [21]:
df.head()

Unnamed: 0,STORETIME,HADM_ID,SUBJECT_ID,microbioevents,ICU_INTIME,icu_tdelta
0,2156-04-13 14:18:00,170324,96,"[BRONCHOALVEOLAR, LAVAGE, PSEUDOMONAS, AERUGIN...",2156-03-31 16:11:34,1116386.0
1,2156-04-20 13:10:00,170324,96,"[SPUTUM, Negative, Culture]",2156-03-31 16:11:34,1717106.0
2,2156-04-20 16:00:00,170324,96,"[BLOOD, CULTURE, Negative, Culture]",2156-03-31 16:11:34,1727306.0
3,2156-04-21 14:00:00,170324,96,"[SPUTUM, Negative, Culture]",2156-03-31 16:11:34,1806506.0
4,2196-09-27 00:00:00,175533,101,"[BLOOD, CULTURE, Negative, Culture]",2196-09-26 18:37:40,19340.0


In [22]:
sec_24hrs = 24*60*60 #total seconds in 24 hrs
sec_48hrs = 48*60*60 #total seconds in 48 hrs

In [23]:
print("Total Records: " + str(len(df)))
print("Total Records Prior to 24hrs in ICU: " +str(len(df[df['icu_tdelta']<sec_24hrs])))
print("Total Records Prior to 48hrs in ICU: " +str(len(df[df['icu_tdelta']<sec_48hrs])))

Total Records: 486697
Total Records Prior to 24hrs in ICU: 210616
Total Records Prior to 48hrs in ICU: 246668


In [24]:
# create output path
mypath_output = "/home/jupyter/datasets//data_before_24hrs_icu/"
import os
os.makedirs(mypath_output, exist_ok=True)

In [25]:
# extract data recorded in ans prior to first 2hhrs of ICU stay
df[df['icu_tdelta']<sec_24hrs].to_json(mypath_output+"microbioevents.json", orient = 'records')

In [26]:
# create output path
mypath_output = "/home/jupyter/datasets/data_before_48hrs_icu/"
import os
os.makedirs(mypath_output, exist_ok=True)

In [27]:
# extract data recorded in ans prior to first 2hhrs of ICU stay
df[df['icu_tdelta']<sec_48hrs].to_json(mypath_output+"microbioevents.json", orient = 'records')