In [1]:
import pandas as pd
import os
from tqdm.autonotebook import tqdm



In [2]:
df = pd.read_json("~/datasets/labevents/tokenized/labevents.json", orient = "records", dtype="object")

In [5]:
df.head()

Unnamed: 0,CHARTTIME,HADM_ID,SUBJECT_ID,labevents
160,2101-10-20 16:40:00,145834,3,"[Anion, Gap, Blood, Chemistry, 50868-17-mEq/L]"
161,2101-10-20 16:40:00,145834,3,"[Bicarbonate, Blood, Chemistry, 50882-25-mEq/L]"
162,2101-10-20 16:40:00,145834,3,"[Calcium,, Total, Blood, Chemistry, 50893-8.2-..."
163,2101-10-20 16:40:00,145834,3,"[Chloride, Blood, Chemistry, 50902-99-mEq/L, a..."
164,2101-10-20 16:40:00,145834,3,"[Creatine, Kinase, (CK), Blood, Chemistry, 509..."


In [6]:
#drop records with no HADM_ID
df = df[df['HADM_ID']!='NaN']

In [7]:
df.info(verbose = True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22245034 entries, 160 to 27854054
Data columns (total 4 columns):
CHARTTIME     22245034 non-null object
HADM_ID       22245034 non-null object
SUBJECT_ID    22245034 non-null object
labevents     22245034 non-null object
dtypes: object(4)
memory usage: 848.6+ MB


In [8]:
df_ICU = pd.read_csv("~/datasets/raw/ICUSTAYS.csv", dtype = "object")

In [9]:
df_ICU.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS
0,365,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,2198-02-18 05:26:11,3.249
1,366,269,106296,206613,carevue,MICU,MICU,52,52,2170-11-05 11:05:29,2170-11-08 17:46:57,3.2788
2,367,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939
3,368,271,173727,249196,carevue,MICU,SICU,52,23,2120-08-07 23:12:42,2120-08-10 00:39:04,2.06
4,369,272,164716,210407,carevue,CCU,CCU,57,57,2186-12-25 21:08:04,2186-12-27 12:01:13,1.6202


In [10]:
df_ICU.info(verbose = True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61532 entries, 0 to 61531
Data columns (total 12 columns):
ROW_ID            61532 non-null object
SUBJECT_ID        61532 non-null object
HADM_ID           61532 non-null object
ICUSTAY_ID        61532 non-null object
DBSOURCE          61532 non-null object
FIRST_CAREUNIT    61532 non-null object
LAST_CAREUNIT     61532 non-null object
FIRST_WARDID      61532 non-null object
LAST_WARDID       61532 non-null object
INTIME            61532 non-null object
OUTTIME           61522 non-null object
LOS               61522 non-null object
dtypes: object(12)
memory usage: 5.6+ MB


In [11]:
# Selecting list of HADM_ID with single ICUSTAY
df_single_ICU = df_ICU.groupby(by=['HADM_ID'], as_index=False).agg({'ICUSTAY_ID': 'count'})
df_single_ICU  = df_single_ICU[df_single_ICU['ICUSTAY_ID']==1]['HADM_ID']

In [12]:
#number of HADM_ID with single ICUSTAY
len(df_single_ICU)

54526

In [13]:
# Dropping HADM_ID with mutiple ICUSTAYS
df = df[df['HADM_ID'].isin(df_single_ICU)].reset_index(drop = True)

In [14]:
df.info(verbose = True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18677080 entries, 0 to 18677079
Data columns (total 4 columns):
CHARTTIME     18677080 non-null object
HADM_ID       18677080 non-null object
SUBJECT_ID    18677080 non-null object
labevents     18677080 non-null object
dtypes: object(4)
memory usage: 570.0+ MB


In [15]:
df = df.merge(df_ICU[['HADM_ID','INTIME']], how='left',left_on='HADM_ID', right_on='HADM_ID')

In [16]:
df = df.rename(columns={'INTIME':'ICU_INTIME'})

In [17]:
df.head()

Unnamed: 0,CHARTTIME,HADM_ID,SUBJECT_ID,labevents,ICU_INTIME
0,2101-10-20 16:40:00,145834,3,"[Anion, Gap, Blood, Chemistry, 50868-17-mEq/L]",2101-10-20 19:10:11
1,2101-10-20 16:40:00,145834,3,"[Bicarbonate, Blood, Chemistry, 50882-25-mEq/L]",2101-10-20 19:10:11
2,2101-10-20 16:40:00,145834,3,"[Calcium,, Total, Blood, Chemistry, 50893-8.2-...",2101-10-20 19:10:11
3,2101-10-20 16:40:00,145834,3,"[Chloride, Blood, Chemistry, 50902-99-mEq/L, a...",2101-10-20 19:10:11
4,2101-10-20 16:40:00,145834,3,"[Creatine, Kinase, (CK), Blood, Chemistry, 509...",2101-10-20 19:10:11


In [18]:
df = df.fillna('NaN')

In [19]:
df = df.rename(columns={'CHARTTIME':'STORETIME'})

In [20]:
df['STORETIME'] = pd.to_datetime(df['STORETIME'], errors='coerce')
df['ICU_INTIME'] = pd.to_datetime(df['ICU_INTIME'], errors='coerce')

In [21]:
%%time
df['icu_tdelta'] = df['STORETIME'] - df['ICU_INTIME'] 
df['icu_tdelta'] = df['icu_tdelta'].map(lambda x: x.total_seconds())

CPU times: user 2min 51s, sys: 4.74 s, total: 2min 56s
Wall time: 2min 56s


In [22]:
df['STORETIME'] = df['STORETIME'].map(lambda x: str(x))

In [23]:
df.head()

Unnamed: 0,STORETIME,HADM_ID,SUBJECT_ID,labevents,ICU_INTIME,icu_tdelta
0,2101-10-20 16:40:00,145834,3,"[Anion, Gap, Blood, Chemistry, 50868-17-mEq/L]",2101-10-20 19:10:11,-9011.0
1,2101-10-20 16:40:00,145834,3,"[Bicarbonate, Blood, Chemistry, 50882-25-mEq/L]",2101-10-20 19:10:11,-9011.0
2,2101-10-20 16:40:00,145834,3,"[Calcium,, Total, Blood, Chemistry, 50893-8.2-...",2101-10-20 19:10:11,-9011.0
3,2101-10-20 16:40:00,145834,3,"[Chloride, Blood, Chemistry, 50902-99-mEq/L, a...",2101-10-20 19:10:11,-9011.0
4,2101-10-20 16:40:00,145834,3,"[Creatine, Kinase, (CK), Blood, Chemistry, 509...",2101-10-20 19:10:11,-9011.0


In [24]:
sec_24hrs = 24*60*60 #total seconds in 24 hrs
sec_48hrs = 48*60*60 #total seconds in 48 hrs

In [25]:
print("Total Records: " + str(len(df)))
print("Total Records Prior to 24hrs in ICU: " +str(len(df[df['icu_tdelta']<sec_24hrs])))
print("Total Records Prior to 48hrs in ICU: " +str(len(df[df['icu_tdelta']<sec_48hrs])))

Total Records: 18677080
Total Records Prior to 24hrs in ICU: 7401015
Total Records Prior to 48hrs in ICU: 9306358


In [26]:
# create output path
mypath_output = "/home/jupyter/datasets/data_before_24hrs_icu/"
import os
os.makedirs(mypath_output, exist_ok=True)

In [27]:
# extract data recorded in ans prior to first 24hrs of ICU stay
df[df['icu_tdelta']<sec_24hrs].to_json(mypath_output+"labevents.json", orient = 'records')

In [28]:
# create output path
mypath_output = "/home/jupyter/datasets/data_before_48hrs_icu/"
import os
os.makedirs(mypath_output, exist_ok=True)

In [29]:
# extract data recorded in ans prior to first 48hrs of ICU stay
df[df['icu_tdelta']<sec_48hrs].to_json(mypath_output+"labevents.json", orient = 'records')