In [51]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm.autonotebook import tqdm

In [52]:
# Set input path to raw data
mypath_input = "../datasets/chartevents/ch_events_first_24_hours_ICUSTAY/"

In [53]:
# Select only files with .csv suffix
ch_chunks = list(filter(lambda k: '.csv' in k, os.listdir(mypath_input)))

In [54]:
# check the list of files
ch_chunks

['ch_events_24H_icu_000000000009.csv',
 'ch_events_24H_icu_000000000008.csv',
 'ch_events_24H_icu_000000000004.csv',
 'ch_events_24H_icu_000000000006.csv',
 'ch_events_24H_icu_000000000002.csv',
 'ch_events_24H_icu_000000000003.csv',
 'ch_events_24H_icu_000000000001.csv',
 'ch_events_24H_icu_000000000007.csv',
 'ch_events_24H_icu_000000000005.csv',
 'ch_events_24H_icu_000000000000.csv']

In [55]:
#import episode file

df =pd.read_csv(mypath_input + ch_chunks[0], 
                          dtype={'ROW_ID':'int32',
                                'SUBJECT_ID':'int32',
                                'HADM_ID':'int32',
                                'ICUSTAY_ID': 'object',
                                'ITEMID': 'int32',
                                #'CHARTTIME':'datetime64[ns]',
                                #'STORETIME':'datetime64[ns]',
                                'CGID': 'object',
                                'VALUE':'object',
                                'VALUENUM':'object',
                                'VALUEUOM': 'object',
                                'WARNING':'object',
                                'ERROR':'object',
                                'RESULTSTATUS':'object',
                                'STOPPED':'object',
                                'HOSPITAL_EXPIRE_FLAG': 'int8',
                                 'icu_tdelta':'int'}, 
                          parse_dates = ['CHARTTIME','STORETIME','ICU_INTIME'])


In [56]:
# fill missing values in all columns with 'NaN'
df = df.fillna('NaN')

In [57]:
df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,ICU_INTIME,HOSPITAL_EXPIRE_FLAG,icu_tdelta
0,93142551,11462,111499,206268,3634,2129-01-07 00:30:00,2129-01-07 02:36:00,17679,,,,,,,NotStopd,2129-01-06 15:53:00,0,643
1,106620567,14185,128357,292549,1087,2131-04-03 10:00:00,2131-04-03 10:53:00,15674,Non-verbal cues,,,,,,NotStopd,2131-04-02 11:27:22,0,1406
2,43746577,1809,103739,277602,94,2194-05-29 23:00:00,2194-05-30 00:14:00,19279,Serosanguinous,,,,,,NotStopd,2194-05-29 07:41:01,0,993
3,45118454,2111,168493,275243,417,2100-09-25 19:50:00,2100-09-25 19:41:00,19889,,,Deg. C,,,,NotStopd,2100-09-24 20:10:09,0,1411
4,172599465,27501,165524,267414,663,2106-07-19 15:00:00,2106-07-19 15:18:00,17600,Spouse,,,,,,NotStopd,2106-07-19 10:41:20,0,277
5,304549673,73409,161287,216410,224080,2117-04-17 03:00:00,2117-04-17 03:27:00,17062,30 Degrees,,,0,0,,,2117-04-16 18:19:23,0,548
6,190166645,32081,148171,219631,31,2150-04-12 15:00:00,2150-04-12 17:26:00,14441,Bedrest,,,,,,NotStopd,2150-04-12 08:00:32,0,566
7,146353035,22129,178957,295153,674,2147-07-25 11:00:00,2147-07-25 11:12:00,19528,Oral,,,,,,NotStopd,2147-07-24 20:41:21,0,871
8,60459192,5093,144709,206425,140,2112-10-31 21:05:00,2112-10-31 21:10:00,19889,No,,,,,,NotStopd,2112-10-31 08:45:56,0,745
9,183069736,30153,176169,265082,722,2176-04-06 15:10:00,2176-04-06 15:12:00,21319,Drager,,,,,,NotStopd,2176-04-05 18:39:54,0,1233


In [59]:
# import D_ITEMS file for extracting labels 
df_items = pd.read_csv("../datasets/raw/D_ITEMS.csv")

In [60]:
# Merge CHARTEVENTS and D_ITEM
df = df.merge(df_items[['ITEMID','LABEL']], left_on='ITEMID',right_on='ITEMID')

In [61]:
# Convert each element in VALUE, VALUENUM, VALUEUOM into string
df['VALUENUM'] = df['VALUENUM'].map(lambda x: str(x))
df['VALUE'] = df['VALUE'].map(lambda x: str(x))
df['VALUEUOM'] = df['VALUEUOM'].map(lambda x: str(x))

In [62]:
# Concatenate VALUENUM and VALUEUOM
df['VALUE_NUM_UOM'] = df['VALUENUM']+'-'+df['VALUEUOM']

In [63]:
# Drop all records with missing STORETIME
df = df.dropna(subset=['STORETIME'])

In [64]:
df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,ICU_INTIME,HOSPITAL_EXPIRE_FLAG,icu_tdelta,LABEL,VALUE_NUM_UOM
0,93142551,11462,111499,206268,3634,2129-01-07 00:30:00,2129-01-07 02:36:00,17679,,,,,,,NotStopd,2129-01-06 15:53:00,0,643,Stress,NaN-NaN
1,149331803,22813,112931,213230,3634,2175-06-08 09:00:00,2175-06-08 15:31:00,19271,,,,,,,NotStopd,2175-06-08 10:06:51,0,325,Stress,NaN-NaN
2,114331044,15743,112419,280782,3634,2122-01-17 12:00:00,2122-01-17 12:41:00,17949,,,,,,,NotStopd,2122-01-16 14:05:47,0,1356,Stress,NaN-NaN
3,55605288,4119,139557,202111,3634,2193-04-11 00:00:00,2193-04-11 01:33:00,14872,,,,,,,NotStopd,2193-04-10 19:35:00,0,358,Stress,NaN-NaN
4,177717990,28766,158478,233423,3634,2156-02-25 13:00:00,2156-02-25 13:33:00,15242,,,,,,,NotStopd,2156-02-25 06:50:41,0,403,Stress,NaN-NaN
5,97689602,12354,142059,232691,3634,2119-08-24 00:00:00,2119-08-24 00:17:00,15312,,,,,,,NotStopd,2119-08-23 15:15:13,0,542,Stress,NaN-NaN
6,87292076,10285,172811,270256,3634,2134-03-09 16:00:00,2134-03-09 16:09:00,14282,,,,,,,NotStopd,2134-03-09 12:03:00,0,246,Stress,NaN-NaN
7,103318281,13527,140508,288975,3634,2179-03-24 22:00:00,2179-03-24 22:00:00,14098,,,,,,,NotStopd,2179-03-24 09:30:02,0,750,Stress,NaN-NaN
8,162284247,25427,102097,200704,3634,2158-03-21 11:00:00,2158-03-21 13:07:00,14489,,,,,,,NotStopd,2158-03-21 05:44:29,0,443,Stress,NaN-NaN
9,100534208,12884,162732,208468,3634,2134-04-15 07:00:00,2134-04-15 07:02:00,15522,,,,,,,NotStopd,2134-04-15 01:33:17,0,329,Stress,NaN-NaN


In [65]:
# if VALUENUM and VALUEUOM are not missing then replace VALUE with VALUE_NUM_UOM 
df.loc[(df['VALUENUM'] != 'NaN') & (df['VALUEUOM'] != 'NaN'),['VALUE']] = df['VALUE_NUM_UOM']

# if VALUE is missing and VALUENUM is not missing then replace VALUE with VALUENUM  
df.loc[(df['VALUE'] == 'NaN' ) & (df['VALUENUM'] != 'NaN'),['VALUE']] = df['VALUENUM']


In [69]:
# Concatenate ITEMID with 
df['ITEMID'] = df['ITEMID'].map(lambda x: str(x))
df['VALUE'] = df['ITEMID']+'-'+df['VALUE']

In [70]:
df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED,ICU_INTIME,HOSPITAL_EXPIRE_FLAG,icu_tdelta,LABEL,VALUE_NUM_UOM
0,93142551,11462,111499,206268,3634,2129-01-07 00:30:00,2129-01-07 02:36:00,17679,3634-None,,,,,,NotStopd,2129-01-06 15:53:00,0,643,Stress,NaN-NaN
1,149331803,22813,112931,213230,3634,2175-06-08 09:00:00,2175-06-08 15:31:00,19271,3634-None,,,,,,NotStopd,2175-06-08 10:06:51,0,325,Stress,NaN-NaN
2,114331044,15743,112419,280782,3634,2122-01-17 12:00:00,2122-01-17 12:41:00,17949,3634-None,,,,,,NotStopd,2122-01-16 14:05:47,0,1356,Stress,NaN-NaN
3,55605288,4119,139557,202111,3634,2193-04-11 00:00:00,2193-04-11 01:33:00,14872,3634-None,,,,,,NotStopd,2193-04-10 19:35:00,0,358,Stress,NaN-NaN
4,177717990,28766,158478,233423,3634,2156-02-25 13:00:00,2156-02-25 13:33:00,15242,3634-None,,,,,,NotStopd,2156-02-25 06:50:41,0,403,Stress,NaN-NaN


In [71]:
# Drop Columns
df = df.drop(labels=['ROW_ID','CHARTTIME','CGID','VALUENUM','VALUEUOM','WARNING','ERROR','RESULTSTATUS','STOPPED','ICU_INTIME','VALUE_NUM_UOM','ITEMID'],axis='columns')

In [72]:
# drop duplicate rows
df = df.drop_duplicates()

In [73]:
df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STORETIME,VALUE,HOSPITAL_EXPIRE_FLAG,icu_tdelta,LABEL
0,11462,111499,206268,2129-01-07 02:36:00,3634-None,0,643,Stress
1,22813,112931,213230,2175-06-08 15:31:00,3634-None,0,325,Stress
2,15743,112419,280782,2122-01-17 12:41:00,3634-None,0,1356,Stress
3,4119,139557,202111,2193-04-11 01:33:00,3634-None,0,358,Stress
4,28766,158478,233423,2156-02-25 13:33:00,3634-None,0,403,Stress
5,12354,142059,232691,2119-08-24 00:17:00,3634-None,0,542,Stress
6,10285,172811,270256,2134-03-09 16:09:00,3634-None,0,246,Stress
7,13527,140508,288975,2179-03-24 22:00:00,3634-None,0,750,Stress
8,25427,102097,200704,2158-03-21 13:07:00,3634-None,0,443,Stress
9,12884,162732,208468,2134-04-15 07:02:00,3634-None,0,329,Stress


In [74]:
# Remove special charecters from columns LABEL
df['LABEL'] = df['LABEL'].str.replace('[!"#$%&()*+,;<=>?@[\]^_`{|}~\n-]', ' ')

In [75]:
# Split elements in LABEL on whitespace
df['LABEL'] = df['LABEL'].map(lambda x: x.split())

In [76]:
df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STORETIME,VALUE,HOSPITAL_EXPIRE_FLAG,icu_tdelta,LABEL
0,11462,111499,206268,2129-01-07 02:36:00,3634-None,0,643,[Stress]
1,22813,112931,213230,2175-06-08 15:31:00,3634-None,0,325,[Stress]
2,15743,112419,280782,2122-01-17 12:41:00,3634-None,0,1356,[Stress]
3,4119,139557,202111,2193-04-11 01:33:00,3634-None,0,358,[Stress]
4,28766,158478,233423,2156-02-25 13:33:00,3634-None,0,403,[Stress]
5,12354,142059,232691,2119-08-24 00:17:00,3634-None,0,542,[Stress]
6,10285,172811,270256,2134-03-09 16:09:00,3634-None,0,246,[Stress]
7,13527,140508,288975,2179-03-24 22:00:00,3634-None,0,750,[Stress]
8,25427,102097,200704,2158-03-21 13:07:00,3634-None,0,443,[Stress]
9,12884,162732,208468,2134-04-15 07:02:00,3634-None,0,329,[Stress]


In [77]:
# Convert to lists
df['VALUE'] = df['VALUE'].map(lambda x: [x])

In [78]:
# Add columns
df['event'] = df['LABEL']+df['VALUE']

In [79]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STORETIME,VALUE,HOSPITAL_EXPIRE_FLAG,icu_tdelta,LABEL,event
0,11462,111499,206268,2129-01-07 02:36:00,[3634-None],0,643,[Stress],"[Stress, 3634-None]"
1,22813,112931,213230,2175-06-08 15:31:00,[3634-None],0,325,[Stress],"[Stress, 3634-None]"
2,15743,112419,280782,2122-01-17 12:41:00,[3634-None],0,1356,[Stress],"[Stress, 3634-None]"
3,4119,139557,202111,2193-04-11 01:33:00,[3634-None],0,358,[Stress],"[Stress, 3634-None]"
4,28766,158478,233423,2156-02-25 13:33:00,[3634-None],0,403,[Stress],"[Stress, 3634-None]"


In [80]:
# Drop labels 
df = df.drop(labels=['VALUE','LABEL'], axis='columns')

In [81]:

df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,STORETIME,HOSPITAL_EXPIRE_FLAG,icu_tdelta,event
0,11462,111499,206268,2129-01-07 02:36:00,0,643,"[Stress, 3634-None]"
1,22813,112931,213230,2175-06-08 15:31:00,0,325,"[Stress, 3634-None]"
2,15743,112419,280782,2122-01-17 12:41:00,0,1356,"[Stress, 3634-None]"
3,4119,139557,202111,2193-04-11 01:33:00,0,358,"[Stress, 3634-None]"
4,28766,158478,233423,2156-02-25 13:33:00,0,403,"[Stress, 3634-None]"
5,12354,142059,232691,2119-08-24 00:17:00,0,542,"[Stress, 3634-None]"
6,10285,172811,270256,2134-03-09 16:09:00,0,246,"[Stress, 3634-None]"
7,13527,140508,288975,2179-03-24 22:00:00,0,750,"[Stress, 3634-None]"
8,25427,102097,200704,2158-03-21 13:07:00,0,443,"[Stress, 3634-None]"
9,12884,162732,208468,2134-04-15 07:02:00,0,329,"[Stress, 3634-None]"


In [82]:
# convert STORETIME to string to save date correctly in JSON
df['STORETIME'] = df['STORETIME'].map(lambda x: str(x))

In [83]:
# export the dataframe to JSON format
df.to_json('../datasets/testdir/test.json',orient = 'records')

In [84]:
pd.read_json('../datasets/testdir/test.json',orient = 'records')

Unnamed: 0,HADM_ID,HOSPITAL_EXPIRE_FLAG,ICUSTAY_ID,STORETIME,SUBJECT_ID,event,icu_tdelta
0,111499,0,206268,2129-01-07 02:36:00,11462,"[Stress, 3634-None]",643
1,112931,0,213230,2175-06-08 15:31:00,22813,"[Stress, 3634-None]",325
2,112419,0,280782,2122-01-17 12:41:00,15743,"[Stress, 3634-None]",1356
3,139557,0,202111,2193-04-11 01:33:00,4119,"[Stress, 3634-None]",358
4,158478,0,233423,2156-02-25 13:33:00,28766,"[Stress, 3634-None]",403
5,142059,0,232691,2119-08-24 00:17:00,12354,"[Stress, 3634-None]",542
6,172811,0,270256,2134-03-09 16:09:00,10285,"[Stress, 3634-None]",246
7,140508,0,288975,2179-03-24 22:00:00,13527,"[Stress, 3634-None]",750
8,102097,0,200704,2158-03-21 13:07:00,25427,"[Stress, 3634-None]",443
9,162732,0,208468,2134-04-15 07:02:00,12884,"[Stress, 3634-None]",329


Code to iterate and tranform all episode files in the way displayed above

In [92]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm.autonotebook import tqdm

In [93]:
df_items = pd.read_csv("../datasets/raw/D_ITEMS.csv")

In [94]:
mypath_input = "../datasets/chartevents/ch_events_first_24_hours_ICUSTAY/"
mypath_output = "../datasets/chartevents/tokenized/ch_events_chunks_ungrouped/"
import os
os.makedirs(mypath_output, exist_ok=True)

In [95]:
chunk_list = list(filter(lambda k: '.csv' in k, os.listdir(mypath_input)))

In [96]:
chunk_list

['ch_events_24H_icu_000000000009.csv',
 'ch_events_24H_icu_000000000008.csv',
 'ch_events_24H_icu_000000000004.csv',
 'ch_events_24H_icu_000000000006.csv',
 'ch_events_24H_icu_000000000002.csv',
 'ch_events_24H_icu_000000000003.csv',
 'ch_events_24H_icu_000000000001.csv',
 'ch_events_24H_icu_000000000007.csv',
 'ch_events_24H_icu_000000000005.csv',
 'ch_events_24H_icu_000000000000.csv']

In [97]:

for i in tqdm(chunk_list):
    #import episode file

    df =pd.read_csv(mypath_input + i, 
                              dtype={'ROW_ID':'int32',
                                    'SUBJECT_ID':'int32',
                                    'HADM_ID':'int32',
                                    'ICUSTAY_ID': 'object',
                                    'ITEMID': 'int32',
                                    #'CHARTTIME':'datetime64[ns]',
                                    #'STORETIME':'datetime64[ns]',
                                    'CGID': 'object',
                                    'VALUE':'object',
                                    'VALUENUM':'object',
                                    'VALUEUOM': 'object',
                                    'WARNING':'object',
                                    'ERROR':'object',
                                    'RESULTSTATUS':'object',
                                    'STOPPED':'object',
                                    'HOSPITAL_EXPIRE_FLAG': 'int8',
                                     'icu_tdelta':'int'}, 
                              parse_dates = ['CHARTTIME','STORETIME','ICU_INTIME'])


    # fill missing values in all columns with 'NaN'
    df = df.fillna('NaN')

    # Merge CHARTEVENTS and D_ITEM
    df = df.merge(df_items[['ITEMID','LABEL']], left_on='ITEMID',right_on='ITEMID')

    # Convert each element in VALUE, VALUENUM, VALUEUOM into string
    df['VALUENUM'] = df['VALUENUM'].map(lambda x: str(x))
    df['VALUE'] = df['VALUE'].map(lambda x: str(x))
    df['VALUEUOM'] = df['VALUEUOM'].map(lambda x: str(x))

    # Concatenate VALUENUM and VALUEUOM
    df['VALUE_NUM_UOM'] = df['VALUENUM']+'-'+df['VALUEUOM']

    # Drop all records with missing STORETIME
    df = df.dropna(subset=['STORETIME'])

    # if VALUENUM and VALUEUOM are not missing then replace VALUE with VALUE_NUM_UOM 
    df.loc[(df['VALUENUM'] != 'NaN') & (df['VALUEUOM'] != 'NaN'),['VALUE']] = df['VALUE_NUM_UOM']

    # if VALUE is missing and VALUENUM is not missing then replace VALUE with VALUENUM  
    df.loc[(df['VALUE'] == 'NaN' ) & (df['VALUENUM'] != 'NaN'),['VALUE']] = df['VALUENUM']
    
    # Concatenate ITEMID with 
    df['ITEMID'] = df['ITEMID'].map(lambda x: str(x))
    df['VALUE'] = df['ITEMID']+'-'+df['VALUE']

    # Drop Columns
    df = df.drop(labels=['ROW_ID','CHARTTIME','CGID','VALUENUM','VALUEUOM','WARNING','ERROR','RESULTSTATUS','STOPPED','ICU_INTIME','VALUE_NUM_UOM','ITEMID'],axis='columns')

    # drop duplicate rows
    df = df.drop_duplicates()

    # Remove special charecters from columns LABEL
    df['LABEL'] = df['LABEL'].str.replace('[!"#$%&()*+,;<=>?@[\]^_`{|}~\n-]', ' ')

    # Split elements in LABEL on whitespace
    df['LABEL'] = df['LABEL'].map(lambda x: x.split())

    # Convert to lists
    df['VALUE'] = df['VALUE'].map(lambda x: [x])

    # Add columns
    df['event'] = df['LABEL']+df['VALUE']

    # Drop labels 
    df = df.drop(labels=['VALUE','LABEL'], axis='columns')

    # convert STORETIME to string to save date correctly in JSON
    df['STORETIME'] = df['STORETIME'].map(lambda x: str(x))

    # export the dataframe to JSON format
    df.to_json(mypath_output+i.replace('.csv','.json'),orient = 'records')


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [98]:
pd.read_json(mypath_output+'ch_events_24H_icu_000000000000.json')

Unnamed: 0,HADM_ID,HOSPITAL_EXPIRE_FLAG,ICUSTAY_ID,STORETIME,SUBJECT_ID,event,icu_tdelta
0,149127,0,296858,2137-01-11 16:09:00,6171,"[Sputum, Consistency, 8476-Thick]",363
1,115029,0,206034,2179-05-15 12:25:00,19467,"[Sputum, Consistency, 8476-Thin]",675
2,189849,0,284336,2166-05-14 20:53:00,23209,"[Sputum, Consistency, 8476-Thick]",625
3,138179,1,279963,2143-11-02 12:19:00,5077,"[Sputum, Consistency, 8476-Thick]",687
4,166908,0,255465,2144-12-02 04:30:00,29905,"[Sputum, Consistency, 8476-Thick]",856
5,143840,1,294376,2136-06-12 16:15:00,26387,"[Sputum, Consistency, 8476-Thick]",1415
6,199251,0,223416,2153-11-22 11:19:00,19173,"[Sputum, Consistency, 8476-Thick]",1099
7,184487,0,255145,2141-10-09 09:35:00,9253,"[Sputum, Consistency, 8476-Tenacious]",1150
8,126143,0,269523,2167-06-27 17:11:00,28247,"[Sputum, Consistency, 8476-Thick]",995
9,164234,0,226422,2103-12-09 19:51:00,16964,"[Sputum, Consistency, 8476-Thick]",878
