In [None]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm.autonotebook import tqdm

In [None]:
# Set input path to raw data: the directory that is scanned below for the
# per-chunk CHARTEVENTS .csv files
mypath_input = "/home/jupyter/datasets/chartevents/ch_events_first_24_hours_ICUSTAY/"

In [None]:
# Select only files with a .csv suffix.
# endswith() avoids picking up names that merely contain '.csv' (e.g. 'x.csv.bak'),
# and sorted() makes ch_chunks[0] deterministic, because os.listdir() returns
# directory entries in arbitrary order.
ch_chunks = sorted(name for name in os.listdir(mypath_input) if name.endswith('.csv'))

In [None]:
# Check the list of files: show the discovered CSV file names as the cell output
ch_chunks

In [None]:
# Import the first episode file.
# Identifier-like and free-text columns are read as plain strings ('object');
# CHARTTIME, STORETIME and ICU_INTIME are not listed here because they are
# converted to datetimes through parse_dates instead.
episode_dtypes = {
    'ROW_ID': 'int32',
    'SUBJECT_ID': 'int32',
    'HADM_ID': 'int32',
    'ICUSTAY_ID': 'object',
    'ITEMID': 'int32',
    'CGID': 'object',
    'VALUE': 'object',
    'VALUENUM': 'object',
    'VALUEUOM': 'object',
    'WARNING': 'object',
    'ERROR': 'object',
    'RESULTSTATUS': 'object',
    'STOPPED': 'object',
    'HOSPITAL_EXPIRE_FLAG': 'int8',
    'icu_tdelta': 'int',
}

df = pd.read_csv(
    mypath_input + ch_chunks[0],
    dtype=episode_dtypes,
    parse_dates=['CHARTTIME', 'STORETIME', 'ICU_INTIME'],
)


In [None]:
# Replace every missing value, in every column, with the sentinel string 'NaN'.
# From here on a "missing" cell is the literal text 'NaN' rather than NaN/NaT,
# so later missing-value checks have to compare against that string.
df = df.fillna(value='NaN')

In [None]:
# Preview the first 10 rows after filling missing values (display only)
df.head(10)

In [None]:
# Import the D_ITEMS file; its ITEMID and LABEL columns are used below to attach
# a label to each chart event
df_items = pd.read_csv("/home/jupyter/datasets/raw/D_ITEMS.csv")

In [None]:
# Join CHARTEVENTS with D_ITEMS on ITEMID to pull in the LABEL column.
# The join is an inner join, so events whose ITEMID is absent from D_ITEMS are dropped.
item_labels = df_items[['ITEMID', 'LABEL']]
df = df.merge(item_labels, on='ITEMID', how='inner')

In [None]:
# Cast every element of VALUENUM, VALUE and VALUEUOM to str
for text_col in ('VALUENUM', 'VALUE', 'VALUEUOM'):
    df[text_col] = df[text_col].map(str)

In [None]:
# Build '<VALUENUM>-<VALUEUOM>' for every row (both columns are strings here)
df['VALUE_NUM_UOM'] = df['VALUENUM'].str.cat(df['VALUEUOM'], sep='-')

In [None]:
# Drop all records with missing STORETIME.
# The earlier fillna('NaN') replaced missing timestamps with the literal string
# 'NaN', so dropna() on its own never matches anything at this point. Test for
# that sentinel as well as for real NaN/NaT values.
storetime_missing = df['STORETIME'].isna() | (df['STORETIME'].astype(str) == 'NaN')
# .copy() so the column assignments in later cells operate on an independent frame
df = df[~storetime_missing].copy()

In [None]:
# Preview the first 10 rows after the STORETIME filter (display only)
df.head(10)

In [None]:
# 'NaN' is the sentinel string used for missing values in these columns.
valuenum_present = df['VALUENUM'] != 'NaN'
valueuom_present = df['VALUEUOM'] != 'NaN'

# Numeric value and unit both present -> VALUE becomes '<VALUENUM>-<VALUEUOM>'
df['VALUE'] = df['VALUE'].mask(valuenum_present & valueuom_present, df['VALUE_NUM_UOM'])

# VALUE still missing but a numeric value exists -> fall back to VALUENUM
df['VALUE'] = df['VALUE'].mask((df['VALUE'] == 'NaN') & valuenum_present, df['VALUENUM'])


In [None]:
# Turn ITEMID into a string and prefix VALUE with it: '<ITEMID>-<VALUE>'
df['ITEMID'] = df['ITEMID'].map(str)
df['VALUE'] = df['ITEMID'].str.cat(df['VALUE'], sep='-')

In [None]:
# Preview the first rows with the ITEMID-prefixed VALUE (display only)
df.head()

In [None]:
# Remove the columns that are not carried into the exported records
unused_columns = [
    'ROW_ID', 'CHARTTIME', 'CGID', 'VALUENUM', 'VALUEUOM', 'WARNING',
    'ERROR', 'RESULTSTATUS', 'STOPPED', 'ICU_INTIME', 'VALUE_NUM_UOM', 'ITEMID',
]
df = df.drop(columns=unused_columns)

In [None]:
# Drop duplicate rows, i.e. rows that are identical across all remaining columns
df = df.drop_duplicates()

In [None]:
# Preview the first 10 rows after de-duplication (display only)
df.head(10)

In [None]:
# Replace special characters (punctuation and newlines) in LABEL with a space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave every label unchanged. The raw string keeps '\]' a
# regex escape instead of an invalid Python string escape.
df['LABEL'] = df['LABEL'].str.replace(r'[!"#$%&()*+,;<=>?@[\]^_`{|}~\n-]', ' ', regex=True)

In [None]:
# Break each LABEL string into a list of whitespace-separated words
df['LABEL'] = df['LABEL'].map(str.split)

In [None]:
# Preview the first 10 rows with LABEL split into word lists (display only)
df.head(10)

In [None]:
# Convert to lists: wrap each VALUE string in a one-element list so it can be
# concatenated with the LABEL word lists
df['VALUE'] = df['VALUE'].map(lambda x: [x])

In [None]:
# Add columns: both LABEL and VALUE hold Python lists at this point, so '+'
# concatenates them row by row (LABEL words first, then the single VALUE token)
df['event'] = df['LABEL']+df['VALUE']

In [None]:
# Preview the first rows including the new 'event' column (display only)
df.head()

In [None]:
# VALUE and LABEL are now contained in 'event', so remove the two source columns
df = df.drop(columns=['VALUE', 'LABEL'])

In [None]:

# Preview the first rows after dropping VALUE and LABEL (display only)
df.head()

In [None]:
# Store STORETIME as text so the date is saved correctly in JSON
df['STORETIME'] = df['STORETIME'].map(str)

In [None]:
# export the dataframe to JSON format
#df.to_json('../datasets/testdir/test.json',orient = 'records')

In [None]:
#pd.read_json('../datasets/testdir/test.json',orient = 'records')

Code to iterate over and transform all episode files in the way displayed above

In [None]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm.autonotebook import tqdm

In [None]:
# Load the D_ITEMS lookup table; its ITEMID and LABEL columns are merged into
# every chunk in the loop below
df_items = pd.read_csv("/home/jupyter/datasets/raw/D_ITEMS.csv")

In [None]:
# Input directory that is scanned for the .csv chunks
mypath_input = "/home/jupyter/datasets/chartevents/ch_events_first_24_hours_ICUSTAY/"
# Output directory that receives one .json file per input chunk
mypath_output = "/home/jupyter/datasets/chartevents/tokenized/ch_events_chunks_ungrouped/"
import os
# Create the output directory (and any missing parents); no error if it already exists
os.makedirs(mypath_output, exist_ok=True)

In [None]:
# Collect the chunk files to process. endswith() only accepts a real .csv suffix
# (not names that merely contain '.csv'), and sorted() gives a reproducible
# processing order, because os.listdir() returns entries in arbitrary order.
chunk_list = sorted(name for name in os.listdir(mypath_input) if name.endswith('.csv'))

In [None]:
# Show the chunk file names that the loop below will process
chunk_list

In [None]:
%%time
def _chartevents_to_events(chunk, items):
    """Turn one raw CHARTEVENTS chunk into one tokenised event list per row.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Raw chunk as loaded by ``pd.read_csv`` in the loop below.
    items : pandas.DataFrame
        D_ITEMS lookup table; only its ``ITEMID`` and ``LABEL`` columns are used.

    Returns
    -------
    pandas.DataFrame
        The columns of ``chunk`` that are not dropped below, with ``STORETIME``
        converted to a string, plus an ``event`` column holding the LABEL words
        followed by a single ``'<ITEMID>-<VALUE>'`` token.
    """
    # Drop records with missing STORETIME *before* the blanket fillna: afterwards
    # the gaps are the string 'NaN' and dropna() can no longer detect them.
    chunk = chunk.dropna(subset=['STORETIME'])

    # Mark every remaining missing value with the sentinel string 'NaN'
    chunk = chunk.fillna('NaN')

    # Merge CHARTEVENTS and D_ITEMS (inner join: ITEMIDs unknown to D_ITEMS are dropped)
    chunk = chunk.merge(items[['ITEMID', 'LABEL']], on='ITEMID')

    # NOTE(review): a missing LABEL in D_ITEMS would make the split below fail;
    # it is treated as an empty label here. Confirm this is the desired handling.
    chunk['LABEL'] = chunk['LABEL'].fillna('')

    # Convert each element in VALUE, VALUENUM, VALUEUOM into string
    for text_col in ('VALUENUM', 'VALUE', 'VALUEUOM'):
        chunk[text_col] = chunk[text_col].map(str)

    has_num = chunk['VALUENUM'] != 'NaN'
    has_uom = chunk['VALUEUOM'] != 'NaN'

    # If VALUENUM and VALUEUOM are both present, VALUE becomes '<VALUENUM>-<VALUEUOM>'
    chunk['VALUE'] = chunk['VALUE'].mask(
        has_num & has_uom, chunk['VALUENUM'] + '-' + chunk['VALUEUOM']
    )

    # If VALUE is still missing and VALUENUM is present, fall back to VALUENUM
    chunk['VALUE'] = chunk['VALUE'].mask((chunk['VALUE'] == 'NaN') & has_num, chunk['VALUENUM'])

    # Prefix VALUE with its ITEMID: '<ITEMID>-<VALUE>'
    chunk['VALUE'] = chunk['ITEMID'].map(str) + '-' + chunk['VALUE']

    # Drop the columns that are not exported, then drop duplicate rows
    chunk = chunk.drop(columns=[
        'ROW_ID', 'CHARTTIME', 'CGID', 'VALUENUM', 'VALUEUOM', 'WARNING',
        'ERROR', 'RESULTSTATUS', 'STOPPED', 'ICU_INTIME', 'ITEMID',
    ])
    chunk = chunk.drop_duplicates()

    # Remove special characters from LABEL and split it on whitespace.
    # regex=True is required: pandas >= 2.0 treats the pattern literally by default.
    label_words = (
        chunk['LABEL']
        .str.replace(r'[!"#$%&()*+,;<=>?@[\]^_`{|}~\n-]', ' ', regex=True)
        .str.split()
    )

    # Wrap VALUE in a one-element list so '+' concatenates the two lists row by row
    value_token = chunk['VALUE'].map(lambda token: [token])

    chunk = chunk.assign(
        event=label_words + value_token,
        # convert STORETIME to string to save the date correctly in JSON
        STORETIME=chunk['STORETIME'].map(str),
    )
    return chunk.drop(columns=['VALUE', 'LABEL'])


# Column dtypes for the raw chunks, built once instead of on every iteration.
# CHARTTIME, STORETIME and ICU_INTIME are handled by parse_dates instead.
chartevents_dtypes = {
    'ROW_ID': 'int32',
    'SUBJECT_ID': 'int32',
    'HADM_ID': 'int32',
    'ICUSTAY_ID': 'object',
    'ITEMID': 'int32',
    'CGID': 'object',
    'VALUE': 'object',
    'VALUENUM': 'object',
    'VALUEUOM': 'object',
    'WARNING': 'object',
    'ERROR': 'object',
    'RESULTSTATUS': 'object',
    'STOPPED': 'object',
    'HOSPITAL_EXPIRE_FLAG': 'int8',
    'icu_tdelta': 'int',
}

for chunk_name in tqdm(chunk_list):
    # import episode file
    df = pd.read_csv(
        mypath_input + chunk_name,
        dtype=chartevents_dtypes,
        parse_dates=['CHARTTIME', 'STORETIME', 'ICU_INTIME'],
    )

    df = _chartevents_to_events(df, df_items)

    # export the dataframe to JSON format, one file per input chunk
    df.to_json(mypath_output + chunk_name.replace('.csv', '.json'), orient='records')


In [None]:
import pandas as pd
# Directory the transformation loop wrote its JSON files to
mypath_output = "/home/jupyter/datasets/chartevents/tokenized/ch_events_chunks_ungrouped/"
# Read one exported chunk back and preview its first rows
pd.read_json(mypath_output+'ch_events_24H_icu_000000000000.json').head()