In [1]:
import pandas as pd
import numpy as np
import pickle

ED: BUCKETS

In [2]:
# Load the vitalsign, triage and pyxis CSVs from the mimiciv 3.1 ED directory.
# NOTE(review): med_df is not referenced again in the cells visible here.
vital_df = pd.read_csv('data/mimiciv/3.1/ed/vitalsign.csv')
triage_df = pd.read_csv('data/mimiciv/3.1/ed/triage.csv')
med_df = pd.read_csv('data/mimiciv/3.1/ed/pyxis.csv')

In [3]:
# Measurement columns selected from both the vitalsign and the triage table.
NUM_COLS = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain']

In [4]:
# Stack the vitalsign and triage measurements so bucket edges are computed
# over both sources together.
num_df = pd.concat([vital_df[NUM_COLS], triage_df[NUM_COLS]])

In [5]:
# Precision factor for bucket edges: the first/last edge of every bucket list
# is floored/ceiled to a multiple of 1/scale.
scale = 10_000

In [6]:
# Build decile bucket edges for every ED measurement column.
buckets = {}
for c in num_df.columns:
    # Non-numeric entries are coerced to NaN and dropped before the percentiles.
    df_c = pd.to_numeric(num_df[c], 'coerce').dropna()
    if df_c.empty:
        # A column without any numeric value has no percentiles; skip it, the
        # same way the per-itemid bucket loop further down does.
        continue
    # 0th, 10th, ..., 100th percentile, each snapped down to an observed value.
    bucket = np.percentile(df_c, np.arange(0, 110, 10), method='lower')
    # Drop edges that repeat their predecessor.
    bucket = bucket[np.insert(np.diff(bucket) != 0, 0, True)]
    bucket = np.array([
            np.floor(bucket[0] * scale) / scale,             # floor first
            *[round(b, ndigits=4) for b in bucket[1:-1]],         # round middle (4 digits matches scale=10_000)
            np.ceil(bucket[-1] * scale) / scale              # ceil last
            ])
    # Rounding can make neighbouring edges collide, so de-duplicate once more.
    buckets[c] = bucket[np.concatenate(([True], bucket[1:] != bucket[:-1]))]

In [7]:
# acuity gets fixed edges instead of percentile-derived ones.
buckets['acuity'] = [1, 2, 3, 4, 5]

In [9]:
# Persist the ED bucket edges.
# NOTE(review): this writes to the working directory, while the dataset
# section loads 'data/buckets/ed_buckets.pkl' - confirm the file is moved there.
with open('ed_buckets.pkl', 'wb') as out_file:
    pickle.dump(buckets, out_file)

ED: DATASET

In [None]:
# Age bin edges and one 'Age/<lo>-<hi>' label per pair of neighbouring edges.
AGES = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]
AGE_LABELS = [f'Age/{AGES[i]}-{AGES[i+1]}' for i in range(0, len(AGES)-1)]
def age_str(x):
    """Return the 'Age/<lo>-<hi>' label of the bin that contains age ``x``.

    Bins are half-open ``[lo, hi)``. Ages below the first edge are placed in
    the first bin and ages at or above the last edge in the last bin; without
    this clamping, large ages raised IndexError and negative ages silently
    wrapped around to the last label. Missing values (NaN/None) return None.
    """
    if pd.isna(x):
        return None
    ind = int(np.searchsorted(AGES, x, 'right')) - 1
    # Clamp so out-of-range ages map to the nearest valid bin.
    ind = min(max(ind, 0), len(AGE_LABELS) - 1)
    return AGE_LABELS[ind]

In [None]:
def race_str(x):
    """Collapse a raw race description into a coarse 'RACE/<GROUP>' token.

    Substring keywords are checked in a fixed order (ASIAN, BLACK, WHITE,
    HISPANIC, NATIVE), followed by two exact matches. Anything else,
    including non-string input such as NaN, maps to 'RACE/UNKNOWN'.
    """
    if not isinstance(x, str):
        # Missing values arrive as float NaN; `'ASIAN' in x` would raise on them.
        return 'RACE/UNKNOWN'
    # Order matters: the first keyword contained in x wins.
    keyword_labels = (
        ('ASIAN', 'RACE/ASIAN'),
        ('BLACK', 'RACE/BLACK'),
        ('WHITE', 'RACE/WHITE'),
        ('HISPANIC', 'RACE/HISPANIC'),
        ('NATIVE', 'RACE/AMERICAN'),
    )
    for keyword, label in keyword_labels:
        if keyword in x:
            return label
    exact_labels = {
        'PORTUGUESE': 'RACE/PORTUGUESE',
        'SOUTH AMERICAN': 'RACE/SOUTH',
    }
    return exact_labels.get(x, 'RACE/UNKNOWN')

In [None]:
# Load the demographics table and discard the columns that are not needed.
demo_df = pd.read_csv('data/transfer/demo.csv')
# This file is overwritten further down without these columns, so ignore
# already-missing ones to keep the cell re-runnable.
demo_df = demo_df.drop(columns=['hadm_id', 'anchor_age', 'anchor_year', 'ethnicity'], errors='ignore')

In [None]:
# Derive the string tokens for race, gender and age.
demo_df['race_str'] = demo_df['race'].map(race_str)
demo_df['gender_str'] = demo_df['gender'].radd('Gender/')
# Missing ages stay missing (None) instead of being binned.
demo_df['age_str'] = demo_df['age'].map(
    lambda age: None if np.isnan(age) else age_str(age)
)

In [None]:
# Overwrites the file that was read above, now including the *_str columns.
demo_df.to_csv('data/transfer/demo.csv', index=False)

In [None]:
# Select the needed columns by name rather than consuming the first column as
# the index: the file is later rewritten with index=False, after which
# index_col=0 would swallow 'subject_id' and break the column selection.
df = pd.read_csv('data/transfer/med.csv', usecols=['subject_id', 'stay_id', 'charttime', 'group'])

In [None]:
# Restrict the frame to the four columns used below and fix their order.
df = df[['subject_id', 'stay_id', 'charttime', 'group']]

In [None]:
# Build the event token 'MED/<group>'; group is cast to str before concatenation.
df['eventval'] = 'MED/' + df['group'].astype(str)

In [None]:
# Overwrites the input file; the index is not written.
df.to_csv('data/transfer/med.csv', index=False)

In [None]:
def bucket_eventval(event, val, d):
    """Return the bucket token for a measured value of ``event``.

    Parameters
    ----------
    event : key into ``d``.
    val : measured value to place into a bucket.
    d : mapping from event to either a ready-made token (str) or a sequence of
        bucket edges (expected to be sorted ascending, as required by
        ``np.searchsorted``).

    Returns
    -------
    str
        ``d[event]`` itself when it is a string, otherwise one of
        ``"<event>|<lo>-<hi>"`` for ``lo <= val < hi``,
        ``"<event>|<last>-"`` for values at or above the last edge, or
        ``"<event>|-<first>"`` for values below the first edge.
    """
    buckets = d[event]
    if isinstance(buckets, str):
        return buckets
    ind = np.searchsorted(buckets, val, side='right')
    if ind == 0:
        # Below the first edge: buckets[ind-1] would wrap around to the last
        # edge and produce a nonsensical "<last>-<first>" range.
        return f"{event}|-{buckets[0]}"
    if ind == len(buckets):
        return f"{event}|{buckets[ind-1]}-"
    return f"{event}|{buckets[ind-1]}-{buckets[ind]}"

In [None]:
def bucket_ind(event, val, d):
    buckets = d[event]
    if type(buckets) == str: 
        return 0
    ind = np.searchsorted(buckets, val, side='right')
    return ind

In [None]:
# Read back the pickled ED bucket edges.
with open('data/buckets/ed_buckets.pkl', 'rb') as bucket_file:
    ed_buckets = pickle.load(bucket_file)

In [None]:
# Wide numerics table: 'stay_id' and 'charttime' are treated as id columns
# below and every other column as a measurement.
df = pd.read_csv('data/transfer/numerics.csv')

In [None]:
# Reshape wide -> long in a single melt instead of concatenating one frame per
# column inside a loop. Rows are stacked column by column, and the result has
# one row per (stay_id, charttime, measurement column) with the measurement
# name in 'event'. Column order matches the previous loop-based result; the
# row index is a fresh RangeIndex (the index is never written out).
df_pivot = df.melt(
    id_vars=['stay_id', 'charttime'],
    var_name='event',
    value_name='value',
)[['stay_id', 'charttime', 'value', 'event']]

In [None]:
# Remove rows whose measurement is missing.
df_pivot = df_pivot.dropna(subset=['value'])

In [None]:
# Bucket every (event, value) pair. Iterating the two columns directly avoids
# building a Series per row (apply with axis=1) and also works when the frame
# is empty, where apply would hand back a DataFrame instead of a column.
df_pivot['eventval'] = [
    bucket_eventval(event, value, ed_buckets)
    for event, value in zip(df_pivot['event'], df_pivot['value'])
]

In [None]:
# Overwrites the wide input file with the long, bucketed table.
# NOTE(review): re-running this section from the read_csv cell would then
# operate on the already reshaped file.
df_pivot.to_csv('data/transfer/numerics.csv', index=False)

ED: LABEL

In [None]:
# Load only the id, time and outcome columns of the label file.
# NOTE(review): the file is written back below with the outcome column
# renamed to 'Label', so this usecols list no longer matches on a re-run.
df = pd.read_csv('data/transfer/label.csv', usecols=['subject_id', 'stay_id', 'intime', 'outcome_icu_transfer_12h', 'Time'])

In [None]:
# Expose the outcome column under the generic name 'Label'.
df = df.rename(columns={'outcome_icu_transfer_12h': 'Label'})

In [None]:
# Overwrites the label file that was read above.
df.to_csv('data/transfer/label.csv', index=False)

ICU: BUCKETS

In [None]:
# Columns kept per source table, keyed by table name. The last entry of each
# tuple is the column holding the measured value; labevents has no stay_id.
COLS = {
    'chartevents': ('subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'valuenum'),
    'inputevents': ('subject_id', 'hadm_id', 'stay_id', 'starttime', 'itemid', 'amount'),
    'outputevents': ('subject_id', 'hadm_id', 'stay_id', 'charttime', 'itemid', 'value'),
    'procedureevents': ('subject_id', 'hadm_id', 'stay_id', 'starttime', 'itemid', 'value'),
    'labevents': ('subject_id', 'hadm_id', 'charttime', 'itemid', 'valuenum'),
}


In [18]:
# Table whose bucket edges are computed by the cells below.
FNAME = 'procedureevents'
# The value column is the last entry of the table's COLS tuple, so it follows
# FNAME automatically instead of having to be edited in step with it.
VAL_COL = COLS[FNAME][-1]
# Precision factor for the first/last bucket edge (floored/ceiled to 1/scale).
scale = 10_000

In [19]:
# Read the parquet chunks of the selected table (only itemid and the value
# column) and concatenate them once, rather than growing the frame and
# rebuilding the code set on every iteration.
frames = [
    pd.read_parquet(f'data/raw_data/{FNAME}_{i}.parquet', columns=['itemid', VAL_COL])
    for i in range(0, 7)
]
df_final = pd.concat(frames)
# Distinct item ids across all chunks.
codes = df_final['itemid'].unique().tolist()
print(len(codes))

159


In [20]:
# Compute decile bucket edges per itemid.
buckets = {}
# Group once instead of re-scanning df_final with a boolean mask for every
# itemid; groups are visited in sorted itemid order.
for i, (c, raw_values) in enumerate(df_final.groupby('itemid')[VAL_COL]):
    if i % 50 == 0:
        print(i)
    # Non-numeric entries are coerced to NaN and dropped.
    df_values = pd.to_numeric(raw_values, 'coerce').dropna()
    if len(df_values) > 0:
        # 0th, 10th, ..., 100th percentile, each snapped down to an observed value.
        bucket = np.percentile(df_values, np.arange(0, 110, 10), method='lower')
        # Drop edges that repeat their predecessor.
        bucket = bucket[np.insert(np.diff(bucket) != 0, 0, True)]
        bucket = np.array([
                np.floor(bucket[0] * scale) / scale,             # floor first
                *[round(b, ndigits=4) for b in bucket[1:-1]],         # round middle
                np.ceil(bucket[-1] * scale) / scale              # ceil last
                ])
        # Rounding can make neighbouring edges collide, so de-duplicate once more.
        buckets[c] = bucket[np.concatenate(([True], bucket[1:] != bucket[:-1]))]

0
50
100
150


In [21]:
# Persist the bucket edges of the selected table.
# NOTE(review): written to the working directory, while the next cell loads
# the per-table files from 'data/buckets/' - confirm the file is moved there.
with open(f'{FNAME}_buckets.pkl', 'wb') as out_file:
    pickle.dump(buckets, out_file)

In [None]:
def _read_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Per-table bucket dictionaries produced by the bucket cells above.
chart_buckets = _read_pickle('data/buckets/chartevents_buckets.pkl')
input_buckets = _read_pickle('data/buckets/inputevents_buckets.pkl')
lab_buckets = _read_pickle('data/buckets/labevents_buckets.pkl')
output_buckets = _read_pickle('data/buckets/outputevents_buckets.pkl')
procedure_events = _read_pickle('data/buckets/procedureevents_buckets.pkl')

In [None]:
# Merge the four stay-level tables into one lookup; on a shared key the later
# table wins. Lab buckets are copied into their own dictionary.
icu_buckets = {}
for table_buckets in (chart_buckets, input_buckets, output_buckets, procedure_events):
    icu_buckets.update(table_buckets)
hosp_buckets = dict(lab_buckets)

In [None]:
# Persist both merged lookups.
# NOTE(review): written to the working directory, while the dataset section
# loads them from 'data/buckets/' - confirm the files are moved there.
for out_name, merged in (('icu_buckets.pkl', icu_buckets), ('hosp_buckets.pkl', hosp_buckets)):
    with open(out_name, 'wb') as out_file:
        pickle.dump(merged, out_file)

ICU: DATASET

In [None]:
def race_str(x):
    """Collapse a raw race description into a coarse 'RACE/<GROUP>' token.

    Substring keywords are checked in a fixed order (ASIAN, BLACK, WHITE,
    HISPANIC, NATIVE), followed by two exact matches. Every other value,
    including 'unknown', 'MULTIPLE RACE/ETHNICITY' and non-string input such
    as NaN, maps to 'RACE/UNKNOWN'. Previously unmatched values fell through
    and returned None, and non-string input raised TypeError.
    """
    if not isinstance(x, str):
        # Missing values arrive as float NaN; `'ASIAN' in x` would raise on them.
        return 'RACE/UNKNOWN'
    # Order matters: the first keyword contained in x wins.
    keyword_labels = (
        ('ASIAN', 'RACE/ASIAN'),
        ('BLACK', 'RACE/BLACK'),
        ('WHITE', 'RACE/WHITE'),
        ('HISPANIC', 'RACE/HISPANIC'),
        ('NATIVE', 'RACE/AMERICAN'),
    )
    for keyword, label in keyword_labels:
        if keyword in x:
            return label
    exact_labels = {
        'PORTUGUESE': 'RACE/PORTUGUESE',
        'SOUTH AMERICAN': 'RACE/SOUTH',
    }
    return exact_labels.get(x, 'RACE/UNKNOWN')

In [None]:
# Age bin edges and one 'Age/<lo>-<hi>' label per pair of neighbouring edges.
AGES = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100]
AGE_LABELS = [f'Age/{AGES[i]}-{AGES[i+1]}' for i in range(0, len(AGES)-1)]
def age_str(x):
    """Return the 'Age/<lo>-<hi>' label of the bin that contains age ``x``.

    Bins are half-open ``[lo, hi)``. Ages below the first edge are placed in
    the first bin and ages at or above the last edge in the last bin; without
    this clamping, ages of 100 or more raised IndexError and negative ages
    silently wrapped around to the last label. Missing values (NaN/None)
    return None.
    """
    if pd.isna(x):
        return None
    ind = int(np.searchsorted(AGES, x, 'right')) - 1
    # Clamp so out-of-range ages map to the nearest valid bin.
    ind = min(max(ind, 0), len(AGE_LABELS) - 1)
    return AGE_LABELS[ind]

In [None]:
# Load the demographics table of the mortality dataset.
demo_df = pd.read_parquet('data/mortality/demographics.parquet')

In [None]:
# Missing ethnicity values become 0 before the token is built.
ethnicity_filled = demo_df['ethnicity'].fillna(0)
demo_df['ethnicity'] = ethnicity_filled
demo_df['ethnicity_str'] = ethnicity_filled.map(lambda value: f'Ethnicity/{value}')

In [None]:
# Build the gender token by prefixing, and the race token via race_str;
# the race result is cast to str afterwards.
demo_df['gender_str'] = 'Gender/' + demo_df['gender']
demo_df['race_str'] = demo_df['race'].apply(race_str).astype(str)

In [None]:
# Bin known ages; rows with a missing age get the bare 'Age' token.
demo_df['age_str'] = (
    demo_df['age']
    .map(lambda age: None if np.isnan(age) else age_str(age))
    .fillna('Age')
)

In [None]:
# Overwrites the parquet file that was read above, now with the *_str columns.
demo_df.to_parquet('data/mortality/demographics.parquet')

In [None]:
def bucket_eventval(event, val, d):
    """Return the bucket token for a measured value of item ``event``.

    ``event`` is converted with ``int()`` before the lookup. If it is not a
    key of ``d`` it is printed and its string form is returned unbucketed.

    Parameters
    ----------
    event : item id; anything accepted by ``int()``.
    val : measured value to place into a bucket.
    d : mapping from int item id to either a ready-made token (str) or a
        sequence of bucket edges (expected to be sorted ascending, as
        required by ``np.searchsorted``).

    Returns
    -------
    str
        ``d[event]`` itself when it is a string, otherwise one of
        ``"<event>|<lo>-<hi>"`` for ``lo <= val < hi``,
        ``"<event>|<last>-"`` for values at or above the last edge, or
        ``"<event>|-<first>"`` for values below the first edge.
    """
    event = int(event)
    if event not in d:
        print(event)
        return f'{event}'
    buckets = d[event]
    if isinstance(buckets, str):
        return buckets
    ind = np.searchsorted(buckets, val, side='right')
    if ind == 0:
        # Below the first edge: buckets[ind-1] would wrap around to the last
        # edge and produce a nonsensical "<last>-<first>" range.
        return f"{event}|-{buckets[0]}"
    if ind == len(buckets):
        return f"{event}|{buckets[ind-1]}-"
    return f"{event}|{buckets[ind-1]}-{buckets[ind]}"

In [None]:
def bucket_ind(event, val, d):
    buckets = d[event]
    if type(buckets) == str: 
        return 0
    ind = np.searchsorted(buckets, val, side='right')
    return ind

In [None]:
# Read back the merged hospital-level and stay-level bucket lookups.
with open('data/buckets/hosp_buckets.pkl', 'rb') as hosp_file:
    hosp_buckets = pickle.load(hosp_file)

with open('data/buckets/icu_buckets.pkl', 'rb') as icu_file:
    icu_buckets = pickle.load(icu_file)

In [None]:
# Attach a bucketed 'eventval' token to every row of each mortality chunk and
# write the chunk back in place.
for i in range(0, 37):
    print(i)
    df = pd.read_parquet(f'data/mortality/{i}_final.parquet')
    if 'hadm_id' not in df.columns:
        # Chunks written by this loop no longer carry hadm_id, so this skips
        # files that were already processed.
        continue
    df = df.drop(columns=['hadm_id'])
    df = df.dropna(subset=['value'])

    # Rows with a stay_id use the ICU buckets, the rest the hospital buckets.
    # Copy the slices so adding a column does not write into a view of df.
    has_stay = df['stay_id'].notna()
    stay_df = df[has_stay].copy()
    hosp_df = df[~has_stay].copy()

    # Iterating the two columns directly avoids a Series per row and also
    # works when one of the two parts is empty.
    stay_df['eventval'] = [
        bucket_eventval(item, value, icu_buckets)
        for item, value in zip(stay_df['itemid'], stay_df['value'])
    ]
    hosp_df['eventval'] = [
        bucket_eventval(item, value, hosp_buckets)
        for item, value in zip(hosp_df['itemid'], hosp_df['value'])
    ]
    df = pd.concat([stay_df, hosp_df])

    df.to_parquet(f'data/mortality/{i}_final.parquet')

ICU: LABEL

In [None]:
# Load only the id, label and time columns of the mortality label file.
# NOTE(review): the file is written back below without 'y_true', 'outtime'
# and 'deathtime', so this usecols list no longer matches on a re-run.
df = pd.read_csv('data/mortality/mortality_labels.csv', usecols=['subject_id', 'stay_id', 'y_true', 'intime', 'outtime', 'deathtime'])

In [None]:
df = df.rename(columns={'y_true': 'Label'})
# 'Time' is deathtime where Label is truthy and outtime otherwise. The
# column-wise where() replaces a per-row apply and also works on an empty frame.
df['Time'] = df['deathtime'].where(df['Label'].astype(bool), df['outtime'])
df = df.drop(columns=['deathtime', 'outtime'])

In [None]:
# Overwrites the label file that was read above.
df.to_csv('data/mortality/mortality_labels.csv', index=False)