In [1]:
import pandas as pd
import numpy as np
import pickle

DEMOGRAPHICS

In [2]:
def race_str(x):
    """Map a raw race description to a coarse ``RACE/<GROUP>`` token.

    Substring rules are tried first, in a fixed order, so a value that
    contains several keywords resolves to the first group listed. Exact-match
    rules are consulted afterwards.

    Parameters
    ----------
    x : str
        Raw race value, e.g. ``'BLACK/AFRICAN AMERICAN'``.

    Returns
    -------
    str or None
        The ``RACE/...`` token. A string that matches no rule yields ``None``.
        Non-string input (for example a missing value such as ``NaN`` or
        ``None``) yields ``'RACE/UNKNOWN'``.
    """
    # Missing values are not strings; `'ASIAN' in x` would raise TypeError on
    # them, so they are routed to the unknown group up front.
    if not isinstance(x, str):
        return 'RACE/UNKNOWN'

    # Order matters: the first keyword found in `x` decides the group.
    substring_rules = (
        ('ASIAN', 'RACE/ASIAN'),
        ('BLACK', 'RACE/BLACK'),
        ('WHITE', 'RACE/WHITE'),
        ('HISPANIC', 'RACE/HISPANIC'),
        ('NATIVE', 'RACE/AMERICAN'),
    )
    for keyword, token in substring_rules:
        if keyword in x:
            return token

    exact_rules = {
        'PORTUGUESE': 'RACE/PORTUGUESE',
        'SOUTH AMERICAN': 'RACE/SOUTH',
        'unknown': 'RACE/UNKNOWN',
        'MULTIPLE RACE/ETHNICITY': 'RACE/UNKNOWN',
    }
    # Strings matching no rule keep returning None (now explicitly).
    return exact_rules.get(x)

In [3]:
# Ascending bucket edges; consecutive pairs define the half-open ranges [lo, hi).
AGES = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# One label per pair of neighbouring edges: 'Age/0-20', 'Age/20-30', ..., 'Age/90-100'.
AGE_LABELS = [f'Age/{lo}-{hi}' for lo, hi in zip(AGES[:-1], AGES[1:])]


def age_str(x):
    """Return the ``Age/<lo>-<hi>`` label of the bucket containing age ``x``.

    Buckets are closed on the left and open on the right, so an age equal to
    an edge belongs to the bucket that starts at that edge. Ages outside the
    covered range are clamped: anything below ``AGES[0]`` goes to the first
    bucket and anything at or above ``AGES[-1]`` goes to the last one.

    Parameters
    ----------
    x : int or float
        Age to classify. Must not be NaN.

    Returns
    -------
    str
        One of the entries of ``AGE_LABELS``.

    Raises
    ------
    ValueError
        If ``x`` is NaN.
    """
    if np.isnan(x):
        raise ValueError('age_str() requires a non-missing age')
    ind = int(np.searchsorted(AGES, x, side='right')) - 1
    # Without clamping, x >= AGES[-1] indexes one past the last label
    # (IndexError) and x < AGES[0] gives -1, which silently selects the
    # LAST label.
    ind = min(max(ind, 0), len(AGE_LABELS) - 1)
    return AGE_LABELS[ind]

In [5]:
# Load the demographics table that the following cells add token columns to.
demographics_source = 'data/mortality/demographics.parquet'
demo_df = pd.read_parquet(demographics_source)

In [6]:
# Missing ethnicity values are replaced by 0, then every value is rendered
# as an 'Ethnicity/<value>' token.
ethnicity_filled = demo_df['ethnicity'].fillna(0)
demo_df['ethnicity'] = ethnicity_filled
demo_df['ethnicity_str'] = ethnicity_filled.apply(lambda x: f'Ethnicity/{x}')

In [7]:
# Gender token: the raw gender value behind a 'Gender/' prefix.
gender_values = demo_df['gender']
demo_df['gender_str'] = 'Gender/' + gender_values

# Race token: each raw value goes through race_str; astype(str) then
# stringifies every result, including any None that race_str returned.
race_tokens = demo_df['race'].apply(race_str)
demo_df['race_str'] = race_tokens.astype(str)

In [8]:
def _age_bucket_or_none(age):
    """Return ``age_str(age)``, or ``None`` when ``age`` is NaN."""
    return None if np.isnan(age) else age_str(age)


# Rows whose age is missing end up with the bare 'Age' token.
age_tokens = demo_df['age'].apply(_age_bucket_or_none)
demo_df['age_str'] = age_tokens.fillna('Age')

In [14]:
# Persist the annotated frame. The target is the same parquet file the
# demographics were loaded from, so the original file is overwritten.
demo_df.to_parquet(path='data/mortality/demographics.parquet')

LABEL

In [6]:
# Only these columns of the label file are loaded.
label_columns = ['subject_id', 'stay_id', 'y_true', 'intime', 'outtime', 'deathtime']
df = pd.read_csv('data/mortality/mortality_labels.csv', usecols=label_columns)

In [None]:
# y_true is renamed to Label; Time is deathtime for rows whose Label is
# truthy and outtime for all other rows. The two source columns are dropped
# once Time has been derived.
labels = df.rename(columns={'y_true': 'Label'})

# Vectorised selection instead of a row-wise apply: same per-row choice,
# no Python-level loop, and it also works when the frame has no rows.
label_is_set = labels['Label'].astype(bool)
labels['Time'] = labels['deathtime'].where(label_is_set, labels['outtime'])

df = labels.drop(columns=['deathtime', 'outtime'])

In [17]:
# Write the label table without the row index.
# NOTE(review): this is the same path the label file was read from, and the
# frame no longer has the y_true / outtime / deathtime columns that the
# loading cell lists in `usecols` - re-running that cell after this write is
# expected to fail; confirm whether a separate output path is wanted.
df.to_csv(path_or_buf='data/mortality/mortality_labels.csv', index=False)

VITALS

In [None]:
def _unpickle(path):
    """Open ``path`` in binary mode and return the unpickled object.

    pickle.load can execute arbitrary code embedded in the file, so this
    should only be pointed at trusted files.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


# One bucket table per event source.
chart_buckets = _unpickle('data/buckets/chartevents_buckets.pkl')
input_buckets = _unpickle('data/buckets/inputevents_buckets.pkl')
lab_buckets = _unpickle('data/buckets/labevents_buckets.pkl')
output_buckets = _unpickle('data/buckets/outputevents_buckets.pkl')
procedure_events = _unpickle('data/buckets/procedureevents_buckets.pkl')

In [12]:
# Merge the ICU-side tables into one dict. On a duplicate key the later
# source wins, in this order: chart, input, output, procedure.
icu_buckets = {}
for source in (chart_buckets, input_buckets, output_buckets, procedure_events):
    icu_buckets.update(source)

# Shallow copy, so hosp_buckets is a dict object separate from lab_buckets.
hosp_buckets = dict(lab_buckets)

In [14]:
# Persist the merged tables under data/buckets/, which is where the EVENTVAL
# cells load icu_buckets.pkl and hosp_buckets.pkl from. Writing them to the
# working directory instead would leave that later load reading a missing or
# stale file.
with open('data/buckets/icu_buckets.pkl', 'wb') as f:
    pickle.dump(icu_buckets, f)

with open('data/buckets/hosp_buckets.pkl', 'wb') as f:
    pickle.dump(hosp_buckets, f)

EVENTVAL

In [2]:
def bucket_eventval(event, val, d):
    """Build the ``"<event>|<lo>-<hi>"`` token for one measurement.

    Parameters
    ----------
    event : int-like
        Item identifier; coerced with ``int`` before lookup and formatting.
    val : scalar
        Measured value to locate among the bucket edges.
    d : dict
        Maps an item id either to a ``str`` (returned unchanged) or to a
        sequence of bucket edges. The edges must be sorted ascending, as
        ``np.searchsorted`` requires.

    Returns
    -------
    str
        * the stored string, when ``d[event]`` is a string;
        * ``"<event>|<lo>-<hi>"`` when ``lo <= val < hi`` for adjacent edges;
        * ``"<event>|<last>-"`` when ``val`` is at or above the last edge;
        * ``"<event>|-<first>"`` when ``val`` is below the first edge;
        * ``"<event>"`` when ``d`` has no entry for the item (the id is also
          printed).
    """
    event = int(event)
    if event not in d:
        # Surface item ids that have no bucket definition.
        print(event)
        return f'{event}'

    buckets = d[event]
    if isinstance(buckets, str):
        return buckets

    ind = int(np.searchsorted(buckets, val, side='right'))
    if ind == 0:
        # Below the first edge. Using buckets[ind-1] here would index -1,
        # i.e. the LAST edge, and produce an inverted "<last>-<first>" range.
        return f"{event}|-{buckets[0]}"
    if ind == len(buckets):
        return f"{event}|{buckets[ind-1]}-"
    return f"{event}|{buckets[ind-1]}-{buckets[ind]}"

In [None]:
def bucket_ind(event, val, d):
    """Return the position of ``val`` among the bucket edges of ``event``.

    ``d[event]`` is looked up directly (a missing key raises ``KeyError``).
    When the entry is a plain ``str`` the result is ``0``; otherwise it is
    the ``np.searchsorted(..., side='right')`` insertion index of ``val``.
    """
    edges = d[event]
    entry_is_string = type(edges) is str
    return 0 if entry_is_string else np.searchsorted(edges, val, side='right')

In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    pickle.load can execute arbitrary code embedded in the file, so this
    should only be pointed at trusted files.
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


# Merged bucket tables used by the bucketing loop.
hosp_buckets = _load_pickle('data/buckets/hosp_buckets.pkl')
icu_buckets = _load_pickle('data/buckets/icu_buckets.pkl')

In [None]:
def _add_eventval(rows, buckets):
    """Return a copy of ``rows`` with an added ``eventval`` column.

    Each row's ``(itemid, value)`` pair is passed to ``bucket_eventval``
    together with ``buckets``.
    """
    # Copy first: `rows` is a boolean-mask selection of the chunk, and adding
    # a column to such a selection triggers SettingWithCopyWarning.
    rows = rows.copy()
    # A plain comprehension also handles a selection with zero rows, where a
    # row-wise DataFrame.apply does not return a Series to assign.
    rows['eventval'] = [
        bucket_eventval(itemid, value, buckets)
        for itemid, value in zip(rows['itemid'], rows['value'])
    ]
    return rows


# Chunk files 0..36 are rewritten in place with an `eventval` column.
for i in range(37):
    print(i)
    chunk_path = f'data/mortality/{i}_final.parquet'
    df = pd.read_parquet(chunk_path)
    # The rewritten file has no hadm_id column, so a chunk without it is
    # skipped - presumably because it was already processed.
    if 'hadm_id' not in df.columns:
        continue
    df = df.drop(columns=['hadm_id']).dropna(subset=['value'])

    # Rows with a stay_id are bucketed with the ICU tables, the remaining
    # rows with the hospital tables.
    has_stay = df['stay_id'].notna()
    stay_df = _add_eventval(df[has_stay], icu_buckets)
    hosp_df = _add_eventval(df[~has_stay], hosp_buckets)
    df = pd.concat([stay_df, hosp_df])

    df.to_parquet(chunk_path)