In [2]:
# Patients who will experience clinical deterioration
# new onset among patients with initially normal vital signs (first 15 minutes)

# tachycardia: HR > 110
# hypotension: MAP < 65
# hypoxia: SpO2 < 90

In [1]:
import os
import pandas as pd
import pickle
import numpy as np

In [2]:
DATA_DIR = '/home/mkeoliya/projects/mc-med/mc-med-1.0.0/data'

In [3]:
NUM_COLS = ['HR', 'RR', 'SpO2', 'SBP', 'DBP', 'MAP', 'Temp', 'Perf'] # Pain HRV, LPM_O2

EXCLUDE INITIALLY SICK

In [3]:
visits_cols = ['MRN', 'CSN', #'Age', 'Gender', 'Race', 'Ethnicity', 
               'Triage_HR', 'Triage_SpO2', 'CC', 'Dx_ICD10', 'Dx_name',
               'Arrival_time', 'Roomed_time', 'Dispo_time', 'Admit_time', 'Departure_time']

In [4]:
visits_df = pd.read_csv(os.path.join(DATA_DIR, 'visits.csv'), usecols=visits_cols)

In [5]:
visits_df['Arrival_time'] = pd.to_datetime(visits_df['Arrival_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))
# visits_df['Roomed_time'] = pd.to_datetime(visits_df['Roomed_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))

In [6]:
triage_df = visits_df[['CSN', 'Triage_HR', 'Triage_SpO2']]
triage_df = triage_df[(triage_df['Triage_HR'] > 110) | (triage_df['Triage_SpO2'] < 90)]

EXCLUDE INITIALLY SICK

In [7]:
N = 90 # MINUTES

In [9]:
numerics_df = pd.read_csv(os.path.join(DATA_DIR, 'numerics.csv'))
numerics_df = numerics_df[~numerics_df['CSN'].isin(triage_df['CSN'])]

In [10]:
numerics_df['Time'] = pd.to_datetime(numerics_df['Time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))

In [11]:
initial_df = numerics_df.merge(visits_df[['CSN', 'Arrival_time', 'Roomed_time']], on='CSN')

In [12]:
initial_df = initial_df[initial_df['Time'] <= initial_df['Arrival_time'] + pd.Timedelta(minutes=N)]

In [13]:
initial_df = initial_df[((initial_df['Measure'] == 'HR') & (initial_df['Value'] > 110))
                        |(initial_df['Measure'] == 'MAP') & (initial_df['Value'] < 65)
                        |(initial_df['Measure'] == 'SpO2') & (initial_df['Value'] < 90)]

In [14]:
numerics_df = numerics_df[~numerics_df['CSN'].isin(initial_df['CSN'])]

In [15]:
numerics_df['CSN'].nunique()

79948

EXCLUDE FEW EVENTS

In [None]:
measure_counts = numerics_df.groupby('CSN')['Measure'].unique()
csns_with_many_measures = measure_counts.apply(lambda x: set(list(x)).issuperset(set(NUM_COLS))).index
numerics_df = numerics_df[numerics_df['CSN'].isin(csns_with_many_measures)]

In [16]:
measure_counts = numerics_df.groupby('CSN')['Measure'].count()
csns_with_many_measures = measure_counts[measure_counts >= 10].index
numerics_df = numerics_df[numerics_df['CSN'].isin(csns_with_many_measures)]

In [17]:
numerics_df['CSN'].nunique()

75531

POSITIVE LABELS

In [18]:
HR_df = numerics_df[(numerics_df['Measure'] == 'HR') & (numerics_df['Value'] > 110)]
MAP_df = numerics_df[(numerics_df['Measure'] == 'MAP') & (numerics_df['Value'] < 65)]
SpO2_df = numerics_df[(numerics_df['Measure'] == 'SpO2') & (numerics_df['Value'] < 90)]

In [19]:
pos_df = pd.concat([HR_df, MAP_df, SpO2_df])
pos_df = pos_df[['CSN', 'Time']].groupby('CSN').agg(min).reset_index().rename(columns={'Time': 'Criteria_time'})

  pos_df = pos_df[['CSN', 'Time']].groupby('CSN').agg(min).reset_index().rename(columns={'Time': 'Criteria_time'})


In [20]:
print(pos_df['CSN'].nunique())

10863


CREATE DATASET

In [21]:
df = visits_df[['CSN', 'Arrival_time', 'Roomed_time']]
df = df[df['CSN'].isin(numerics_df['CSN'])]

In [22]:
df = df.merge(pos_df, on='CSN', how='left')

In [23]:
df['Label'] = ~df['Criteria_time'].isna()

In [24]:
df['Criteria_from_arrive'] = (df['Criteria_time'] - df['Arrival_time']).dt.total_seconds() / 3600
# df['Criteria_from_room'] = (df['Criteria_time'] - df['Roomed_time']).dt.total_seconds() / 3600

In [25]:
df = df.drop(columns=['Arrival_time', 'Roomed_time', 'Criteria_time'], axis=1)

In [28]:
df.to_parquet("data/decompensation.parquet")

NEG SAMPLE TIMES

In [29]:
neg_df = df[df['Label'] == False]
pos_df = df[df['Label'] == True]

In [30]:
from scipy.stats import gaussian_kde

In [31]:
original = (pos_df['Criteria_from_arrive'].values)
kde = gaussian_kde(original)
neg_df['Sample_time'] = kde.resample(size=len(neg_df)).flatten()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_df['Sample_time'] = kde.resample(size=len(neg_df)).flatten()


In [32]:
label_df = pd.concat([neg_df, pos_df])
df = df.merge(label_df[['CSN', 'Sample_time']], on='CSN', how='left')

In [34]:
df.to_parquet("data/decompensation.parquet")

SPLIT

In [16]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test = train_test_split(df, test_size=0.4, random_state=1234)
X_test, X_val = train_test_split(X_test, test_size=0.5, random_state=1234)

In [None]:
X_train.to_parquet('data/decomp_train.parquet')
X_val.to_parquet('data/decomp_val.parquet')
X_test.to_parquet('data/decomp_test.parquet')

ADD NUMERICS

In [5]:
label_df = pd.read_parquet('data/decompensation.parquet')

In [None]:
COLS_NAME = ['event', 'time', 'value']
COLS = {
    "labs": ['Component_name', 'Order_time', 'Component_value'],
    "numerics": ['Measure', 'Time', 'Value'],
    "orders": ['Procedure_ID', 'Order_time'],
}

In [7]:
def try_parse_float(x):
    if isinstance(x, str):
        try:
            x = x.lstrip('<>=').strip()
            x = x.replace(',', ':')
            if ':' in x:
                try:
                    num, denom = x.split(':')
                    return float(num) / float(denom)
                except (ValueError, ZeroDivisionError):
                    return x
            return float(x)
        except ValueError:
            return x
    return x

In [8]:
def read_csv_fn(fn, label_df):
    cols = COLS[fn]
    df = pd.read_csv(os.path.join(DATA_DIR, f'{fn}.csv'), usecols=['CSN'] + cols)
    df = df[df['CSN'].isin(label_df['CSN'])]
    df = df.rename(columns={c: COLS_NAME[i] for i, c in enumerate(cols)})
    df = df.dropna()
    df['event'] = df['event'].astype(str)
    return df

In [9]:
def bucket_eventval(event, val, d):
    buckets = d[event]
    ind = np.searchsorted(buckets, val, side='right')
    if ind == len(buckets):
        eventval = f"{event}|{buckets[ind-1]}-"
    else:
        eventval = f"{event}|{buckets[ind-1]}-{buckets[ind]}"
    return eventval

In [10]:
def bucket_ind(event, val, d):
    buckets = d[event]
    ind = np.searchsorted(buckets, val, side='right')
    return ind

In [13]:
def create_df():
    visits_df = pd.read_csv(os.path.join(DATA_DIR, 'visits.csv'), usecols=['CSN', 'Arrival_time', 'Roomed_time']) # 'Age', 'Gender', 'Race', 'Ethnicity'

    vitals_df = read_csv_fn('numerics', label_df)
    # vitals_df = vitals_df[vitals_df['event'].isin(NUM_COLS)]
    
    with open('next_token/numerics_buckets.pkl', 'rb') as f:
        buckets = pickle.load(f)
    vitals_df['eventval'] = vitals_df.apply(lambda x: bucket_eventval(x['event'], x['value'], buckets), axis=1)
    vitals_df['buckets'] = vitals_df.apply(lambda x: bucket_ind(x['event'], x['value'], buckets), axis=1)

    df = vitals_df.merge(visits_df, on='CSN', how='left')

    df['time'] = df['time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])
    df['Arrival_time'] = df['Arrival_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])
    df['Roomed_time'] = df['Roomed_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])

    df['time_arrive'] = pd.to_datetime(df['time']) - pd.to_datetime(df['Arrival_time'])
    df['time_arrive'] = df['time_arrive'].dt.total_seconds() / 3600

    df['time_room'] = pd.to_datetime(df['time']) - pd.to_datetime(df['Roomed_time'])
    df['time_room'] = df['time_room'].dt.total_seconds() / 3600

    df = df.drop(columns=['Arrival_time', 'Roomed_time', 'time'])
    return df

In [14]:
df = create_df()

In [15]:
df.to_parquet("data/decompensation_data.parquet")

ADD OTHER EVENTS

In [20]:
def create_df():
    visits_df = pd.read_csv(os.path.join(DATA_DIR, 'visits.csv'), usecols=['CSN', 'Arrival_time', 'Roomed_time']) # 'Age', 'Gender', 'Race', 'Ethnicity'

    vitals_df = read_csv_fn('numerics', label_df)
    with open('next_token/numerics_buckets.pkl', 'rb') as f:
        buckets = pickle.load(f)
    vitals_df['eventval'] = vitals_df.apply(lambda x: bucket_eventval(x['event'], x['value'], buckets), axis=1)
    vitals_df['buckets'] = vitals_df.apply(lambda x: bucket_ind(x['event'], x['value'], buckets), axis=1)

    labs_df = read_csv_fn('labs', label_df)
    labs_df['value'] = labs_df['value'].replace([None], 0.0).apply(try_parse_float)
    labs_df['value'] = labs_df['value'].apply(lambda x: 0.0 if isinstance(x, str) and 'pos' in x.lower() else x)
    labs_df['value'] = labs_df['value'].apply(lambda x: 1.0 if isinstance(x, str) and (any(sub in x.lower() for sub in ['neg', 'not', 'none', 'auto'])) else x)

    with open('next_token/labs_buckets.pkl', 'rb') as f:
        buckets = pickle.load(f)
    labs_df['eventval'] = labs_df.apply(lambda x: bucket_eventval(x['event'], x['value'], buckets), axis=1)
    labs_df['buckets'] = labs_df.apply(lambda x: bucket_ind(x['event'], x['value'], buckets), axis=1)

    orders_df = read_csv_fn('orders', label_df)
    orders_df['value'] = 0
    orders_df['buckets'] = 0
    orders_df['eventval'] = orders_df['event']

    df = pd.concat([labs_df, vitals_df, orders_df])
    df = df.merge(visits_df, on='CSN', how='left')

    df['time'] = df['time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])
    df['Arrival_time'] = df['Arrival_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])
    df['Roomed_time'] = df['Roomed_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])

    df['time_arrive'] = pd.to_datetime(df['time']) - pd.to_datetime(df['Arrival_time'])
    df['time_arrive'] = df['time_arrive'].dt.total_seconds() / 3600

    df['time_room'] = pd.to_datetime(df['time']) - pd.to_datetime(df['Roomed_time'])
    df['time_room'] = df['time_room'].dt.total_seconds() / 3600

    df = df.drop(columns=['Arrival_time', 'Roomed_time', 'time'])
    return df

In [21]:
df = create_df()

In [22]:
df.to_parquet("data/decomp_data.parquet")

UPDATE LABELS

In [38]:
TASK = 'eSOFA' # 'eSOFA' 'decompensation

In [40]:
df = pd.read_parquet(f'data/{TASK}_demo.parquet')
label_df = pd.read_parquet(f'data/{TASK}.parquet')

In [None]:
label_df['Time'] = label_df.apply(lambda x: x['Trigger_time'] if x['Label'] else x['Sample_time'], axis=1)

In [None]:
df['Gender_ind'] = pd.factorize(df['Gender'])[0]
df['Race_ind'] = pd.factorize(df['Race'])[0]
df['Ethnicity_ind'] = pd.factorize(df['Ethnicity'])[0]

In [42]:
df['Race_str'] = 'Race|' + df['Race']
df['Ethnicity_str'] = 'Ethnicity|' + df['Ethnicity']
df['Gender_str'] = 'Gender|' + df['Gender']

In [43]:
bins = [0, 20, 30, 40, 50, 60, 70, 80, 90]
labels = ['0-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90']
df['Age_str'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)

In [45]:
df.to_parquet(f'data/{TASK}_demo.parquet')

In [None]:
label_df.to_parquet(f'data/{TASK}.parquet')

ADD CRITERIA

In [8]:
with open('next_token/numerics_buckets.pkl', 'rb') as f:
        buckets = pickle.load(f)

In [None]:
hr_bucket = np.searchsorted(buckets['HR'], 110, side='right')
map_bucket = np.searchsorted(buckets['MAP'], 65, side='right')
SpO2_bucket = np.searchsorted(buckets['SpO2'], 90, side='right')
print(hr_bucket, map_bucket, SpO2_bucket)

In [26]:
hr_buckets = [10, 11]
map_bucket = [1]
SpO2_buckets = [1]

In [27]:
hr_criteria, map_criteria, spo2_criteria = [], [], []
for inds, event, criteria in [(hr_buckets, 'HR', hr_criteria), (map_bucket, 'MAP', map_criteria), (SpO2_buckets, 'SpO2', spo2_criteria)]:
    for ind in inds:
        bucket = buckets[event]
        if ind == len(bucket):
            eventval = f"{event}|{bucket[ind-1]}-"
        else:
            eventval = f"{event}|{bucket[ind-1]}-{bucket[ind]}"
        criteria.append(eventval)
hr_criteria, map_criteria, spo2_criteria

(['HR|106.0667-268.0', 'HR|268.0-'], ['MAP|1.0-75.6667'], ['SpO2|0.0-94.7907'])

TOKENIZER

In [8]:
df = pd.read_parquet('/home/mkeoliya/projects/mc-med/data/eventval.parquet')
df2 = pd.read_parquet('/home/mkeoliya/projects/mc-med/data/decompensation_demo.parquet')
df3 = pd.read_parquet('/home/mkeoliya/projects/mc-med/data/eSOFA_demo.parquet')

In [9]:
vocab_list = list(set().union(*df['eventval']))
vocab_list += list(df2['Race_str'].unique()) + list(df2['Ethnicity_str'].unique()) + list(df2['Gender_str'].unique()) + list(df2['Age_str'].unique())

In [10]:
vocab_list = [x for x in vocab_list if x is not None]
vocab_list = list(set(vocab_list))

In [12]:
from tokenizers import Tokenizer, models

In [13]:
special_list = ['[UNK]', '[BOS]', '[EOS]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
vocab_list = special_list + vocab_list

vocab = {token: idx for idx, token in enumerate(vocab_list)}

tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="[UNK]"))

tokenizer.save(os.path.join('data', f'eventval_demo_tokenizer.json'))