In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
DATA_DIR = "data"

In [3]:
def try_parse_float(x):
    if isinstance(x, str):
        try:
            x = x.lstrip('<>=').strip()
            x = x.replace(',', ':')
            if ':' in x:
                try:
                    num, denom = x.split(':')
                    return float(num) / float(denom)
                except (ValueError, ZeroDivisionError):
                    return x
            return float(x)
        except ValueError:
            return x
    return x

Inclusion Criteria

In [4]:
# < 96.8, > 100.4 within 24 hours of admission

In [5]:
triage_df = pd.read_csv(os.path.join(DATA_DIR, 'visits.csv'), usecols=['CSN', 'Triage_Temp', 'Arrival_time'])
triage_df['Arrival_time'] = pd.to_datetime(triage_df['Arrival_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))

In [6]:
temp_df = triage_df[(triage_df['Triage_Temp'] > 100.4) | (triage_df['Triage_Temp'] < 96.8)]
temp_df = temp_df.rename(columns={'Arrival_time': 'Temp_time'})
temp_df = temp_df[['CSN', 'Temp_time']]

In [7]:
numerics_df = pd.read_csv(os.path.join(DATA_DIR, 'numerics.csv'), usecols=['CSN', 'Measure', 'Value', 'Time'])
numerics_df = numerics_df[numerics_df['Measure'] == 'Temp']
numerics_df = numerics_df[(numerics_df['Value'] > 100.4) | (numerics_df['Value'] < 96.8)]
numerics_df = numerics_df.groupby('CSN').min()

In [8]:
triage_df = triage_df[['CSN', 'Arrival_time']]
numerics_df = numerics_df.merge(triage_df, on='CSN', how='left')
numerics_df['Time'] = pd.to_datetime(numerics_df['Time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))
numerics_df = numerics_df[numerics_df['Time'] < numerics_df['Arrival_time'] + pd.Timedelta(hours=24)]
numerics_df = numerics_df[['CSN', 'Time']]
numerics_df = numerics_df.rename(columns={'Time': 'Temp_time'})

In [9]:
temp_df = pd.concat([temp_df, numerics_df])
temp_df = temp_df.drop_duplicates()

In [None]:
# Either within 12 hours 
# > 12000 or <4000 per microL
# HR > 90
# RR > 20

In [10]:
labs_df = pd.read_csv(os.path.join(DATA_DIR, 'labs.csv'), usecols=['CSN', 'Order_time', 'Component_name', 'Component_value'])
labs_df = labs_df[labs_df['CSN'].isin(temp_df['CSN'])]
labs_df = labs_df[(labs_df['Component_name'] == 'WBC COUNT') | (labs_df['Component_name'] == 'WHITE BLOOD CELLS (WBC)')]
labs_df['Component_value'] = labs_df['Component_value'].apply(try_parse_float)
labs_df = labs_df[(labs_df['Component_name'] == 'WBC COUNT') & (labs_df['Component_value'] > 12000)
                | (labs_df['Component_name'] == 'WBC COUNT') & (labs_df['Component_value'] < 4000)
                | (labs_df['Component_name'] == 'WHITE BLOOD CELLS (WBC)') & (labs_df['Component_value'] > 12)
                | (labs_df['Component_name'] == 'WHITE BLOOD CELLS (WBC)') & (labs_df['Component_value'] < 4)]
labs_df = labs_df[['CSN', 'Order_time']]
labs_df = labs_df.rename(columns={'Order_time': 'Vital_time'})

In [11]:
visits_df = pd.read_csv(os.path.join(DATA_DIR, 'visits.csv'), usecols=['CSN', 'Triage_HR', 'Triage_RR', 'Arrival_time'])
visits_df = visits_df[visits_df['CSN'].isin(temp_df['CSN'])]
visits_df = visits_df[(visits_df['Triage_HR'] > 90) |(visits_df['Triage_RR'] > 20)]
visits_df = visits_df[['CSN', 'Arrival_time']]
visits_df = visits_df.rename(columns={'Arrival_time': 'Vital_time'})

In [12]:
numerics_df = pd.read_csv(os.path.join(DATA_DIR, 'numerics.csv'), usecols=['CSN', 'Measure', 'Value', 'Time'])
numerics_df = numerics_df[numerics_df['CSN'].isin(temp_df['CSN'])]
numerics_df = numerics_df[((numerics_df['Measure'] == 'HR') & numerics_df['Value'] > 90)
                         |((numerics_df['Measure'] == 'RR') & numerics_df['Value'] > 20)]
numerics_df = numerics_df[['CSN', 'Time']]
numerics_df = numerics_df.rename(columns={'Time': 'Vital_time'})

In [13]:
vitals_df = pd.concat([labs_df, visits_df, numerics_df])
vitals_df['Vital_time'] = pd.to_datetime(vitals_df['Vital_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))
vitals_df = vitals_df.groupby('CSN').min()

In [14]:
vitals_df = vitals_df.merge(triage_df, on='CSN', how='left')
vitals_df = vitals_df.merge(temp_df, on='CSN', how='inner')
vitals_df = vitals_df[(vitals_df['Vital_time'] < vitals_df['Arrival_time'] + pd.Timedelta(hours=24))
                     |((vitals_df['Vital_time'] - vitals_df['Temp_time']).abs() < pd.Timedelta(hours=12))]

In [None]:
# No iv_antibiotics at or before first criteria

In [15]:
ANTIBIOTICS = ['amikacin', 'ampicillin', 'sulbactam', 'azithromycin', 'aztreonam', 'cefamandole', 'cefazolin', 'cefepime', 'cefmetazole', 'cefonicid', 'cefoperazone', 
                 'cefotaxime', 'cefotetan', 'cefoxitin', 'ceftaroline', 'ceftazidime', 'ceftazidime', 'avibactam', 'ceftizoxime', 'ceftolozane', 'tazobactam', 'ceftriaxone', 
                 'cefuroxime', 'cephalothin', 'cephapirin', 'chloramphenicol', 'ciprofloxacin', 'clindamycin', 'cloxacillin', 'colistin', 'dalbavancin', 'daptomycin', 'doripenem', 
                 'doxycycline', 'ertapenem', 'gatifloxacin', 'gentamicin', 'imipenem', 'kanamycin', 'levofloxacin', 'lincomycin', 'linezolid', 'meropenem', 'methicillin', 
                 'metronidazole', 'mezlocillin', 'minocycline', 'moxifloxacin', 'nafcillin', 'oritavancin', 'oxacillin', 'penicillin', 'piperacillin', 'pileracillin', 'tazobactam', 
                 'polymyxin B', 'quinupristin', 'dalfopristin', 'streptomycin', 'tedizolid', 'telavancin', 'ticarcillin', 'ticarcillin', 'clavulanate', 'tigecycline', 'tobramycin', 
                 'trimethoprim', 'sulfamethoxazole', 'vancomycin', 'amoxicillin', 'clavulanate', 'amoxicillin', 'ampicillin', 'azithromycin', 'cefaclor', 'cefadroxil', 'cefdinir', 
                 'cefditoren', 'cefixime', 'cefpodoxime', 'cefprozil', 'ceftibuten', 'cefuroxime', 'cephalexin', 'cephradine', 'chloramphenicol', 'cinoxacin', 'ciprofloxacin', 
                 'clarithromycin', 'clindamycin', 'cloxacillin', 'dicloxacillin', 'doxycycline', 'fidaxomicin', 'fosfomycin', 'gatifloxacin', 'levofloxacin', 'lincomycin', 
                 'linezolid', 'metronidazole', 'minocycline', 'moxifloxacin', 'nitrofurantoin', 'norfloxacin', 'ofloxacin', 'penicillin', 'pivampicillin', 'rifampin', 
                 'sulfadiazine', 'sulfadiazine', 'trimethoprim', 'sulfamethoxazole', 'sulfisoxazole', 'tedizolid', 'telithromycin', 'tetracycline', 'trimethoprim', 'trimethoprim', 
                 'sulfamethoxazole', 'vancomycin', 'amphotericin B', 'anidulafungin', 'caspofungin', 'fluconazole', 'itraconazole', 'micafungin', 'posaconazole', 'voriconazole',
                 'fluconazole', 'itraconazole', 'posaconazole', 'voriconazole', 'acyclovir', 'ganciclovir', 'cidofovir', 'foscarnet', 'peramivir', 'Oseltamivir']

In [16]:
orders_df = pd.read_csv(os.path.join(DATA_DIR, 'orders.csv'), usecols=['CSN', 'Order_time', 'Procedure_name'])
orders_df['Order_time'] = pd.to_datetime(orders_df['Order_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))
orders_df = orders_df[orders_df['CSN'].isin(vitals_df['CSN'])]
orders_df = orders_df[orders_df['Procedure_name'].str.contains('|'.join(ANTIBIOTICS), na=False, case=False)]
orders_df = orders_df[['CSN', 'Order_time']]
orders_df = orders_df.groupby('CSN').min()

In [17]:
cohort_df = orders_df.merge(vitals_df, on='CSN')
cohort_df = cohort_df[cohort_df['Order_time'] > cohort_df['Vital_time']]
cohort_df = cohort_df[['CSN', 'Arrival_time']]
cohort_df = cohort_df.drop_duplicates()

In [46]:
cohort_df['CSN'].nunique()

15027

Suspicion of infection

In [19]:
orders_df = pd.read_csv(os.path.join(DATA_DIR, 'orders.csv'), usecols=['CSN', 'Order_time', 'Procedure_name'])
orders_df['Order_time'] = pd.to_datetime(orders_df['Order_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))

In [20]:
orders_df = orders_df[orders_df['CSN'].isin(cohort_df['CSN'])]

In [21]:
# Blood culture order within first 48 hours
blood_df = orders_df[(orders_df['Procedure_name'].str.contains("blood", case=False, na=False))
                    & (orders_df['Procedure_name'].str.contains("cult", case=False, na=False))]
blood_df = blood_df[['CSN', 'Order_time']]

In [22]:
blood_df = blood_df.groupby('CSN').min()
blood_df = blood_df.merge(triage_df, on='CSN', how='left')
blood_df = blood_df[blood_df['Order_time'] < blood_df['Arrival_time'] + pd.Timedelta(hours=48)]

In [None]:
# 4QAD within BC - at least 1 in IV, less than 1 day gap

In [23]:
IV = ['amikacin', 'ampicillin', 'sulbactam', 'azithromycin', 'aztreonam', 'cefamandole', 'cefazolin', 'cefepime', 'cefmetazole', 'cefonicid', 
    'cefoperazone', 'cefotaxime', 'cefotetan', 'cefoxitin', 'ceftaroline', 'ceftazidime', 'ceftazidime', 'avibactam', 'ceftizoxime', 'ceftolozane', 'tazobactam', 
    'ceftriaxone', 'cefuroxime', 'cephalothin', 'cephapirin', 'chloramphenicol', 'ciprofloxacin', 'clindamycin', 'cloxacillin', 'colistin', 'dalbavancin', 
    'daptomycin', 'doripenem', 'doxycycline', 'ertapenem', 'gatifloxacin', 'gentamicin', 'imipenem', 'kanamycin', 'levofloxacin', 'lincomycin', 'linezolid', 
    'meropenem', 'methicillin', 'metronidazole', 'mezlocillin', 'minocycline', 'moxifloxacin', 'nafcillin', 'oritavancin', 'oxacillin', 'penicillin', 'piperacillin',
    'pileracillin', 'tazobactam', 'polymyxin B', 'quinupristin', 'dalfopristin', 'streptomycin', 'tedizolid', 'telavancin', 'ticarcillin', 'ticarcillin', 
    'clavulanate', 'tigecycline', 'tobramycin', 'trimethoprim', 'sulfamethoxazole', 'vancomycin']

In [24]:
anti_df = orders_df[orders_df['Procedure_name'].str.contains('|'.join(ANTIBIOTICS), na=False, case=False)]
anti_df['IV'] = anti_df['Procedure_name'].apply(lambda x: any([i in x.lower() for i in IV]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anti_df['IV'] = anti_df['Procedure_name'].apply(lambda x: any([i in x.lower() for i in IV]))


In [96]:
anti_df = anti_df[anti_df['CSN'].isin(anti_df.groupby('CSN')['Order_time'].agg(lambda x: x.max() - x.min()).loc[lambda x: x > pd.Timedelta(days=1)].index)]

In [25]:
anti_df = anti_df.sort_values(['CSN', 'Order_time']).groupby('CSN').agg(list).reset_index()

In [101]:
anti_df = anti_df[anti_df['Order_time'].apply(lambda x: (pd.Series(x).diff().iloc[1:] < pd.Timedelta(days=1)).all())]

In [26]:
anti_df = anti_df[anti_df['IV'].apply(sum) > 0]
anti_df = anti_df[anti_df['IV'].apply(sum) >= 4]

In [27]:
infection = list(set(blood_df['CSN']).intersection(anti_df[anti_df['IV'].apply(len) >= 2]['CSN']))
len(infection)

3873

eSOFA

In [28]:
blood_df = blood_df.rename(columns={'Order_time': 'Blood_time'})
blood_df = blood_df[blood_df['CSN'].isin(infection)]

In [29]:
def two_blood_time(df):
    df = df.merge(blood_df, on='CSN', how='left')
    df = df[(df['Order_time'] - df['Blood_time']).abs() <= pd.Timedelta(days=2)]
    return df

In [30]:
MV = ['RESP - VENTILATOR SETTINGS', 'RESP - NON INVASIVE POS. PRESS VENT (BIPAP/CPAP)',
      'RESP - LUNG PROTECTIVE VENTILATION PROTOCOL', 'RESP - NITRIC OXIDE VENT',
      'RESP - INTRAPULMONARY PERCUSSIVE VENTILATION', 'RESP - WEAN VENTILATOR',
      'RESP - NON INVASIVE POS. PRESS VENT FOR NEUROMUSCULAR DISEASE',
      'RESP - MONITOR, PERFORM VENTILATOR AND PATIENT ASSESSMENT',
      'LEAK SPEECH VENTILATION']
mv_df = orders_df[(orders_df['Procedure_name'].isin(MV))]

In [31]:
CARDIO = ['norepinephrine', 'dopamine', 'epinephrine', 'phenylephrine', 'vasopressin']
cardio_df = orders_df[(orders_df['Procedure_name'].str.contains('|'.join(CARDIO), na=False, case=False)) & (orders_df['Procedure_name'].str.contains('IV', na=False, case=False))]

In [32]:
mv_df = two_blood_time(mv_df)
cardio_df = two_blood_time(cardio_df)

In [33]:
labs_df = pd.read_csv(os.path.join(DATA_DIR, 'labs.csv'), usecols=['CSN', 'Order_time', 'Component_name', 'Component_value', 'Component_units'])
labs_df['Component_value'] = labs_df['Component_value'].apply(try_parse_float)
labs_df['Order_time'] = pd.to_datetime(labs_df['Order_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))

In [34]:
LACTATE = ['POC:LACTATE, ISTAT', 'LACTATE.WHOLE BLD']
lactate_df = labs_df[labs_df['Component_name'].isin(LACTATE)]
lactate_df = lactate_df[lactate_df['Component_value'] > 2]

In [35]:
lactate_df = two_blood_time(lactate_df)

In [36]:
HEPATIC = ['BILIRUBIN, TOTAL']
bilirubin_df = labs_df[labs_df['Component_name'].isin(HEPATIC)]

In [37]:
bilirubin_df = two_blood_time(bilirubin_df)
bilirubin_min = bilirubin_df.groupby('CSN')['Component_value'].min().reset_index(name='min')
bilirubin_df = bilirubin_df.merge(bilirubin_min, how='left', on= 'CSN')
bilirubin_df = bilirubin_df[(bilirubin_df['Component_value'] > 2) & (bilirubin_df['Component_value'] > 2 * bilirubin_df['min'])]

In [38]:
CREATININE = ['CREATININE', 'POC:CREATININE,ISTAT', 'CREATININE, SERUM']
creatinine_df = labs_df[labs_df['Component_name'].isin(CREATININE)]

In [39]:
creatinine_df = two_blood_time(creatinine_df)
creatinine_df_min = creatinine_df.groupby('CSN')['Component_value'].min().reset_index(name='min')
creatinine_df = creatinine_df.merge(creatinine_df_min, how='left', on= 'CSN')
creatinine_df = creatinine_df[(creatinine_df['Component_value'] > 2 * creatinine_df['min'])]

In [40]:
GFR = ['EGFR FOR AFRICAN AMERICAN', 'EGFR REFIT WITHOUT RACE (2021)', 'EGFR (CYSTATIN C)', 'EGFR (CREAT/CYSTATIN C)',
       'POC:EGFR FOR AFRICAN AMERICAN', 'POC:EGFR REFIT WITHOUT RACE (2021)', 'EGFR BY CYSTATIN C:SERUM']
gfr_df = labs_df[labs_df['Component_name'].isin(GFR)]

In [41]:
gfr_df = two_blood_time(gfr_df)
gfr_max = gfr_df.groupby('CSN')['Component_value'].max().reset_index(name='max')
gfr_df = gfr_df.merge(gfr_max, how='left', on= 'CSN')
gfr_df = gfr_df[(gfr_df['Component_value'] * 2 < gfr_df['max'])]

In [42]:
COAG = ['PLATELET COUNT (PLT)']
coag_df = labs_df[labs_df['Component_name'].isin(COAG)]

In [43]:
coag_df = two_blood_time(coag_df)
coag_max = coag_df.groupby('CSN')['Component_value'].max().reset_index(name='max')
coag_df = coag_df.merge(coag_max, how='left', on= 'CSN')
coag_df = coag_df[(coag_df['max'] > 100) & (coag_df['Component_value'] * 2 < coag_df['max']) & (coag_df['Component_value'] < 100)]

In [44]:
eSOFA = pd.concat([mv_df[['CSN', 'Order_time']], cardio_df[['CSN', 'Order_time']], 
                   lactate_df[['CSN', 'Order_time']], bilirubin_df[['CSN', 'Order_time']], 
                   creatinine_df[['CSN', 'Order_time']], gfr_df[['CSN', 'Order_time']], 
                   coag_df[['CSN', 'Order_time']]])
eSOFA = eSOFA.groupby('CSN').min().reset_index()
eSOFA = eSOFA.rename(columns={'Order_time' : 'Trigger_time'})
len(eSOFA)

1704

In [121]:
eSOFA = cohort_df.merge(eSOFA, on='CSN', how='left')
eSOFA['Label'] = ~(eSOFA['Trigger_time'].isna())
eSOFA['Trigger_time'] = (eSOFA['Trigger_time'] - eSOFA['Arrival_time']).dt.total_seconds() / 3600
eSOFA = eSOFA[['CSN', 'Label', 'Trigger_time']]

In [123]:
eSOFA['CSN'].nunique()

14866

In [60]:
eSOFA.to_parquet('eSOFA.parquet')

In [None]:
label_df = eSOFA[~eSOFA['Trigger_time'].isna()]
bin_width = 0.5
min_val = label_df['Trigger_time'].min()
max_val = label_df['Trigger_time'].max()
bins = np.arange(min_val, max_val + bin_width, bin_width)

# Plot the histogram
plt.figure(figsize=(12, 4))
plt.hist(label_df['Trigger_time'], bins=bins, edgecolor='black')
plt.title('Time-to-Event')
plt.xlabel('Trigger_time (h)')
plt.ylabel('# Encounters')
plt.grid(True)
plt.show()

qSOFA

In [None]:
order_df = pd.read_csv(os.path.join(DATA_DIR, 'orders.csv'), usecols=['CSN', 'Procedure_name', 'Order_time'])
visits_df = pd.read_csv(os.path.join(DATA_DIR, 'visits.csv'), usecols=['CSN', 'CC', 'Triage_SBP', 'Triage_RR', 'Arrival_time'])
order_df = order_df.rename(columns={'Order_time': 'Time'})
visits_df = visits_df.rename(columns={'Arrival_time': 'Time'})

In [None]:
# altered mental status
mental_df = order_df[order_df['Procedure_name'].str.contains('mental', case=False, na=False)][['CSN', 'Time']]
mental_visits_df = visits_df[visits_df['CC'].str.contains('altered mental', case=False, na=False)][['CSN', 'Time']]
mental_df = pd.concat([mental_df, mental_visits_df])

In [None]:
numerics_df = pd.read_csv(os.path.join(DATA_DIR, 'numerics.csv'), usecols=['CSN', 'Measure', 'Value', 'Time'])

In [None]:
# SBP < 100mmHg
sbp_df = numerics_df[(numerics_df['Measure'] == 'SBP') & (numerics_df['Value'] < 100)][['CSN', 'Time']]
sbp_df2 = visits_df[visits_df['Triage_SBP'] < 100][['CSN', 'Time']]
sbp_df = pd.concat([sbp_df, sbp_df2])

In [None]:
# RR > 22/min
rr_df = numerics_df[(numerics_df['Measure'] == 'RR') & (numerics_df['Value'] > 22)][['CSN', 'Time']]
rr_df2 = visits_df[visits_df['Triage_RR'] > 22][['CSN', 'Time']]
rr_df = pd.concat([rr_df, rr_df2])

In [None]:
mental_df['Measure'] = 'MENTAL'
sbp_df['Measure'] = 'SBP'
rr_df['Measure'] = 'RR'

In [None]:
qSOFA = pd.concat([mental_df, sbp_df, rr_df])
qSOFA['Time'] = pd.to_datetime(qSOFA['Time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:]))
qSOFA = qSOFA.sort_values(['CSN', 'Time']).groupby('CSN').agg(list).reset_index()
qSOFA = qSOFA[qSOFA['Measure'].apply(lambda x: len(set(x))) >= 3]

Create Dataset

In [64]:
import pickle

In [61]:
COLS_NAME = ['event', 'time', 'value']
COLS = {
    "labs": ['Component_name', 'Order_time', 'Component_value'],
    "numerics": ['Measure', 'Time', 'Value'],
    "orders": ['Procedure_ID', 'Order_time'],
}

In [62]:
def read_csv_fn(fn, label_df):
    cols = COLS[fn]
    df = pd.read_csv(os.path.join(DATA_DIR, f'{fn}.csv'), usecols=['CSN'] + cols)
    df = df[df['CSN'].isin(label_df['CSN'])]
    df = df.rename(columns={c: COLS_NAME[i] for i, c in enumerate(cols)})
    df = df.dropna()
    df['event'] = df['event'].astype(str)
    return df

In [65]:
def bucket_eventval(event, val, d):
    buckets = d[event]
    ind = np.searchsorted(buckets, val, side='right')
    if ind == len(buckets):
        eventval = f"{event}|{buckets[ind-1]}-"
    else:
        eventval = f"{event}|{buckets[ind-1]}-{buckets[ind]}"
    return eventval

In [66]:
def bucket_ind(event, val, d):
    buckets = d[event]
    ind = np.searchsorted(buckets, val, side='right')
    return ind

In [71]:
def create_df():
    label_df = pd.read_parquet('eSOFA.parquet')
    visits_df = pd.read_csv(os.path.join(DATA_DIR, 'visits.csv'), usecols=['CSN', 'Arrival_time']) # 'Age', 'Gender', 'Race', 'Ethnicity'

    vitals_df = read_csv_fn('numerics', label_df)
    with open('../next_token/numerics_buckets.pkl', 'rb') as f:
        buckets = pickle.load(f)
    vitals_df['eventval'] = vitals_df.apply(lambda x: bucket_eventval(x['event'], x['value'], buckets), axis=1)
    vitals_df['buckets'] = vitals_df.apply(lambda x: bucket_ind(x['event'], x['value'], buckets), axis=1)

    labs_df = read_csv_fn('labs', label_df)
    labs_df['value'] = labs_df['value'].replace([None], 0.0).apply(try_parse_float)
    labs_df['value'] = labs_df['value'].apply(lambda x: 0.0 if isinstance(x, str) and 'pos' in x.lower() else x)
    labs_df['value'] = labs_df['value'].apply(lambda x: 1.0 if isinstance(x, str) and (any(sub in x.lower() for sub in ['neg', 'not', 'none', 'auto'])) else x)

    with open('../next_token/labs_buckets.pkl', 'rb') as f:
        buckets = pickle.load(f)
    labs_df['eventval'] = labs_df.apply(lambda x: bucket_eventval(x['event'], x['value'], buckets), axis=1)
    labs_df['buckets'] = labs_df.apply(lambda x: bucket_ind(x['event'], x['value'], buckets), axis=1)

    orders_df = read_csv_fn('orders', label_df)
    orders_df['value'] = 0
    orders_df['buckets'] = 0
    orders_df['eventval'] = orders_df['event']

    df = pd.concat([labs_df, vitals_df, orders_df])
    df = df.merge(visits_df, on='CSN', how='left')

    df['time'] = df['time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])
    df['Arrival_time'] = df['Arrival_time'].apply(lambda x: str(int(x[:4]) - 500) + x[4:])
    df['time'] = pd.to_datetime(df['time']) - pd.to_datetime(df['Arrival_time'])
    df['time'] = df['time'].dt.total_seconds() / 3600
    df = df.drop(columns=['Arrival_time'])
    return df

In [72]:
df = create_df()

In [74]:
df.to_parquet("eSOFA_data.parquet")

eSOFA criteria buckets

In [None]:
import pickle

In [2]:
df = pd.read_parquet("../data/nesy.parquet")

In [None]:
LACTATE = ['POC:LACTATE, ISTAT', 'LACTATE.WHOLE BLD']
HEPATIC = ['BILIRUBIN, TOTAL']
CREATININE = ['CREATININE', 'POC:CREATININE,ISTAT', 'CREATININE, SERUM']
GFR = ['EGFR FOR AFRICAN AMERICAN', 'EGFR REFIT WITHOUT RACE (2021)', 'EGFR (CYSTATIN C)', 'EGFR (CREAT/CYSTATIN C)',
       'POC:EGFR FOR AFRICAN AMERICAN', 'POC:EGFR REFIT WITHOUT RACE (2021)', 'EGFR BY CYSTATIN C:SERUM']
COAG = ['PLATELET COUNT (PLT)']

In [None]:
with open('../next_token/labs_buckets.pkl', 'rb') as f:
    labs_dict = pickle.load(f)

In [72]:
def bucket_ind(event, val):
    buckets = labs_dict[event]
    ind = np.searchsorted(buckets, val, side='right')
    return ind

In [68]:
ranges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [89]:
def criteria_bucket(x):
    buckets = {}
    n_hepatic, n_creatinine, n_gfr, n_coag = x
    n_lactate = 2
    for i, n, fn in zip([HEPATIC, CREATININE, GFR, COAG, LACTATE], [n_hepatic, n_creatinine, n_gfr, n_coag, n_lactate], ['gt', 'gt', 'lt', 'lt', 'gt']):
        if np.isnan(n): continue
        for j in i:
            ind = bucket_ind(j, n)
            if fn == 'gt': buckets[j]= ranges[ind:]
            else: buckets[j] = ranges[:ind+1]
    return buckets

In [90]:
df['criteria_bucket'] = df['criteria'].apply(criteria_bucket)

In [92]:
df = df.to_parquet("../data/nesy.parquet")

eSOFA

In [None]:
DATA_DIR = 'mc-med'
MCMED_DIR = "mc-med/mc-med-1.0.0/data"

CARDIO = ['norepinephrine', 'dopamine', 'epinephrine', 'phenylephrine', 'vasopressin']
MV = ['RESP - VENTILATOR SETTINGS', 'RESP - NON INVASIVE POS. PRESS VENT (BIPAP/CPAP)',
      'RESP - LUNG PROTECTIVE VENTILATION PROTOCOL', 'RESP - NITRIC OXIDE VENT',
      'RESP - INTRAPULMONARY PERCUSSIVE VENTILATION', 'RESP - WEAN VENTILATOR',
      'RESP - NON INVASIVE POS. PRESS VENT FOR NEUROMUSCULAR DISEASE',
      'RESP - MONITOR, PERFORM VENTILATOR AND PATIENT ASSESSMENT',
      'LEAK SPEECH VENTILATION']
LACTATE = ['POC:LACTATE, ISTAT', 'LACTATE.WHOLE BLD']

HEPATIC = ['BILIRUBIN, TOTAL']
CREATININE = ['CREATININE', 'POC:CREATININE,ISTAT', 'CREATININE, SERUM']
GFR = ['EGFR FOR AFRICAN AMERICAN', 'EGFR REFIT WITHOUT RACE (2021)', 'EGFR (CYSTATIN C)', 'EGFR (CREAT/CYSTATIN C)',
       'POC:EGFR FOR AFRICAN AMERICAN', 'POC:EGFR REFIT WITHOUT RACE (2021)', 'EGFR BY CYSTATIN C:SERUM']
COAG = ['PLATELET COUNT (PLT)']

In [None]:
def get_baseline(x):
    hepatic, creatinine, gfr, coag = None, None, None, None
    for e, v in zip(x['event'], x['value']):
        if e in HEPATIC:
            if hepatic is None or v < hepatic: hepatic = v
        elif e in CREATININE:
            if creatinine is None or v < creatinine: creatinine = v
        elif e in GFR:
            if gfr is None or v > gfr: gfr = v
        elif e in COAG:
            if coag is None or v > coag: coag = v
    return hepatic, creatinine, gfr, coag

In [None]:
def renal():
    pmh_df = pd.read_csv(os.path.join(MCMED_DIR, "pmh.csv"), usecols=['MRN', 'Code'])
    visits_df = pd.read_csv(os.path.join(MCMED_DIR, "visits.csv"), usecols=['MRN', 'CSN', 'Dx_ICD9', 'Dx_ICD10'])

    pmh_df = pmh_df[pmh_df['Code'] == 'N186']
    pmh_df = pmh_df.merge(visits_df[['CSN', 'MRN']], on='MRN', how='left')

    visits_df = visits_df[(visits_df['Dx_ICD9'] == 'N18.6') | (visits_df['Dx_ICD10'] == 'N18.6')]

    renal = pd.concat([visits_df[['CSN']], pmh_df[['CSN']]])
    renal = renal.drop_duplicates()
    return renal

In [None]:
def get_criteria_csn(csn, x, renal_ids):
    print(csn)
    hepatic, creatinine, gfr, coag = x
    target_hepatic = hepatic * 2 if not hepatic is None else None
    target_creatinine = None if (csn in renal_ids['CSN'] or creatinine is None) else creatinine * 2
    target_gfr = None if (csn in renal_ids['CSN'] or gfr is None) else gfr / 2
    target_coag = coag / 2 if ((not coag is None) and coag > 100) else None
    return target_hepatic, target_creatinine, target_gfr, target_coag

In [None]:
def get_criteria(x):
    hepatic, creatinine, gfr, coag = x
    target_hepatic = hepatic * 2 if not hepatic is None else None
    target_creatinine = creatinine * 2 if not creatinine is None else None
    target_gfr = gfr / 2 if not gfr is None else None
    target_coag = coag / 2 if ((not coag is None) and coag > 100) else None
    return target_hepatic, target_creatinine, target_gfr, target_coag

In [None]:
def get_eSOFA(x, tokens, tokens_dict):
    n_hepatic, n_creatinine, n_gfr, n_coag = x
    
    resp = [e for e in tokens if e.startswith(tuple(MV))]
    cardio = [e for e in tokens if e.startswith(tuple(CARDIO))]

    lactate = [e for e in tokens if e.startswith(tuple(LACTATE))]
    lactate = [e for e in lactate if (tokens_dict[e][2] is None or tokens_dict[e][2] > 2)]
    eSOFA = resp + cardio + lactate

    if not n_hepatic is None:
        hepatic = [e for e in tokens if e.startswith(tuple(HEPATIC))]
        hepatic = [e for e in hepatic if (tokens_dict[e][2] is None or tokens_dict[e][2] > max(2, n_hepatic))]
        eSOFA += hepatic

    if not n_creatinine is None:   
        creatinine = [e for e in tokens if e.startswith(tuple(CREATININE))]
        creatinine = [e for e in creatinine if (tokens_dict[e][2] is None or tokens_dict[e][2] > n_creatinine)]
        eSOFA += creatinine

    if not n_gfr is None:
        gfr = [e for e in tokens if e.startswith(tuple(GFR))]
        gfr = [e for e in gfr if (tokens_dict[e][1] is None or tokens_dict[e][1] < n_gfr)]
        eSOFA += gfr

    if not n_coag is None:
        coag = [e for e in tokens if e.startswith(tuple(COAG))]
        coag = [e for e in coag if (tokens_dict[e][1] is None or tokens_dict[e][1] < min(100, n_coag))]
        eSOFA += coag

    return eSOFA

In [None]:
df = pd.read_parquet(os.path.join(DATA_DIR, "data/eSOFA_data.parquet"))
label_df = pd.read_parquet(os.path.join(DATA_DIR, "data/eSOFA.parquet"))

In [None]:
renal_ids = renal()

In [None]:
tokens = list(set().union(*df['eventval']))
tokens_dict = {}
for t in tokens:
    if '|' in t:
        event, range = t.split('|')
        if range[0] == '-':
            min_val, max_val = range[1:].split('-', 1)
            min_val = '-' + min_val
        else:
            min_val, max_val = range.split('-', 1)
        
        min_val = float(min_val) if len(min_val) > 0 else None
        max_val = float(max_val) if len(max_val) > 0 else None
        tokens_dict[t] = (event, min_val, max_val)
    else:
        tokens_dict[t] = (t, None, None)

df['baseline'] = df.apply(get_baseline, axis=1)
df['criteria'] = df.apply(lambda x: get_criteria_csn(x['CSN'], x['baseline']), axis=1)
df['eSOFA'] = df['criteria'].apply(lambda x: get_eSOFA(x, tokens, tokens_dict))
df.to_parquet('data/nesy.parquet')