In [None]:
import os
import pandas as pd
import numpy as np
from scipy.stats import linregress, kurtosis, skew

In [None]:
# Root folder that get_labs() resolves the benchmark label CSVs and parquet shards against.
DATA_DIR = 'data/'

In [None]:
def get_labs(phenotype, data_dir=None, num_shards=34):
    """Collect the event rows of every labeled patient of one benchmark task.

    The patient ids come from ``benchmark/<phenotype>/labeled_patients.csv``;
    the rows come from the parquet shards ``data/output_0.parquet`` through
    ``data/output_<num_shards - 1>.parquet``. Both are resolved under the
    data directory.

    Args:
        phenotype: Name of the benchmark sub-directory, e.g. ``'lab_hypoglycemia'``.
        data_dir: Root directory to read from. ``None`` (default) uses the
            module-level ``DATA_DIR`` as it is bound at call time.
        num_shards: Number of ``output_<i>.parquet`` shards to scan (default 34).

    Returns:
        A DataFrame with the matching rows of all shards, in shard order and
        with each shard's own index labels kept, or ``None`` when no shard
        contains a row for any labeled patient.
    """
    root = DATA_DIR if data_dir is None else data_dir

    label_df = pd.read_csv(os.path.join(root, f'benchmark/{phenotype}/labeled_patients.csv'))
    ids = label_df['patient_id'].unique()

    # Gather the per-shard matches and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated frame every time.
    matches = []
    for i in range(num_shards):
        shard = pd.read_parquet(os.path.join(root, f'data/output_{i}.parquet'))
        shard = shard[shard['patient_id'].isin(ids)]
        if len(shard) > 0:
            matches.append(shard)

    if not matches:
        return None
    return pd.concat(matches)

HYPOGLYCEMIA

In [None]:
# Concept codes listed for the hypoglycemia task.
# NOTE(review): presumably glucose-measurement codes — confirm against the vocabulary.
HYPOGLYCEMIA = ['SNOMED/33747003', 'LOINC/LP416145-3', 'LOINC/14749-6']
# The list was originally named HYPERGLYCEMIA although it sits in the
# hypoglycemia section; keep the old name as an alias so existing uses still work.
HYPERGLYCEMIA = HYPOGLYCEMIA

In [None]:
# Extract the events of the hypoglycemia cohort and store them under DATA_DIR.
# get_labs() returns None when no shard matches, in which case the write fails.
df_hypoglycemia = get_labs(phenotype='lab_hypoglycemia')
df_hypoglycemia.to_parquet(path=os.path.join(DATA_DIR, 'hypoglycemia.parquet'))

HYPERKALEMIA

In [None]:
# Concept codes listed for the hyperkalemia task.
HYPERKALEMIA = [
    'LOINC/LG7931-1',
    'LOINC/LP386618-5',
    'LOINC/LG10990-6',
    'LOINC/6298-4',
    'LOINC/2823-3',
]

In [None]:
# Extract the events of the hyperkalemia cohort and store them under DATA_DIR.
# get_labs() returns None when no shard matches, in which case the write fails.
df_hyperkalemia = get_labs(phenotype='lab_hyperkalemia')
df_hyperkalemia.to_parquet(path=os.path.join(DATA_DIR, 'hyperkalemia.parquet'))

LABELS

In [None]:
# Task selection for the label post-processing cells that follow.
ROOT_DIR = 'data'
# Folder prefix of the task; 'guo_' here, while the lab tasks elsewhere in this notebook use 'lab_'.
TASK_PREFIX = 'guo_'
# Other task names: 'hyperkalemia', 'hypoglycemia', 'hyponatremia', 'thrombocytopenia', 'anemia', 'chexpert'.
TASK = 'icu'
# NOTE: this rebinds DATA_DIR to the task folder, so get_labs() called after
# this cell resolves its paths against the task folder instead of 'data/'.
DATA_DIR = os.path.join(ROOT_DIR, TASK_PREFIX + TASK)

In [None]:
# Read the task's raw labels and discard rows that are exact duplicates.
label_df = pd.read_csv(os.path.join(DATA_DIR, 'labeled_patients.csv')).drop_duplicates()

In [None]:
# Split the label rows into tiers by value with precedence 3 > 2 > 1 > 0:
# a patient who has any row in a higher tier contributes no rows to a lower one.
_value = label_df['value']
_patient = label_df['patient_id']

three_df = label_df[_value == 3]
_in_three = _patient.isin(three_df['patient_id'])

two_df = label_df[(_value == 2) & ~_in_three]
_in_two = _patient.isin(two_df['patient_id'])

one_df = label_df[(_value == 1) & ~_in_three & ~_in_two]
_in_one = _patient.isin(one_df['patient_id'])

neg_df = label_df[(_value == 0) & ~_in_three & ~_in_two & ~_in_one]

In [None]:
# Collapse every tier to one row per patient using the column-wise minimum.
# NOTE(review): min() is taken independently per column, so for a patient with
# several rows in a tier the output values may stem from different rows — confirm
# that this is intended.
three_df, two_df, one_df, neg_df = [
    _tier.groupby('patient_id').min().reset_index()
    for _tier in (three_df, two_df, one_df, neg_df)
]

In [None]:
# Stack the tiers back into a single label table, highest tier first, with a fresh 0..n-1 index.
label_df = pd.concat(
    objs=[three_df, two_df, one_df, neg_df],
    ignore_index=True,
)

In [None]:
# Persist the one-row-per-patient labels inside the task folder.
label_df.to_parquet(path=os.path.join(DATA_DIR, f'{TASK}_label.parquet'))

In [None]:
# For each lab task, keep the original multi-valued label in 'value_multi' and
# replace 'value' with the boolean "label equals 3". The file is rewritten in place.
for task_name in ['anemia', 'hyperkalemia', 'hypoglycemia', 'hyponatremia', 'thrombocytopenia']:
    task_type = f'lab_{task_name}'
    label_path = f'data/{task_type}/{task_name}_label.parquet'
    label_df = pd.read_parquet(label_path)

    # Only snapshot 'value' the first time. On a re-run 'value' is already
    # boolean; copying it again would destroy the multi-valued label and make
    # every 'value' False (a boolean never equals 3).
    if 'value_multi' not in label_df.columns:
        label_df['value_multi'] = label_df['value']
    label_df['value'] = label_df['value_multi'] == 3
    label_df.to_parquet(label_path)

DEMOGRAPHICS

In [None]:
# Lab task processed by the demographics cells.
# Other options: anemia, hyperkalemia, hypoglycemia, hyponatremia.
task_name = 'thrombocytopenia'
task_type = 'lab_' + task_name

In [None]:
# Load the task's event rows and its label table.
df = pd.read_parquet(f'data/{task_type}/{task_name}.parquet')
# The label path previously contained a doubled '/'; it now matches the path
# used where this file is written.
label_df = pd.read_parquet(f'data/{task_type}/{task_name}_label.parquet')

In [None]:
# The label table must hold exactly one row per patient present in the events.
assert df['patient_id'].nunique() == len(label_df)

In [None]:
# Separate the rows of the 'person' table (used below for demographics) from all other events.
_is_person = df['omop_table'] == 'person'
demo_df = df[_is_person]
df = df[~_is_person]

In [None]:
# Smallest 'start' among each patient's non-person events, attached to the labels.
# The inner merge drops labels of patients without any such event.
start_time = df[['patient_id', 'start']].groupby('patient_id', as_index=False).min()
label_df = label_df.merge(start_time, on='patient_id')

In [None]:
# The 'start' of person rows coded 'SNOMED/3950001' is used as the birth timestamp.
# NOTE(review): a patient with several such rows would be duplicated by the left merge.
_is_birth = demo_df['code'] == 'SNOMED/3950001'
birth_df = demo_df[_is_birth][['patient_id', 'start']].rename(columns={'start': 'birth'})
label_df = label_df.merge(birth_df, on='patient_id', how='left')

In [None]:
def _age_on_start(row):
    """Whole years between row['birth'] and row['start'], minus one if the birthday has not yet occurred that year."""
    start, birth = row['start'], row['birth']
    birthday_pending = (start.month, start.day) < (birth.month, birth.day)
    return start.year - birth.year - birthday_pending


# Parse the timestamps, then derive the age at the first event.
label_df['start'] = pd.to_datetime(label_df['start'])
label_df['birth'] = pd.to_datetime(label_df['birth'])
label_df['age'] = label_df.apply(_age_on_start, axis=1)
label_df['prediction_time'] = pd.to_datetime(label_df['prediction_time'])

In [None]:
# Person rows whose code starts with 'Gender/' provide the gender column.
# Unlike race and ethnicity, missing values are not filled here.
_is_gender = demo_df['code'].str.startswith('Gender/')
gender_df = demo_df[_is_gender][['patient_id', 'code']].rename(columns={'code': 'gender'})
label_df = label_df.merge(gender_df, on='patient_id', how='left')

In [None]:
# Person rows whose code starts with 'Race/' provide the race column;
# patients without one get the placeholder 'Race/0'.
_is_race = demo_df['code'].str.startswith('Race/')
race_df = demo_df[_is_race][['patient_id', 'code']].rename(columns={'code': 'race'})
label_df = label_df.merge(race_df, on='patient_id', how='left')
label_df['race'] = label_df['race'].fillna('Race/0')

In [None]:
# Person rows whose code starts with 'Ethnicity/' provide the ethnicity column;
# patients without one get the placeholder 'Ethnicity/None'.
_is_ethnicity = demo_df['code'].str.startswith('Ethnicity/')
ethnicity_df = demo_df[_is_ethnicity][['patient_id', 'code']].rename(columns={'code': 'ethnicity'})
label_df = label_df.merge(ethnicity_df, on='patient_id', how='left')
label_df['ethnicity'] = label_df['ethnicity'].fillna('Ethnicity/None')

In [None]:
# Strip the 'Race/', 'Gender/' and 'Ethnicity/' prefixes, keeping the text after the first '/'.
# The vectorised .str accessor passes missing values through as NaN ('gender' is
# merged without a fill value, and a NaN made the per-element x.split() raise).
# Taking the last piece of a single split also leaves already-stripped values
# unchanged when the cell is re-run.
for _column in ('race', 'gender', 'ethnicity'):
    label_df[_column] = label_df[_column].str.split('/', n=1).str[-1]

In [None]:
# 'label_type' is not kept in the final label table.
label_df = label_df.drop(columns='label_type')

In [None]:
# Displayed, not asserted: is there still exactly one label row per patient with non-person events?
len(label_df) == df['patient_id'].nunique()

In [None]:
# Overwrite the task's label file (now with demographics) and its event file
# (now without the 'person' rows).
label_df.to_parquet(path=f'data/{task_type}/{task_name}_label.parquet')
df.to_parquet(path=f'data/{task_type}/{task_name}.parquet')

In [None]:
# Add integer-coded versions of the categorical demographics to every task's
# '<task>_demo.parquet' file, rewriting the file in place.
for task_name in ['anemia', 'hyperkalemia', 'hypoglycemia', 'hyponatremia', 'thrombocytopenia', 'icu']:
    # 'icu' lives in a 'guo_' folder; every other task in a 'lab_' folder.
    task_type = f'guo_{task_name}' if task_name == 'icu' else f'lab_{task_name}'
    demo_df = pd.read_parquet(f'data/{task_type}/{task_name}_demo.parquet')

    for _attribute in ('gender', 'ethnicity', 'race'):
        # factorize() returns (codes, uniques); only the codes are stored.
        _codes, _uniques = pd.factorize(demo_df[_attribute])
        demo_df[f'{_attribute}_ind'] = _codes

    demo_df.to_parquet(f'data/{task_type}/{task_name}_demo.parquet')

SAMPLE_TIME

In [None]:
# Lab task whose event and label files the cells of this section read and overwrite.
TASK = 'hypoglycemia'

In [None]:
# Label table of the selected lab task.
label_df = pd.read_parquet(path=f'data/lab_{TASK}/{TASK}_label.parquet')

In [None]:
# Events of the selected task, with 'start' parsed and rows ordered by patient and time.
# The sort keeps the index labels the file was stored with.
df = pd.read_parquet(f'data/lab_{TASK}/{TASK}.parquet')
df = (
    df
    .assign(start=lambda frame: pd.to_datetime(frame['start']))
    .sort_values(['patient_id', 'start'])
)

In [None]:
def assign_group_numbers(group):
    """Number the rows of ``group`` by runs separated by a gap of at least one day.

    ``group`` needs a datetime ``start`` column; rows are compared with their
    predecessor in the given order. The first row gets number 1, and the number
    increases by one at every row whose ``start`` is one day or more after the
    previous row's ``start``.

    Returns:
        An integer Series with the same index as ``group``.
    """
    gaps = group['start'].diff()
    # The first row has no predecessor; treat its gap as zero so it never opens a new run.
    gaps = gaps.fillna(pd.Timedelta(days=0))
    opens_new_run = gaps >= pd.Timedelta(days=1)
    return opens_new_run.cumsum() + 1

In [None]:
# Tag every event with its per-patient run number 'N' (see assign_group_numbers).
# Column assignment aligns on index labels. The previous version renumbered the
# result 0..n-1 while df still carried the labels it was stored and sorted with,
# so run numbers could land on the wrong rows or become NaN. Give df a clean
# unique index first and let the per-patient results align by label.
# Each patient's rows must be in time order here, since run numbers depend on it.
df = df.reset_index(drop=True)
_run_numbers = [assign_group_numbers(_rows) for _, _rows in df.groupby('patient_id')]
df['N'] = pd.concat(_run_numbers) if _run_numbers else pd.Series(dtype='int64')

In [None]:
# Run numbers of the events that lie strictly within 24 hours of the patient's prediction time.
_events = df[['patient_id', 'start', 'N']]
_prediction_times = label_df[['patient_id', 'prediction_time']]
pred_df = _events.merge(_prediction_times, on='patient_id')

_distance = (pred_df['start'] - pred_df['prediction_time']).abs()
pred_df = pred_df[_distance < pd.Timedelta(hours=24)]
pred_df = pred_df[['patient_id', 'N']].drop_duplicates()

In [None]:
# Per patient, the smallest and largest run number found near the prediction time.
# The inner merge drops patients that have no event in that window.
_run_range = (
    pred_df
    .groupby('patient_id')['N']
    .agg(N_min='min', N_max='max')
    .reset_index()
)
label_df = label_df.merge(_run_range, on='patient_id')

In [None]:
# Attach each patient's run-number range to their events (inner join on patient_id).
_run_bounds = label_df[['patient_id', 'N_min', 'N_max']]
df = df.merge(_run_bounds, on='patient_id')

In [None]:
# Keep only the events whose run number lies inside [N_min, N_max], then drop the helper columns.
_in_range = (df['N'] >= df['N_min']) & (df['N'] <= df['N_max'])
df = df[_in_range].drop(columns=['N', 'N_min', 'N_max'])

In [None]:
# Overwrite the task's event file with the filtered events.
df.to_parquet(path=f'data/lab_{TASK}/{TASK}.parquet')

In [None]:
# Remove the old 'start' and the run-range helpers; 'start' is recomputed from the filtered events next.
_obsolete = ['start', 'N_min', 'N_max']
label_df = label_df.drop(columns=_obsolete)

In [None]:
# Earliest remaining event per patient, attached to the labels; patients without
# remaining events keep a missing 'start' because of the left join.
start_df = df.groupby('patient_id', as_index=False)['start'].min()
label_df = label_df.merge(start_df, how='left', on='patient_id')

In [None]:
# Split the labels on the boolean 'value' column.
_is_positive = label_df['value']
pos_df = label_df[_is_positive]
neg_df = label_df[~_is_positive]

In [None]:
# Keep positives whose prediction time is at least one hour after their first event.
_lead_time = pos_df['prediction_time'] - pos_df['start']
pos_df = pos_df[_lead_time >= pd.Timedelta(hours=1)]

In [None]:
# Negatives get a sampled 'Sample_time' below, so their 'prediction_time' is dropped.
neg_df = neg_df.drop(columns='prediction_time')

In [None]:
# Seed for the negative sample-time draw, so a re-run reproduces the same labels.
SAMPLE_SEED = 0


def _draw_offsets_hours(pool_hours, n, rng, min_hours=1.5, noise_sd=0.5, max_rounds=1000):
    """Draw ``n`` offsets in hours: a value resampled from ``pool_hours`` plus
    N(0, noise_sd) noise, keeping only draws strictly above ``min_hours``.

    Draws are repeated in batches until ``n`` accepted values exist, so a
    single unlucky batch no longer leaves too few values.

    Raises:
        ValueError: if ``pool_hours`` has no usable value while ``n > 0``, or
            if ``max_rounds`` batches did not yield ``n`` accepted draws.
    """
    if n == 0:
        return np.empty(0)
    pool = np.asarray(pool_hours, dtype=float)
    pool = pool[~np.isnan(pool)]
    if pool.size == 0:
        raise ValueError('no positive lead times available to sample negative offsets from')

    accepted = np.empty(0)
    for _ in range(max_rounds):
        missing = n - accepted.size
        if missing <= 0:
            break
        # Oversample by 20% to compensate for the rejected draws.
        batch = max(int(missing * 1.2), 1)
        draws = rng.choice(pool, size=batch, replace=True) + rng.normal(0, noise_sd, batch)
        accepted = np.concatenate([accepted, draws[draws > min_hours]])
    if accepted.size < n:
        raise ValueError(f'could not draw {n} offsets above {min_hours} hours in {max_rounds} rounds')
    return accepted[:n]


# Hours between first event and prediction time for the positives ...
delta = (pos_df['prediction_time'] - pos_df['start']).dt.total_seconds() / 3600
# ... resampled with jitter to give every negative a comparable offset after its own first event.
times = _draw_offsets_hours(delta, len(neg_df), np.random.default_rng(SAMPLE_SEED))
neg_df['Sample_time'] = neg_df['start'] + pd.Series(
    pd.to_timedelta(times, unit='h').to_numpy(), index=neg_df.index
)

In [None]:
# Recombine positives (with 'prediction_time') and negatives (with 'Sample_time');
# the column missing on either side is filled with missing values by the concat.
label_df = pd.concat(
    objs=[pos_df, neg_df],
    ignore_index=True,
)

In [None]:
# Overwrite the task's label file with the final labels.
label_df.to_parquet(path=f'data/lab_{TASK}/{TASK}_label.parquet')