In [1]:
!pip install -q PyAthena

## Prepare Datasets for Predictor Training, Validation and Testing

In [1]:
from dataproc.cohort import query_esbl_pts, remove_dups, observation_window
from dataproc.sampling import generate_samples
from hyper_params import HyperParams
import pandas as pd

In [2]:
# load hyperparams instance
params = HyperParams()

In [3]:
# Select esbl microbiology test
esbl_admits = query_esbl_pts()
# Remove dups
esbl_admits = remove_dups(esbl_admits)
# Create observation window
esbl_admits_window = observation_window(esbl_admits, window_size=params.observation_window_hours)
# Subset columns
pts_labels = esbl_admits_window[['hadm_id', 'RESISTANT_YN']]
pts_labels.to_pickle('data/patient_labels.pkl')

In [4]:
pts_labels = pd.read_pickle('data/patient_labels.pkl')
pts_labels.head()

Unnamed: 0,hadm_id,RESISTANT_YN
4262,101757,0
4267,186474,0
4268,194730,0
4269,112086,0
4270,158569,1


In [5]:
# Loading the features
from dataproc.create_dataset import dataset_creation

features = dataset_creation(pts_labels['hadm_id'])
features.to_pickle('data/features.pkl')

In [6]:
features = pd.read_pickle('data/features.pkl')
features.head()

Unnamed: 0,hadm_id,subject_id,10378-8,10535-3,10839-9,11555-0,11556-8,11557-6,11558-4,13362-9,...,9322-9,admittime,admission_type,admission_location,insurance,language,religion,marital_status,ethnicity,gender
0,100021,29971,,,,3.0,90.0,51.0,7.38,RANDOM,...,,2109-08-17 10:55:00,EMERGENCY,EMERGENCY ROOM ADMIT,Medicaid,SPAN,UNOBTAINABLE,MARRIED,HISPANIC OR LATINO,M
1,100045,1569,2+,,,0.0,152.0,35.0,7.44,RANDOM,...,,2176-02-05 18:40:00,EMERGENCY,EMERGENCY ROOM ADMIT,Medicare,ENGL,CATHOLIC,WIDOWED,WHITE,F
2,100050,29633,,,,0.0,519.0,40.0,7.4,RANDOM,...,,2179-09-19 13:00:00,ELECTIVE,PHYS REFERRAL/NORMAL DELI,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,M
3,100053,24912,,,,-8.0,119.0,30.0,7.33,RANDOM,...,,2124-07-14 03:19:00,EMERGENCY,EMERGENCY ROOM ADMIT,Private,,NOT SPECIFIED,SINGLE,WHITE,M
4,100113,58240,,,,,,,,,...,,2153-01-05 00:35:00,EMERGENCY,CLINIC REFERRAL/PREMATURE,Medicare,ENGL,JEWISH,SINGLE,WHITE,M


In [7]:
loinc_codes = list(features.drop(columns=['hadm_id', 'subject_id', 'admittime']).columns)[:-8]

In [8]:
# Embedding the features
from dataproc.embeddings import loinc_values

loinc_vals = loinc_values(loinc_codes)
loinc_vals.dropna(subset=['value'], inplace=True)
loinc_vals = loinc_vals.astype({'value': 'string', 'loinc_code': 'category'})
loinc_vals['value'] = loinc_vals['value'].map(lambda x: x.lstrip('LESS THAN '))
loinc_vals['value'] = loinc_vals['value'].map(lambda x: x.lstrip('GREATER THAN '))
loinc_vals['value'] = loinc_vals['value'].map(lambda x: x.lstrip('>GREATER THAN '))
loinc_vals['value'] = loinc_vals['value'].map(lambda x: x.lstrip('<LESS THAN '))
loinc_vals['value'] = loinc_vals['value'].map(lambda x: x.rstrip(' NG/ML'))
loinc_vals['value'] = loinc_vals['value'].map(lambda x: x.lstrip('<>'))
loinc_vals['value'] = loinc_vals['value'].map(lambda x: x.replace(',', '.'))
loinc_vals.drop(list(loinc_vals.loc[loinc_vals['value'] == 'UNABLE TO ANALYZE'].index),  inplace=True)
loinc_vals.drop(list(loinc_vals.loc[loinc_vals['value'] == 'MOLYSIS FALSELY DECREASES THIS RESULT'].index),  inplace=True)
loinc_vals.drop(list(loinc_vals.loc[loinc_vals['value'] == 'COMPUTER NETWORK FAILURE. TEST NOT RESULTED.'].index),  inplace=True)
loinc_vals.drop(list(loinc_vals.loc[loinc_vals['value'] == 'UNABLE TO DETERMINE'].index),  inplace=True)
loinc_vals.drop(list(loinc_vals.loc[loinc_vals['value'] == ':UNABLE TO DETERMINE'].index),  inplace=True)
loinc_vals.drop(list(loinc_vals.loc[loinc_vals['value'] == 'UNABLE TO QUANTITATE'].index),  inplace=True)
loinc_vals.drop(list(loinc_vals.loc[loinc_vals['value'] == 'UNABLE TO REPORT'].index),  inplace=True)

In [9]:
numeric = []
categorical = []
weird = []
for code in loinc_codes:
    size = len(loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value'])
    size_unique = len(loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value'].unique())
    sum_na = pd.to_numeric(loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value'], errors='coerce').isna().sum()
    if sum_na / size < 0.05:
        numeric.append(code)
    elif sum_na / size > 0.05 and size_unique < 100:
        categorical.append(code)
    else:
        weird.append(code)

In [10]:
len(loinc_codes)

144

In [11]:
len(numeric)

95

In [12]:
len(categorical)

38

In [13]:
len(weird)

11

In [14]:
weird

['3297-9',
 '4023-8',
 '4548-4',
 '48065-7',
 '5642-4',
 '5787-7',
 '5793-5',
 '5796-8',
 '5808-1',
 '5821-4',
 '6773-6']

In [15]:
worth_it = ['6773-6','5808-1','5787-7','5821-4']

In [18]:
numeric_stats = []
for code in numeric:
    a = pd.to_numeric(loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value'], errors='coerce').describe()
    numeric_stats.append(a)
numeric_stats_df = pd.concat(numeric_stats, axis=1, keys=numeric)

In [19]:
numeric_stats_df

Unnamed: 0,10535-3,10839-9,11555-0,11556-8,11557-6,11558-4,1644-4,1742-6,1751-7,17849-1,...,763-3,772-4,777-3,785-6,786-4,787-2,788-0,789-8,804-5,9322-9
count,8358.0,11447.0,490527.0,490523.0,490504.0,530658.0,24986.0,219448.0,146690.0,10073.0,...,75587.0,16564.0,778241.0,747560.0,747921.0,747537.0,746408.0,747568.0,753127.0,18128.0
mean,1.129834,7.479856,-0.090604,136.737019,42.736738,7.379366,167.710318,129.960002,3.178038,2.534686,...,3.046492,13.766542,239.295564,30.220467,33.508648,90.264973,15.799282,3.510249,10.499787,3.920383
std,0.749934,16.189266,5.353903,92.114782,11.388981,0.087115,229.124373,536.810586,0.757973,1.775861,...,6.148193,59.786263,150.321123,2.574024,1.573992,6.987865,2.356128,0.656843,9.006713,2.365156
min,0.02,0.0,-414.0,0.0,0.0,0.0,1.0,0.0,0.9,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8
25%,0.6,0.3,-3.0,81.0,36.0,7.33,85.0,18.0,2.6,1.4,...,0.0,1.0,139.0,28.9,32.5,86.0,14.1,3.05,6.4,2.8
50%,1.0,0.4,0.0,109.0,41.0,7.39,124.0,32.0,3.1,2.1,...,0.0,3.0,215.0,30.3,33.6,90.0,15.3,3.44,9.1,3.5
75%,1.4,4.7,3.0,156.0,47.0,7.44,187.0,69.0,3.7,3.2,...,3.0,8.0,307.0,31.6,34.6,94.0,17.0,3.9,12.8,4.5
max,8.0,575.0,162.0,1914.0,247.0,7.99,12496.0,25460.0,6.9,22.1,...,94.0,2879.0,4504.0,48.9,45.8,147.0,36.4,32.97,846.7,65.4


In [18]:
for code in worth_it:
    print(len(loinc_vals.loc[loinc_vals['loinc_code'] == str(code), 'value'].unique()))

663
843
108
734


In [6]:
# splot to train / validation / test sets
df_train, df_validation, df_test = generate_samples(df_dataset = pts_labels,
                 negative_to_positive_ratio = params.negative_to_positive_ratio,
                 test_set_fraction = params.test_set_fraction,
                 validation_set_fraction = params.validation_set_fraction,
                 random_state = params.random_state)

In [14]:
# persist datasets
df_train.to_csv('data/df_predictor_train.csv')
df_validation.to_csv('data/df_predictor_validation.csv')
df_test.to_csv('data/df_predictor_test.csv')

## Prepare Datasets for Encoder Training

In [1]:
from dataproc.cohort import query_esbl_pts, remove_dups, observation_window
from dataproc.sampling import generate_samples
from hyper_params import HyperParams
import pandas as pd

In [8]:
# load hyperparams instance
params = HyperParams()

In [None]:
# TODO James
# TODO Run the query that loads all patient data into => df_all_patients_data
assert df_all_patients_data

In [6]:
# splot to train / validation / test sets
df_embedding_train, df_embedding_validation, df_embedding_test = generate_samples(df_dataset = df_all_patients_data,
                 negative_to_positive_ratio = params.negative_to_positive_ratio,
                 test_set_fraction = params.test_set_fraction,
                 validation_set_fraction = params.validation_set_fraction,
                 random_state = params.random_state)

In [7]:
df_embedding_train.to_csv('data/df_embedding_train.csv')
df_embedding_validation.to_csv('data/df_embedding_validation.csv')
df_embedding_test.to_csv('data/df_embedding_test.csv')