In [1]:
import sys
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Repository root. Defaults to the original location, but can be overridden
# with the EHR_OOD_ROOT environment variable so the notebook also runs on
# machines where the repository lives elsewhere.
ROOT_PATH = os.environ.get('EHR_OOD_ROOT', '/home/karina/ehr_ood_detection/')

# Make the project's `src` package importable; the guard avoids piling up
# duplicate entries when this cell is re-run.
if ROOT_PATH not in sys.path:
    sys.path.append(ROOT_PATH)

from src.mappings import MAPPING_KEYS
from src.models.info import AVAILABLE_MODELS
from src.utils.datahandler import DataHandler, load_data_from_origin
from src.experiments.plot_results import load_novelty_scores_from_origin

In [2]:
# Name of the dataset this notebook works on; it is passed to the loaders and
# also names the per-dataset results folder below.
DATA_ORIGIN = "VUmc"

# Project directories, all derived from ROOT_PATH.
RESULT_DIR = os.path.join(ROOT_PATH, "data/results")
PLOT_DIR = os.path.join(ROOT_PATH,"img/experiments")
STATS_DIR = os.path.join(ROOT_PATH,"data/stats")

# Per-dataset results folder. The joined component must be relative:
# os.path.join drops everything before an absolute component, so the previous
# "/VUmc/" made SAVE_PATH evaluate to "/VUmc/" instead of a folder under
# RESULT_DIR. The empty last component keeps the trailing separator.
SAVE_PATH = os.path.join(RESULT_DIR, DATA_ORIGIN, "")

### Load VUmc data

In [31]:
# Text summary of the dataset selected by DATA_ORIGIN.
# NOTE: this cell reads data_loader, feature_names and the train/test/val
# splits, so the cells that create them must have been run before this one.
print(f'{DATA_ORIGIN} Dataset Summary')

# Configuration entries registered for this dataset in MAPPING_KEYS.
for setting, value in MAPPING_KEYS[DATA_ORIGIN].items():
    print('\t', setting, ':', value)

print('\n')
# Number of rows in each split.
splits = [
    ("train samples", train_data),
    ("test samples", test_data),
    ("validation samples", val_data),
]
for split_label, split in splits:
    print('\t', split_label, ":", len(split))

print('\n')

# total: every column of the training frame
# specified: entries listed under data_loader['columns_to_use']
# selected: columns of the training frame picked by feature_names
print('\tnumber of features total: ', train_data.shape[1])
print('\tnumber of features specified: ', len(data_loader['columns_to_use']))
print('\tnumber of features selected: ', train_data[feature_names].shape[1])


VUmc Dataset Summary
	 data_folder : /data/interim/VUmc/df_model_combined.csv
	 feature_names_path : /data/interim/VUmc/MLflow/columns_to_use.pkl
	 target_name : readmission_or_mortality_after_discharge


	 train samples : 12644
	 test samples : 2710
	 validation samples : 2710


	number of features total:  2401
	number of features specified:  1209
	number of features selected:  1196


In [15]:
# Fetch the loader settings for DATA_ORIGIN and build a DataHandler from them.
# data_loader is unpacked as keyword arguments here and is also indexed by
# 'columns_to_use' in the dataset summary cell.
data_loader = load_data_from_origin(DATA_ORIGIN)
dh = DataHandler(**data_loader)

In [16]:
# Feature names and target name as provided by the handler; feature_names is
# used throughout the notebook to select columns from the splits.
feature_names = dh.load_feature_names()
y_name = dh.load_target_name()

# The three data splits, unpacked in the order train / test / validation.
train_data, test_data, val_data = dh.load_data_splits()

In [17]:
# Preview the test split restricted to the columns named in feature_names.
test_data[feature_names]

Unnamed: 0,age,alat__first__overall,alat__is_measured__first_24h,alat__is_measured__last_24h,alat__is_measured__overall,alat__last__overall,alat__maximum__first_24h,alat__maximum__last_24h,alat__maximum__overall,alat__mean__first_24h,...,weight__mean__first_24h,weight__mean__last_24h,weight__mean__overall,weight__mean__overall__diff,weight__minimum__first_24h,weight__minimum__last_24h,weight__minimum__overall,weight__standard_deviation__first_24h,weight__standard_deviation__last_24h,weight__standard_deviation__overall
12776,66.379899,,0.0,0.0,0.0,,,,,,...,70.040000,,70.040000,,70.0,,70.0,0.120000,,0.120000
8728,35.637774,127.0,0.0,0.0,1.0,115.0,,,127.0,,...,90.000000,,90.000000,,90.0,,90.0,0.000000,,0.000000
10988,81.776674,135.0,1.0,0.0,1.0,58.0,135.0,,227.0,124.00,...,70.000000,,70.000000,,70.0,,70.0,0.000000,,0.000000
6797,62.330449,,0.0,0.0,0.0,,,,,,...,109.000000,109.00,109.000000,0.000000,109.0,109.0,109.0,0.000000,0.00,0.000000
9590,39.469092,,0.0,0.0,0.0,,,,,,...,47.900000,,47.925000,,47.7,,47.7,0.141421,,0.129904
6615,57.489022,,0.0,0.0,0.0,,,,,,...,95.000000,95.00,95.000000,0.000000,95.0,95.0,95.0,0.000000,0.00,0.000000
3358,30.854026,21.6,1.0,1.0,1.0,21.6,21.6,21.6,21.6,21.60,...,,,,,,,,,,
7755,62.357742,,0.0,0.0,0.0,,,,,,...,93.000000,93.00,93.000000,0.000000,93.0,93.0,93.0,0.000000,0.00,0.000000
183,76.156613,,0.0,0.0,0.0,,,,,,...,,,,,,,,,,
12877,77.464435,,0.0,0.0,0.0,,,,,,...,75.000000,75.00,75.000000,0.000000,75.0,75.0,75.0,0.000000,0.00,0.000000


### Load novelty scores for VUmc data

In [None]:
def export_novelty_scores(frames=None, save_dir=None):
    """Write novelty-score tables to CSV files and return the paths written.

    This cell originally contained only the ``def`` line, which is a syntax
    error. The body is the reusable form of the export done by hand in the
    last cell of the notebook.

    Parameters
    ----------
    frames : dict[str, pandas.DataFrame], optional
        Maps a file stem to the table saved as ``<stem>.csv``. When omitted,
        the notebook-level ``novelty_df`` and ``novelty_df_scaled`` are
        exported as ``<origin>_novelty_test`` and
        ``<origin>_novelty_test_scaled`` (``<origin>`` being the lower-cased
        ``DATA_ORIGIN``), so both frames must exist at call time.
    save_dir : str, optional
        Target directory, created if it does not exist. Defaults to
        ``<RESULT_DIR>/<DATA_ORIGIN>``.

    Returns
    -------
    list[str]
        Paths of the files written, in the iteration order of ``frames``.
    """
    if frames is None:
        # Resolved at call time so the function can be defined before the
        # score tables are built.
        stem = DATA_ORIGIN.lower() + "_novelty_test"
        frames = {stem: novelty_df, stem + "_scaled": novelty_df_scaled}
    if save_dir is None:
        save_dir = os.path.join(RESULT_DIR, DATA_ORIGIN)

    os.makedirs(save_dir, exist_ok=True)

    written = []
    for stem, frame in frames.items():
        path = os.path.join(save_dir, stem + ".csv")
        # Keep the index: it carries the row labels of the test samples.
        frame.to_csv(path, index=True)
        written.append(path)
    return written

In [22]:
# Load the stored novelty scores (and metrics) of every available model for
# this data origin.
novelty_scores, metrics = load_novelty_scores_from_origin(
    AVAILABLE_MODELS, RESULT_DIR, DATA_ORIGIN
)

# Keep only the 'test' entry of each scoring function.
novelty_test = {
    score_name: novelty_scores[score_name]['test']
    for score_name in novelty_scores.keys()
}

In [56]:
# Unscaled novelty scores: one column per entry of novelty_test, with rows
# labelled by the index of the test split.
novelty_df = pd.DataFrame(novelty_test)
novelty_df.index = test_data[feature_names].index

# Min-max scaled copy of the scores (each column scaled independently),
# keeping the same row labels and column names as the unscaled table.
scaler = MinMaxScaler()
novelty_df_scaled = pd.DataFrame(
    scaler.fit_transform(novelty_df),
    index=novelty_df.index,
    columns=novelty_df.columns,
)
novelty_df_scaled

Unnamed: 0,NN (entropy),NN (max prob),AE (reconstr err),AnchoredNNEnsemble (entropy),AnchoredNNEnsemble (std),AnchoredNNEnsemble (mutual information),VAE (reconstr err),VAE (latent prob),VAE (latent prior prob),VAE (reconstr err grad),PPCA (log prob),NNEnsemble (entropy),NNEnsemble (std),NNEnsemble (mutual information),MCDropout (entropy),MCDropout (std),MCDropout (mutual information),LOF (outlier score)
12776,0.155322,0.045246,0.010294,0.287624,2.372690e-01,1.246045e-01,0.013272,0.824526,0.042016,0.014393,0.014671,0.127097,0.029934,0.010313,0.170021,0.031250,0.880795,0.004791
8728,0.230017,0.075065,0.019418,0.485671,5.903897e-01,4.346072e-01,0.021545,0.790118,0.029450,0.005333,0.035109,0.228033,0.034041,0.006676,0.202234,0.034722,0.649007,0.003625
10988,0.783462,0.468480,0.041073,0.900127,8.830441e-01,7.783715e-01,0.022085,0.860574,0.056346,0.003325,0.060930,0.802536,0.329291,0.116460,0.704671,0.027778,0.211921,0.003027
6797,0.003808,0.000583,0.004120,0.000015,3.783740e-06,1.913513e-06,0.004601,0.787474,0.042662,0.063254,0.007150,0.003972,0.000361,0.000094,0.004823,0.000109,0.415977,0.002378
9590,0.182401,0.055544,0.021146,0.030985,1.767691e-02,9.317846e-03,0.019553,0.853457,0.025660,0.003235,0.033770,0.132045,0.033596,0.011495,0.183230,0.017361,0.337748,0.007043
6615,0.039026,0.008401,0.004133,0.026355,1.045528e-02,5.671450e-03,0.003030,0.822198,0.032523,0.037569,0.005518,0.032511,0.002063,0.000244,0.034199,0.002170,0.180464,0.002533
3358,0.099181,0.025842,0.009087,0.056374,3.631382e-02,1.934159e-02,0.011172,0.756684,0.051357,0.033204,0.008053,0.057505,0.010988,0.003445,0.040449,0.003472,0.844371,0.000558
7755,0.049932,0.011257,0.006955,0.348313,3.252758e-01,1.791265e-01,0.003777,0.818208,0.036511,0.045569,0.005491,0.043079,0.006716,0.001974,0.031694,0.000868,0.903974,0.003212
183,0.163802,0.048407,0.011520,0.286052,1.783719e-01,9.421599e-02,0.013367,0.777818,0.045579,0.013487,0.019635,0.201998,0.034448,0.008595,0.144527,0.006944,0.350993,0.005535
12877,0.032812,0.006848,0.003871,0.000509,1.592282e-04,8.225351e-05,0.005703,0.816414,0.028602,0.058875,0.005148,0.024505,0.003530,0.001122,0.021312,0.001736,0.110099,0.001372


In [68]:
# Save the unscaled and scaled test-set novelty scores under
# <RESULT_DIR>/<DATA_ORIGIN>/, which is the same location as the previously
# hard-coded absolute paths.
# The former path-less to_csv(index=True) calls only returned the CSV text as
# unused strings, so they are dropped; the index is written by default.
save_dir = os.path.join(RESULT_DIR, DATA_ORIGIN)
os.makedirs(save_dir, exist_ok=True)  # make sure the target folder exists

novelty_df.to_csv(os.path.join(save_dir, 'vumc_novelty_test.csv'))
novelty_df_scaled.to_csv(os.path.join(save_dir, 'vumc_novelty_test_scaled.csv'))
                  