We use this script to process the MIMIC-III dataset extracted by Benchmarking_DL_MIMICIII and get the input files for the GRU-D models.

Please set `benchmarking_root_folder` to the base directory of the __Benchmarking_DL_MIMICIII__ project.

In [None]:
import os

import numpy as np

In [None]:
# [RD]: base directory of the Benchmarking_DL_MIMICIII project.
benchmarking_root_folder = os.path.join('..', 'Benchmarking_DL_MIMICIII')
# [WD]: working directory of the GRU-D project.
grud_working_folder = '.'
# Number of hours of data to use; it also selects the '<hours>hrs_raw' input folder below.
hours = 48
benchmarking_data_folder = os.path.join(
    benchmarking_root_folder, 'Data', 'admdata_99p',
    '{}hrs_raw'.format(hours), 'series', 'mv'
)
print('The data files from the benchmarking codebase should be saved here:', benchmarking_data_folder)
# Listing the folder doubles as a sanity check: os.listdir raises
# FileNotFoundError right away if the benchmarking output is not where expected.
print('Files in this folder:', os.listdir(benchmarking_data_folder))

grud_mimic3_data_folder = os.path.join(grud_working_folder, 'data', 'mimic3')
# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs(grud_mimic3_data_folder, exist_ok=True)
print('The data files will be converted and saved here:', grud_mimic3_data_folder)

In [None]:
# Convert normed-ep.npz to data.npz
# The per-admission series are truncated element by element further down,
# which only works on object-dtype (ragged) arrays. NumPy >= 1.16.3 refuses to
# load such arrays unless allow_pickle=True is passed explicitly.
# NOTE: unpickling is only safe for trusted files; this archive is expected to
# come from your own run of the benchmarking pipeline.
raw_data = np.load(
    os.path.join(benchmarking_data_folder, 'normed-ep.npz'),
    allow_pickle=True
)
timestamp = raw_data['T_t']
processed_data = {
    'input': raw_data['X_t'],
    'masking': raw_data['X_t_mask']
}
# Number of entries in T_t, i.e. how many series the truncation loop visits.
n = len(timestamp)

In [None]:
# Re-base every series to start at time 0 and cut timestamp, input and masking
# down to the first `hours` hours (the comparison below is done in seconds).
window_seconds = hours * 60 * 60
for i in range(n):
    if i % 1000 == 0:
        print('.', end='')  # one progress dot per 1000 series
    # Subtract the first timestamp so that every series starts at 0.
    shifted = np.asarray(timestamp[i]) - timestamp[i][0]
    # Count the time steps inside the window; that count is then used as a
    # prefix length (assumes timestamps are in ascending order -- TODO confirm).
    n_kept = np.sum(shifted <= window_seconds)
    timestamp[i] = shifted[:n_kept]
    for series in processed_data.values():
        series[i] = series[i][:n_kept]

In [None]:
# Collect the labels (in-hospital mortality and ICD-9 categories) under the
# key names they will have in data.npz.
processed_label = {}
processed_label['label_mortality'] = raw_data['y_mor']
processed_label['label_icd9'] = raw_data['y_icd9']

In [None]:
# Write timestamps, inputs, masks and labels into a single compressed archive.
data_npz_path = os.path.join(grud_mimic3_data_folder, 'data.npz')
np.savez_compressed(
    data_npz_path,
    timestamp=timestamp,
    **processed_data,
    **processed_label
)

In [None]:
# process fold
# The fold arrays presumably hold index arrays of differing lengths (object
# dtype -- TODO confirm); NumPy >= 1.16.3 only loads those with
# allow_pickle=True. The flag is harmless for plain numeric arrays.
# NOTE: unpickling is only safe for trusted files; this archive is expected to
# come from your own run of the benchmarking pipeline.
fold_data = np.load(
    os.path.join(benchmarking_data_folder, '5-folds.npz'),
    allow_pickle=True
)

# Keep only the [0][0] entry of each fold array.
processed_fold = {
    'fold_mortality': fold_data['folds_ep_mor'][0][0],
    'fold_icd9': fold_data['folds_ep_icd9_multi'][0][0]
}

In [None]:
# Per-fold statistics written by the benchmarking code. They are presumably
# stored as object arrays (TODO confirm), which NumPy >= 1.16.3 only loads with
# allow_pickle=True; the flag is harmless for plain numeric arrays.
# NOTE: unpickling is only safe for trusted files.
data_stats = np.load(
    os.path.join(benchmarking_data_folder, 'normed-ep-stdized.npz'),
    allow_pickle=True
)

In [None]:
# Pull the per-fold mean/std out of the statistics arrays.
# Axis layout of each array: n_labeltype, 1, n_fold, (ts, nts), (mean, std)
mortality_stats = data_stats['folds_ep_mor']
icd9_stats = data_stats['folds_ep_icd9_multi']
per_fold_stats = [
    ('mean_mortality', mortality_stats[0, 0, :, 0, 0]),
    ('std_mortality', mortality_stats[0, 0, :, 0, 1]),
    ('mean_icd9', icd9_stats[0, 0, :, 0, 0]),
    ('std_icd9', icd9_stats[0, 0, :, 0, 1]),
]

processed_stats = {}
for k, fold_values in per_fold_stats:
    # Add a trailing axis and repeat each fold's entry 3 times along it
    # (presumably one copy per data split -- TODO confirm).
    processed_stats[k] = np.repeat(fold_values[:, np.newaxis], 3, axis=-1)

In [None]:
# Write the fold entries and their statistics into a single compressed archive.
fold_npz_path = os.path.join(grud_mimic3_data_folder, 'fold.npz')
np.savez_compressed(
    fold_npz_path,
    **processed_fold,
    **processed_stats
)

In [None]:
# size check
# Reload both archives this script wrote and print the shape of every entry.
# The saved arrays can be object-dtype (ragged per-admission series), which
# NumPy >= 1.16.3 only loads with allow_pickle=True. These files were produced
# by this script, so unpickling them is trusted.
data = np.load(os.path.join(grud_mimic3_data_folder, 'data.npz'), allow_pickle=True)
for k in data.keys():
    print(k, data[k].shape)

fold = np.load(os.path.join(grud_mimic3_data_folder, 'fold.npz'), allow_pickle=True)
for k in fold.keys():
    # Each access to an npz entry re-reads it from disk, so fetch it only once.
    entry = fold[k]
    print(k, entry.shape)
    for f in entry:
        print('\t', [x.shape for x in f])


In [None]:
# Final cell: print a completion message.
print('Done!')