In [1]:
import os
import copy
import time
import pickle
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import random
from random import randint

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [2]:
import sys
# Record which Python interpreter the kernel runs on (shown as the cell output).
sys.version

'3.6.13 | packaged by conda-forge | (default, Feb 19 2021, 05:36:01) \n[GCC 9.3.0]'

In [3]:
# Dataset selector. The value is interpolated verbatim into object keys
# (e.g. "preprocessed/mimic/..."), so its exact spelling/case matters.
DATA_SET_TYPE = "mimic" # Either MIMIC or GCUH
# Selects which entry of CONFIG_TRAINING_TEMPLATES becomes the active CONFIG.
TRAINING_VERSION = "0.0.2"

# One settings dict per training version.
# NOTE(review): in the cells visible here only "mimic_data_version" is read;
# "septic_shock_balance_ratio" and "gcuh_data_version" are never used
# (balance() takes no ratio) — confirm whether they should be wired in.
CONFIG_TRAINING_TEMPLATES = {
    "0.0.1": {
        "septic_shock_balance_ratio": 0.75,
        "mimic_data_version": "0.0.4",
        "gcuh_data_version": "0.0.4"
    },
    "0.0.2": {
        "septic_shock_balance_ratio": 0.5,
        "mimic_data_version": "0.0.4",
        "gcuh_data_version": "0.0.5"
    }
}

# Active settings for this run.
CONFIG = CONFIG_TRAINING_TEMPLATES[TRAINING_VERSION]

In [39]:
import boto3
from FileHandler import FileHandler
from process_data import process_dataframe, no_padding


# Optional IAM role assumption, currently disabled together with the
# commented-out fh.assume_role(...) call at the end of this cell.
#ROLE_ARN = "arn:aws:iam::578915155280:role/DatarweSepsisArdsAdmin"
#ROLE_SESSION_NAME = "DatarweSepsisArdsAdmin"

# Shared handle that later cells use through fh.get_object(<object key>).
# presumably 'datarwe-ml-data' is a storage bucket and cache=True keeps local
# copies — TODO confirm against the FileHandler implementation.
fh = FileHandler(root='datarwe-ml-data', cache=True)
#fh.assume_role(ROLE_ARN, ROLE_SESSION_NAME)

In [38]:
import importlib
import process_data

# Pick up edits made to process_data.py without restarting the kernel.
# importlib.reload() re-executes the module object, but names that were copied
# out earlier with `from process_data import ...` keep pointing at the OLD
# functions. Re-import them so get_set() really calls the reloaded code.
process_data = importlib.reload(process_data)
from process_data import process_dataframe, no_padding

# Display the reloaded module (shows the path it was loaded from).
process_data


<module 'process_data' from '/home/ec2-user/SageMaker/data/process_data.py'>

In [5]:
# Object key of the patient index file for the selected dataset.
# NOTE(review): the version always comes from 'mimic_data_version', even when
# DATA_SET_TYPE is switched to the other dataset — confirm whether
# 'gcuh_data_version' should be used in that case.
index_filename = f"preprocessed/{DATA_SET_TYPE}/index_{CONFIG['mimic_data_version']}.csv"

# presumably returns a pandas DataFrame (later cells filter it like one) —
# TODO confirm against FileHandler.get_object.
index = fh.get_object(index_filename)

In [6]:
# Show the index: patientid, septic_shock label and the split ("set": train/val).
index

Unnamed: 0,patientid,septic_shock,set
0,29732,False,train
1,45460,False,train
2,27931,False,train
3,45688,True,train
4,28997,False,train
...,...,...,...
4583,51783,False,val
4584,51384,True,val
4585,58261,True,val
4586,54355,False,val


In [7]:
# Vital-sign columns fed to the model.
feature_cols_adjusted = ['arterial_blood_pressure_systolic', 'arterial_blood_pressure_diastolic', 'arterial_blood_pressure_mean', 'respiratory_rate', "heart_rate"]
# Full feature list: the vitals plus the one-hot gender ("M", "F") and age-bin
# columns; get_set() builds the same list after creating those columns.
feature_cols = list(feature_cols_adjusted + ["M", "F", "(14, 20]", "(20, 30]", "(30, 40]", "(40, 50]", "(50, 70]", "(70, 90]"])

In [8]:
# Load per-feature statistics; each entry is read below as {"mean": ..., "std": ...}.
# SECURITY: pickle.load can execute arbitrary code — only load trusted files.
with open('feature_stats_0.0.1.pkl', 'rb') as f:
    feature_stats = pickle.load(f)

# Hard-coded statistics for patient_weight.
# NOTE(review): "patient_weight" is not in feature_cols, so the filter below
# drops it again and this override currently has no effect.
feature_stats["patient_weight"] = {
    "mean": 85.56,
    "std": 31.33
}

# Keep the statistics of model features only; these two dicts are what
# zscore() / get_set() receive. Features without an entry are not normalised.
means = {}
std = {}
for key in feature_stats.keys():
    if key in feature_cols:
        means[key] = feature_stats[key]["mean"]
        std[key] = feature_stats[key]["std"]

In [63]:
from tqdm import tqdm

#sep = list(set(pd.read_csv("concept_sepsis_query.csv")["patientid"]))

def format_labels(df):
    """Cut a septic-shock stay off at its onset.

    If no row has a truthy ``septic_shock`` value the frame is returned as is.
    Otherwise the index is reset and only the first ``n`` rows are kept, where
    ``n`` is ``septic_shock_onset`` of the first row converted to ``int``
    (i.e. the onset is used as a positional row count).
    """
    if not df["septic_shock"].any():
        return df

    renumbered = df.reset_index(drop=True)
    onset_row = int(renumbered["septic_shock_onset"].iloc[0])
    return renumbered.iloc[:onset_row]

def zscore(df, means, std):
    """Standardise the columns named in ``means``.

    Only the columns whose names are keys of ``means`` are selected; each is
    shifted by its entry in ``means`` and divided by its entry in ``std``.
    ``df`` itself is not modified; the normalised columns are returned.
    """
    selected = df[means.keys()]
    centred = selected - means
    return centred / std

def one_hot_encode(df, col, categories=None):
    """Append one indicator column per category of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is NOT modified (the previous version overwrote
        ``df[col]`` in the caller's frame when ``categories`` was given).
    col : column label
        Column to encode.
    categories : list, optional
        Fixed, ordered category set. When given, ``col`` is cast to an ordered
        categorical first, so every listed category gets an indicator column
        even if it never occurs in the data.

    Returns
    -------
    pandas.DataFrame
        New frame: the original columns (``col`` cast to categorical when
        ``categories`` is given) followed by the indicator columns.
    """
    values = df[col]
    if categories is not None:
        values = values.astype(pd.CategoricalDtype(categories=categories, ordered=True))
        # Work on a copy so the caller's frame keeps its original dtype.
        df = df.copy()
        df[col] = values
    dummies = pd.get_dummies(values)
    return pd.concat([df, dummies], axis=1)

def get_set(icu_df, means, stds):
    """Turn one patient's raw frame into model inputs.

    Steps: sort by ``chart_time``; truncate septic-shock stays at onset
    (``format_labels``); z-score the columns named in ``means``; one-hot encode
    ``gender`` (columns "M", "F") and the age bins (14, 20] ... (70, 90];
    stringify all column labels so the interval columns can be selected by
    name; finally hand the frame to ``process_dataframe``.

    Parameters
    ----------
    icu_df : pandas.DataFrame
        Needs at least ``chart_time``, ``septic_shock``, ``gender``, ``age``
        and the feature columns.
    means, stds : dict
        Per-column mean / standard deviation used by ``zscore``.

    Returns
    -------
    tuple
        ``(X, y[:, :, 0], feature_columns)`` where ``X`` and ``y`` come from
        ``process_dataframe(icu_df, feature_columns, "septic_shock", 48)``.
        # presumably 48 is the fixed sequence length — TODO confirm in process_data.

    Raises
    ------
    Exception
        Any failure is reported with the patient id and then re-raised.
        The previous handler evaluated ``icu_df.iloc['patientid']``, which
        itself raises ``TypeError`` and hid the real error.
    """
    try:
        icu_df = icu_df.sort_values("chart_time")
        icu_df = format_labels(icu_df)
        icu_df[list(means.keys())] = zscore(icu_df, means, stds)
        icu_df = one_hot_encode(icu_df, "gender", categories=["M", "F"])
        icu_df["age_bins"] = pd.cut(icu_df["age"], [14, 20, 30, 40, 50, 70, 90])
        icu_df = one_hot_encode(icu_df, "age_bins")
        # Interval labels become strings such as "(14, 20]" so they can be
        # addressed by the names listed in feature_columns.
        icu_df.columns = [str(col) for col in icu_df.columns]
        feature_columns = list(feature_cols_adjusted + ["M", "F", "(14, 20]", "(20, 30]", "(30, 40]", "(40, 50]", "(50, 70]", "(70, 90]"])
        X, y, _ = process_dataframe(icu_df, feature_columns, "septic_shock", 48)
        # Alternative without padding, kept for reference:
        #X, y = no_padding(icu_df, feature_columns, 'septic_shock')
        #y = y + 1
        return X, y[:, :, 0], feature_columns
    except Exception as e:
        # Best-effort lookup of the patient id; it must never mask the real error.
        try:
            patient = icu_df["patientid"].iloc[0]
        except Exception:
            patient = "<unknown>"
        print(f"Error for icu {patient}: {e}")
        raise

def get_dataframe(patientid):
    """Fetch the preprocessed CSV of one patient through the shared file handler.

    The object key is ``preprocessed/<DATA_SET_TYPE>/<patientid>_<version>.csv``.
    NOTE(review): the version is always taken from ``mimic_data_version``,
    whatever DATA_SET_TYPE is — confirm this is intended for other datasets.
    """
    version = CONFIG['mimic_data_version']
    object_key = f"preprocessed/{DATA_SET_TYPE}/{patientid}_{version}.csv"
    return fh.get_object(object_key)

def get_network_input(patientid):
    """Load one patient and convert the frame to ``(X, y, feature_columns)``.

    Uses the module-level ``means`` / ``std`` dictionaries for normalisation.
    """
    patient_frame = get_dataframe(patientid)
    features, labels, columns = get_set(patient_frame, means, std)
    return features, labels, columns

def get_network_input_batch(patientids):
    """Build and stack the network inputs of several patients.

    Each patient is converted with ``get_network_input``; the per-patient
    ``X`` and ``y`` arrays are concatenated along axis 0 and ``y`` gets a
    trailing singleton axis. The feature column list of the last patient is
    returned as the third element.
    """
    per_patient = [get_network_input(pid) for pid in patientids]

    features = np.concatenate([entry[0] for entry in per_patient], axis=0)
    stacked_labels = np.concatenate([entry[1] for entry in per_patient], axis=0)
    labels = np.expand_dims(stacked_labels, axis=2)

    return features, labels, per_patient[-1][2]
                       
def get_network_input_batch_gen(patientids, batch_size):
    """Yield shuffled mini-batches of network inputs.

    Parameters
    ----------
    patientids : iterable
        Patient ids to draw from. The caller's sequence is no longer shuffled
        in place; a shuffled copy is used instead.
    batch_size : int
        Maximum number of patients per batch (must be positive).

    Yields
    ------
    The result of ``get_network_input_batch`` for each consecutive slice of
    ``batch_size`` patients; the last batch may be smaller.

    Notes
    -----
    The previous version always emitted ``len // batch_size + 1`` batches, so
    when the number of patients was a multiple of ``batch_size`` (or zero) the
    final batch was empty and ``get_network_input_batch`` failed on it.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    shuffled = list(patientids)
    random.shuffle(shuffled)

    # Stepping by batch_size never produces an empty slice.
    for batch_start in range(0, len(shuffled), batch_size):
        yield get_network_input_batch(shuffled[batch_start:batch_start + batch_size])

def balance(dataset, random_state=None):
    """Oversample septic-shock rows until both classes are equally frequent.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a boolean ``septic_shock`` column.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the draw can be made reproducible.
        The default (``None``) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` followed by ``n_negative - n_positive`` positive rows drawn
        with replacement. If there are no positives to draw from, or positives
        already match/outnumber negatives, an unchanged copy is returned.

    Notes
    -----
    The count is computed with integers. The earlier
    ``int(n_neg * (1 - n_pos / n_neg))`` form divided by zero without
    negatives, requested a negative sample size when positives outnumbered
    negatives, and could truncate one row short through float rounding.
    """
    df_sep = dataset[dataset["septic_shock"] == True]
    df_nosep = dataset[dataset["septic_shock"] == False]

    shortfall = len(df_nosep) - len(df_sep)
    if len(df_sep) == 0 or shortfall <= 0:
        return dataset.copy()

    df_sep_sample = df_sep.sample(shortfall, replace=True, random_state=random_state)
    return pd.concat([dataset, df_sep_sample])

In [64]:
# Rows of the index that belong to the training split.
training_set = index[index["set"] == "train"]


In [65]:
# Smoke test on a single septic-shock patient (third positive row of the index).
test_patient = index[index["septic_shock"] == True]["patientid"].iloc[2]
# Raw per-patient frame, kept in `df` for the direct get_set() call further down.
df = get_dataframe(test_patient)
# Full pipeline for the same patient (loads the frame again internally).
X, y, feature_columns = get_network_input(test_patient)

In [66]:
# Build the network input directly from the already loaded frame `df`.
X, y, feature_columns = get_set(df, means, std)

In [67]:
# Inspect the feature array produced by get_set() for the test patient.
X

array([[[ 0.39107925, -0.15857143,  0.72746442, -0.49432739,
         -0.80518293,  1.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          1.        ],
        [-0.52489048,  0.34142857,  0.30574591,  0.15397083,
         -1.01859756,  1.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          1.        ],
        [-0.44524094,  0.34142857,  0.35846073, -0.33225284,
         -1.07957317,  1.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          1.        ],
        [-0.48506571,  0.05571429,  0.2266737 ,  0.31604538,
         -0.89664634,  1.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          1.        ],
        [-0.44524094,  0.77      ,  0.51660517, -0.17017828,
         -1.04908537,  1.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.   

In [68]:
# Oversample septic-shock patients in the training split, then build the
# stacked arrays for every (possibly repeated) patient id.
# NOTE(review): balance() is called without a seed, so the sampled patients
# differ between runs.
training_set = index[index["set"] == "train"]
balanced_set = balance(training_set)
train_x, train_y, _ = get_network_input_batch(list(balanced_set["patientid"]))

# Axes: number of training samples, number of records per patient, number of features
print('the shape of train_x:', train_x.shape)
print('the shape of train_y:', train_y.shape)

the shape of train_x: (3172, 48, 13)
the shape of train_y: (3172, 48, 1)


In [73]:
# Exploratory: np.delete without `axis` operates on the flattened array, so
# this removes only the very last scalar and returns a 1-D copy — it does not
# drop a feature column or a time step.
new_arr = np.delete(train_x, -1)
new_arr

array([ 0.        , -0.23      ,  0.44227728, ...,  0.        ,
        0.        ,  1.        ])

In [154]:
# Exploratory: drop every entry equal to -1 from time step 40 of sample 500.
# The result is empty here, i.e. that step consists solely of -1 values
# (presumably the padding written by process_dataframe — TODO confirm).
np.delete(train_x[500][40], np.where(train_x[500][40] == -1))

array([], dtype=float64)

In [153]:
# Raw values of time step 40 of sample 500 (all -1 in the recorded output).
train_x[500][40]

array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])

In [175]:
# Rebuild train_x as nested Python lists with every value equal to -1 removed:
# train_x_array[i][j] is the (possibly shortened or empty) feature vector of
# sample i at time step j.
# NOTE(review): the comparison is element-wise, so a genuine feature value of
# exactly -1 would be removed too, and steps that are entirely -1 turn into
# empty arrays instead of being dropped (each sample keeps all its steps) —
# confirm this is the intended result.
train_x_array = []
train_x_seq = []
for i in range(len(train_x)):
    # All samples share the step count of the first one (train_x is a regular array).
    for j in range(len(train_x[0])):
        train_x_seq.append(np.delete(train_x[i][j], np.where(train_x[i][j] == -1)))

    train_x_array.append(train_x_seq)
    # Start a fresh list for the next sample.
    train_x_seq = []


In [176]:
# Number of samples in the rebuilt list (one entry per row of train_x).
len(train_x_array)

3172

In [180]:
# Number of time-step entries kept for sample 3.
(len(train_x_array[3]))

48

In [17]:
# Builds the balanced training arrays; these are the same statements as an
# earlier cell, so running both repeats the whole download/processing pass.
# NOTE(review): balance() is unseeded, so each run oversamples different patients.
training_set = index[index["set"] == "train"]
balanced_set = balance(training_set)
train_x, train_y, _ = get_network_input_batch(list(balanced_set["patientid"]))

# Axes: number of training samples, number of records per patient, number of features
print('the shape of train_x:', train_x.shape)
print('the shape of train_y:', train_y.shape)

the shape of train_x: (3172, 48, 13)
the shape of train_y: (3172, 48, 1)


In [15]:
# Validation split: used as is (no balance() call), one pass over every patient.
val_set = index[index["set"] == "val"]
val_x, val_y, _ = get_network_input_batch(list(val_set["patientid"]))

# Axes: number of validation samples, number of records per patient, number of features
print('the shape of val_x:', val_x.shape)
print('the shape of val_y:', val_y.shape)

In [28]:
# np.save does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout has no 'save/' directory).
os.makedirs('save', exist_ok=True)
# np.save appends ".npy" to file names that do not already end with it.
np.save('save/mimic_train_x', train_x)
np.save('save/mimic_train_y', train_y)

In [33]:
# np.save does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout has no 'save/' directory).
os.makedirs('save', exist_ok=True)
# np.save appends ".npy" to file names that do not already end with it.
np.save('save/mimic_val_x', val_x)
np.save('save/mimic_val_y', val_y)