In [None]:
import os
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score, precision_recall_curve
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers

import tensorflow as tf
#tf.compat.v1.disable_v2_behavior()
#%load_ext tensorboard

from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import concatenate,Dense
from keras.models import Model,load_model
from tensorflow.keras import regularizers

from pathlib import Path
from directories import *
import numpy as np
from parameters_config import Config
import matplotlib.pyplot as plt
import seaborn as sns

print(BASE_DIRECTORY.absolute())

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold

##set random seeds
np.random.seed(7)
tf.random.set_seed(seed=7)


In [None]:
# Minimum number of non-null values a row of the MRN frame must have to be kept;
# passed as `thresh` to `DataFrame.dropna(axis=0, ...)` in the driver cells.
NA_removal_threshold = 80  ## at least this many columns must hold a non-null value
# Controls-per-case ratio used to down-sample the test set in
# `train_test_split_custom`; None keeps every test row.
test_df_control_ratio = None

# NOTE(review): BASE_PATH is not referenced by any code visible in this notebook section.
BASE_PATH='/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/'

# SET COHORT PATH

# Directory holding the pickled sequence data, variant WITH extra BP data
DATA_FILE_PATH = "/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Dynamic_features/only_extra_BP/"

# Alternative directory, variant WITHOUT extra BP data (uncomment to switch cohorts)
#DATA_FILE_PATH = "/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/New/Dynamic_features/without_extra_BP_and_without_nan_numeric/"

# Directory holding the static ("unsupervised") feature pickle
DATA_UNSUPERVISED_PATH = "/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/Static_features/"

# Pickled frame that is read into `df_mrn` and split via its `train_test` column
MRN_PATH = "/home/kiwitn01/master_thesis_hypertension-complications/Case_Control_Cohort_Creation/For_ML_Pipeline/Split_2012/All3_ML_pipeline_final.pkl"

pd.set_option('display.max_columns', 500)
# Overrides the project-wide Config value; used as the Embedding layer's input dimension.
Config.VOCAB_SIZE= 524 # number of unique concepts

In [None]:
def load_sequence_data(df_mrn):
    """Load the pickled patient sequences and attach the cohort columns.

    The pickle at ``DATA_FILE_PATH + "data_all3_2012.txt"`` is turned into a
    two-column frame (``medical_record_number``, ``sequence``) and right-joined
    onto ``df_mrn``, so every sequence row is kept even when its MRN is
    missing from ``df_mrn``.

    Parameters
    ----------
    df_mrn : pandas.DataFrame
        Cohort frame; must provide ``medical_record_number`` and ``Complication``.

    Returns
    -------
    pandas.DataFrame
        Columns ``medical_record_number``, ``sequence``, ``Complication``.
    """
    sequence_file = DATA_FILE_PATH+"data_all3_2012.txt"
    # NOTE: pickle.load executes arbitrary code; only use on trusted files.
    with open(sequence_file, "rb") as handle:
        records = pickle.load(handle)

    sequence_frame = pd.DataFrame(records, columns=['medical_record_number','sequence'])
    merged = pd.merge(df_mrn, sequence_frame, how='right', on='medical_record_number')

    wanted_columns = ['medical_record_number', 'sequence', 'Complication']
    return merged[wanted_columns]


def load_unsupervised_data(path):
    """Read the static-feature pickle and keep only label and MRN.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that contains
        ``All3_2012_Drug_Diag_Static.pkl``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Complication`` and ``medical_record_number``.
    """
    static_frame = pd.read_pickle(path+'All3_2012_Drug_Diag_Static.pkl')
    #for only extra BP data!
    selected = ['Complication', 'train_test', 'marital_status_code', 'medical_record_number']
    static_frame = static_frame[selected]
    # Both helper columns are required to exist, but are not returned.
    return static_frame.drop(columns=['train_test', 'marital_status_code'])

def train_test_split_custom(df, test_df_control):
    """Split a cohort frame into train and test parts via its ``train_test`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``train_test`` (rows marked ``'train'`` or ``'test'``) and
        ``medical_record_number``. ``Complication`` is needed when controls
        are down-sampled; it is compared against the strings ``'1'`` (case)
        and ``'0'`` (control).
    test_df_control : int or None
        Number of controls to keep per case in the test set. ``None`` keeps
        the test set unchanged. The argument itself is used; previously the
        notebook-level global ``test_df_control_ratio`` was read instead and
        the argument was silently ignored.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)`` with the ``train_test`` column removed.
    """
    import warnings

    train_df = df[df.train_test == 'train']
    test_df = df[df.train_test == 'test']

    if test_df_control is not None:
        test_df_ht = test_df[test_df.Complication == '1']
        test_df_not_ht = test_df[test_df.Complication == '0']
        # Fixed random_state keeps the sampled control set reproducible.
        test_df_not_ht = test_df_not_ht.sample(test_df_control * test_df_ht.shape[0], random_state=42)
        test_df = pd.concat([test_df_ht, test_df_not_ht])

    # The overlap used to be computed and thrown away; surface it, because a
    # patient present in both partitions leaks information into the test set.
    train_test_common = set(train_df['medical_record_number']) & set(test_df['medical_record_number'])
    if train_test_common:
        warnings.warn(
            str(len(train_test_common)) + " medical record numbers occur in both train and test sets"
        )

    train_df = train_df.drop(columns=['train_test'])
    test_df = test_df.drop(columns=['train_test'])
    return train_df, test_df

def filter_by_mrn(df,mrn_list):
    """Return the rows of ``df`` whose ``medical_record_number`` appears in ``mrn_list``."""
    keep_row = df['medical_record_number'].isin(mrn_list)
    return df.loc[keep_row]



In [None]:
from catboost import CatBoostClassifier
import category_encoders as ce
from category_encoders import *
from category_encoders.helmert import HelmertEncoder
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer


def impute_unsupervised_data(train_df, test_df, train_labels=None):
    """Scale, one-hot encode and mean-impute the static features.

    The preprocessing pipeline is fitted on ``train_df`` only and then applied
    to both frames, so no test-set statistics leak into the transformation.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Feature frames with identical columns. A column named
        ``Complication`` is excluded from the feature lists.
    train_labels : array-like, optional
        Targets handed to ``fit``. The steps currently enabled (scaler,
        one-hot encoder, mean imputer) are unsupervised; labels are only
        needed when a target-based encoder is swapped in. Previously this
        value was read from a notebook-level global of the same name.

    Returns
    -------
    tuple
        ``(train_transformed, test_transformed)`` as produced by the pipeline.
    """
    feature_cols = [c for c in train_df.columns if c not in ['Complication']]
    # Builtin object/float/int replace the np.object/np.float/np.int aliases,
    # which were removed in NumPy 1.24; they were plain aliases of the
    # builtins, so the same columns are selected.
    categorical_cols = [c for c in feature_cols if train_df[c].dtype in [object]]
    numerical_cols = [c for c in feature_cols if train_df[c].dtype in [float, int]]
    print("Number of categorical features " + str(len(categorical_cols)) + " and number of numerical features "+ str(len(numerical_cols)))

    ct =  Pipeline([('ct',
        ColumnTransformer([
        ('num', StandardScaler(), numerical_cols),
        #('cat', HelmertEncoder(drop_invariant = True, handle_missing = 'return_nan'), categorical_cols),])),
        #('cat', CatBoostEncoder(drop_invariant = True, handle_missing = 'return_nan', sigma = 0.5, a = 0.7), categorical_cols),])),
        ('cat', OneHotEncoder(drop_invariant = True, handle_missing = 'return_nan'), categorical_cols),])),

        #('iterativeimpute' , IterativeImputer(random_state=42, max_iter = 50, n_nearest_features = 5)),
        ('simpleimputer' , SimpleImputer(missing_values=np.nan, strategy='mean')),
        #('KNN' , KNNImputer(n_neighbors =2)),
        ])

    fit_targets = pd.to_numeric(train_labels) if train_labels is not None else None
    fitted_train = ct.fit(train_df, fit_targets)

    train_df = fitted_train.transform(train_df)
    test_df = fitted_train.transform(test_df)

    return train_df,test_df

In [None]:
from keras.layers import Masking

def train_lstm_model(train_sequence_data, train_unsupervised_data, train_labels,
                     epochs=200, device='/device:GPU:2'):
    """Train a three-layer stacked LSTM on the sequences, fused with auxiliary features.

    NOTE: later cells of this notebook define other functions with this same
    name; the definition from whichever cell was executed last is in effect.

    Args:
        train_sequence_data: 2-D array of token ids; its second dimension
            defines the length of the sequence input.
        train_unsupervised_data: auxiliary features, row-aligned with the
            sequences; its second dimension defines the auxiliary input size.
        train_labels: binary targets, row-aligned with both inputs.
        epochs: maximum number of training epochs (default is the previously
            hard-coded 200).
        device: TensorFlow device string used for fitting (default is the
            previously hard-coded '/device:GPU:2').

    Returns:
        The fitted Keras model.

    Raises:
        RuntimeError: re-raised from ``model.fit`` after its message is printed.
    """
    # 70/30 train/validation split with a fixed seed.
    train_sequence_data, val_sequence_data, train_unsupervised_data, val_unsupervised_data, train_labels , val_labels = train_test_split(train_sequence_data, train_unsupervised_data, train_labels, test_size = 0.3, random_state = 42)

    print(train_sequence_data.shape)
    print(val_sequence_data.shape)
    print(train_labels.shape)
    print(val_labels.shape)

    main_input = keras.Input(shape=(train_sequence_data.shape[1],), name='main_input') # dtype='int32'

    # This embedding layer encodes the token sequence into a sequence of
    # dense Config.EMBEDDING_DIM-dimensional vectors.
    x = layers.Embedding(Config.VOCAB_SIZE, Config.EMBEDDING_DIM, input_length=Config.MAX_REVIEW_LENGTH, name='Embedding_1')(main_input)

    # NOTE(review): the mask is applied to the embedding output, not to the
    # integer tokens; confirm that mask_value=-100 has the intended effect.
    mask= layers.Masking(mask_value = -100, name = 'mask')(x)

    # The stacked LSTMs transform the vector sequence into a single vector
    # containing information about the entire sequence.
    lstm_1 = layers.LSTM(100, name="lstm_1", dropout=0.5, return_sequences=True)(mask)
    lstm_2 = layers.LSTM(100, name='lstm_2', dropout=0.5, return_sequences=True)(lstm_1)
    lstm_out = layers.LSTM(100, name='lstm_out', dropout=0.5)(lstm_2)

    aux_input=keras.Input(shape=(train_unsupervised_data.shape[1],),name='aux_input')

    # Concatenate the LSTM output with the auxiliary input.
    x = concatenate([lstm_out, aux_input])

    dense_1 = Dense(100, activation='sigmoid', name='dense_1')(x)

    # Final logistic-regression layer.
    main_output = Dense(1, activation='sigmoid', name='main_output')(dense_1)

    model = Model(inputs=[main_input, aux_input], outputs=[main_output])
    # summary() prints by itself; wrapping it in print() only added a stray "None".
    model.summary()

    # compile the model (`learning_rate` replaces the deprecated `lr` keyword)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=Config.METRICS)

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        verbose=1,
        patience=20,
        mode='max',
        restore_best_weights=True)

    try:
        with tf.device(device):
            history = model.fit({'main_input': train_sequence_data, 'aux_input': train_unsupervised_data},
                {'main_output': train_labels},
                epochs=epochs,
                validation_data=([val_sequence_data,val_unsupervised_data],val_labels),
                batch_size=None, verbose=1, callbacks=[early_stopping], shuffle = True)
    except RuntimeError as e:
        # Without a training history nothing below can run; report and
        # re-raise instead of failing later with a NameError on `history`.
        print(e)
        raise

    val_predictions = model.predict([val_sequence_data,val_unsupervised_data])
    val_auprc = average_precision_score(val_labels, val_predictions, average='micro', pos_label = 1)
    val_auc = roc_auc_score(val_labels, val_predictions, average='micro')
    val_f1 = f1_score(val_labels, np.where(val_predictions > 0.5, 1, 0), average='micro')
    print("val AUPRC is :: " + str(val_auprc))
    print("val AUC is :: " + str(val_auc))
    print("val F1 is ::" + str(val_f1))

    # Report the epoch with the best monitored value. `stopped_epoch` is 0
    # when early stopping never fires and otherwise points at the epoch where
    # training stopped, not at the epoch whose weights are restored.
    best_epoch = int(np.argmax(history.history['val_auc']))

    # NOTE(review): these keys presumably mirror the metric names configured
    # in Config.METRICS - confirm.
    reported_keys = ['val_loss', 'val_tp', 'val_fp', 'val_tn', 'val_fn',
                     'val_accuracy', 'val_precision', 'val_recall', 'val_auc',
                     'val_f1_score', 'val_average_precision']
    validation_score = [history.history[key][best_epoch] for key in reported_keys]

    print('Validation Score:',validation_score)

    return model

In [None]:
#building a CNN model

from keras.models import Sequential
from keras.layers import Dense, Conv2D, Conv1D, Flatten
from keras.layers import Dense, Activation, Flatten, Convolution1D, Dropout, TimeDistributed

def train_lstm_model(train_sequence_data, train_unsupervised_data, train_labels,
                     epochs=200, device='/device:GPU:2'):
    """Train a Conv1D + LSTM model on the sequences, fused with auxiliary features.

    NOTE: other cells of this notebook define functions with this same name;
    the definition from whichever cell was executed last is in effect.

    Args:
        train_sequence_data: 2-D array of token ids; its second dimension
            defines the length of the sequence input.
        train_unsupervised_data: auxiliary features, row-aligned with the
            sequences; its second dimension defines the auxiliary input size.
        train_labels: binary targets, row-aligned with both inputs.
        epochs: maximum number of training epochs (default is the previously
            hard-coded 200).
        device: TensorFlow device string used for fitting (default is the
            previously hard-coded '/device:GPU:2').

    Returns:
        The fitted Keras model.

    Raises:
        RuntimeError: re-raised from ``model.fit`` after its message is printed.
    """
    # 70/30 train/validation split with a fixed seed.
    train_sequence_data, val_sequence_data, train_unsupervised_data, val_unsupervised_data, train_labels , val_labels = train_test_split(train_sequence_data, train_unsupervised_data, train_labels, test_size = 0.3, random_state = 42)

    print(train_sequence_data.shape)
    print(val_sequence_data.shape)
    print(train_labels.shape)
    print(val_labels.shape)

    # first input: the categorical token sequence
    main_input = keras.Input(shape=(train_sequence_data.shape[1],), name='main_input') # dtype='int32'

    # This embedding layer encodes the token sequence into a sequence of
    # dense Config.EMBEDDING_DIM-dimensional vectors.
    x = layers.Embedding(Config.VOCAB_SIZE, Config.EMBEDDING_DIM, input_length=Config.MAX_REVIEW_LENGTH, name='Embedding_1')(main_input)

    # convolution over the embedded sequence followed by max pooling
    convolution_1 = tf.keras.layers.Conv1D(100, 2, activation="relu")(x)
    pooling1 = tf.keras.layers.MaxPooling1D(pool_size=2)(convolution_1)

    # The LSTM transforms the pooled sequence into a single vector
    # containing information about the entire sequence.
    lstm_out = layers.LSTM(100, name='lstm_3', dropout=0.5)(pooling1)

    # second input: the auxiliary features
    aux_input=keras.Input(shape=(train_unsupervised_data.shape[1],),name='aux_input')

    # Concatenate the LSTM output with the auxiliary input.
    x = concatenate([lstm_out, aux_input])

    flatten = tf.keras.layers.Flatten()(x)

    dense_1 = Dense(100, activation='sigmoid', name='dense_1')(flatten)
    # Final logistic-regression layer.
    main_output = Dense(1, activation='sigmoid', name='main_output')(dense_1)

    model = Model(inputs=[main_input, aux_input], outputs=[main_output])
    # summary() prints by itself; wrapping it in print() only added a stray "None".
    model.summary()

    # compile the model (`learning_rate` replaces the deprecated `lr` keyword)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=Config.METRICS)

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        verbose=1,
        patience=20,
        mode='max',
        restore_best_weights=True)

    try:
        with tf.device(device):
            history = model.fit({'main_input': train_sequence_data, 'aux_input': train_unsupervised_data},
                {'main_output': train_labels},
                epochs=epochs,
                validation_data=([val_sequence_data,val_unsupervised_data],val_labels),
                batch_size=None, verbose=1, callbacks=[early_stopping], shuffle = True)
    except RuntimeError as e:
        # Without a training history nothing below can run; report and
        # re-raise instead of failing later with a NameError on `history`.
        print(e)
        raise

    val_predictions = model.predict([val_sequence_data,val_unsupervised_data])
    val_auprc = average_precision_score(val_labels, val_predictions, average='micro', pos_label = 1)
    val_auc = roc_auc_score(val_labels, val_predictions, average='micro')
    val_f1 = f1_score(val_labels, np.where(val_predictions > 0.5, 1, 0), average='micro')
    print("val AUPRC is :: " + str(val_auprc))
    print("val AUC is :: " + str(val_auc))
    print("val F1 is ::" + str(val_f1))

    # Report the epoch with the best monitored value. `stopped_epoch` is 0
    # when early stopping never fires and otherwise points at the epoch where
    # training stopped, not at the epoch whose weights are restored.
    best_epoch = int(np.argmax(history.history['val_auc']))

    # NOTE(review): these keys presumably mirror the metric names configured
    # in Config.METRICS - confirm.
    reported_keys = ['val_loss', 'val_tp', 'val_fp', 'val_tn', 'val_fn',
                     'val_accuracy', 'val_precision', 'val_recall', 'val_auc',
                     'val_f1_score', 'val_average_precision']
    validation_score = [history.history[key][best_epoch] for key in reported_keys]

    print('Validation Score:',validation_score)

    return model

In [None]:
#old less layers

def train_lstm_model(train_sequence_data, train_unsupervised_data, train_labels,
                     epochs=1, device='/device:GPU:2'):
    """Train a single-LSTM model with a regularised dense head and auxiliary features.

    NOTE: earlier cells of this notebook define functions with this same name;
    the definition from whichever cell was executed last is in effect.

    Args:
        train_sequence_data: 2-D array of token ids; its second dimension
            defines the length of the sequence input.
        train_unsupervised_data: auxiliary features, row-aligned with the
            sequences; its second dimension defines the auxiliary input size.
        train_labels: binary targets, row-aligned with both inputs.
        epochs: maximum number of training epochs (default is the previously
            hard-coded 1).
        device: TensorFlow device string used for fitting (default is the
            previously hard-coded '/device:GPU:2').

    Returns:
        The fitted Keras model.

    Raises:
        RuntimeError: re-raised from ``model.fit`` after its message is printed.
    """
    # 70/30 train/validation split with a fixed seed.
    train_sequence_data, val_sequence_data, train_unsupervised_data, val_unsupervised_data, train_labels , val_labels = train_test_split(train_sequence_data, train_unsupervised_data, train_labels, test_size = 0.3, random_state = 42)

    print(train_sequence_data.shape)
    print(val_sequence_data.shape)
    print(train_labels.shape)
    print(val_labels.shape)

    main_input = keras.Input(shape=(train_sequence_data.shape[1],), name='main_input') # dtype='int32'

    # This embedding layer encodes the token sequence into a sequence of
    # dense Config.EMBEDDING_DIM-dimensional vectors.
    x = layers.Embedding(Config.VOCAB_SIZE, Config.EMBEDDING_DIM, input_length=Config.MAX_REVIEW_LENGTH, name='Embedding_1')(main_input)

    # The LSTM transforms the vector sequence into a single vector
    # containing information about the entire sequence.
    lstm_out = layers.LSTM(100, name='lstm_1', dropout=0.3)(x)
    aux_input=keras.Input(shape=(train_unsupervised_data.shape[1],),name='aux_input')

    # Concatenate the LSTM output with the auxiliary input.
    x = concatenate([lstm_out, aux_input])

    drop_out = tf.keras.layers.Dropout(0.3) (x)
    dense_1 = Dense(100, activation='sigmoid',
                    kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01),
                    bias_regularizer=regularizers.l2(0.02),
                    activity_regularizer=regularizers.l2(0.02), name='dense_1')(drop_out)
    drop_out2 = tf.keras.layers.Dropout(0.3) (dense_1)

    # Final logistic-regression layer.
    main_output = Dense(1, activation='sigmoid', name='main_output')(drop_out2)

    model = Model(inputs=[main_input, aux_input], outputs=[main_output])
    # summary() prints by itself; wrapping it in print() only added a stray "None".
    model.summary()

    # compile the model (`learning_rate` replaces the deprecated `lr` keyword)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=Config.METRICS)

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_auc',
        verbose=1,
        patience=20,
        mode='max',
        restore_best_weights=True)

    try:
        with tf.device(device):
            history = model.fit({'main_input': train_sequence_data, 'aux_input': train_unsupervised_data},
                {'main_output': train_labels},
                epochs=epochs,
                validation_data=([val_sequence_data,val_unsupervised_data],val_labels),
                batch_size=None, verbose=1, callbacks=[early_stopping], shuffle = True)
    except RuntimeError as e:
        # Without a training history nothing below can run; report and
        # re-raise instead of failing later with a NameError on `history`.
        print(e)
        raise

    val_predictions = model.predict([val_sequence_data,val_unsupervised_data])
    val_auprc = average_precision_score(val_labels, val_predictions, average='micro', pos_label = 1)
    try:
        val_auc = roc_auc_score(val_labels, val_predictions, average='micro')
    except ValueError as err:
        # Previously swallowed with `pass`, which left `val_auc` unbound and
        # crashed the print below.
        print("val AUC could not be computed: " + str(err))
        val_auc = float('nan')
    val_f1 = f1_score(val_labels, np.where(val_predictions > 0.5, 1, 0), average='micro')
    print("val AUPRC is :: " + str(val_auprc))
    print("val AUC is :: " + str(val_auc))
    print("val F1 is ::" + str(val_f1))

    # Report the epoch with the best monitored value. `stopped_epoch` is 0
    # when early stopping never fires and otherwise points at the epoch where
    # training stopped, not at the epoch whose weights are restored.
    best_epoch = int(np.argmax(history.history['val_auc']))

    # NOTE(review): these keys presumably mirror the metric names configured
    # in Config.METRICS - confirm.
    reported_keys = ['val_loss', 'val_tp', 'val_fp', 'val_tn', 'val_fn',
                     'val_accuracy', 'val_precision', 'val_recall', 'val_auc',
                     'val_f1_score', 'val_average_precision']
    validation_score = [history.history[key][best_epoch] for key in reported_keys]

    print('Validation Score:',validation_score)

    return model

In [None]:
def evaluate_lstm_model(model, test_sequence_data, test_unsupervised_data=None, test_labels=None):
    """Evaluate a trained model on the test set and print summary metrics.

    Two call forms are accepted:

    * ``evaluate_lstm_model(model, sequences, aux_features, labels)`` - the
      form used by the driver cell, needed for the two-input models built in
      this notebook.
    * ``evaluate_lstm_model(model, sequences, labels)`` - the original
      three-argument form, kept for backward compatibility (``test_labels``
      may also be given by keyword).

    Args:
        model: fitted Keras model.
        test_sequence_data: padded token sequences.
        test_unsupervised_data: auxiliary features, or None for a
            sequence-only model.
        test_labels: binary targets.

    Returns:
        The value returned by ``model.evaluate`` (previously nothing was
        returned although the caller stores the result).

    Raises:
        TypeError: if no labels were supplied.
    """
    if test_labels is None:
        # Legacy positional form: the third argument holds the labels.
        test_labels, test_unsupervised_data = test_unsupervised_data, None
    if test_labels is None:
        raise TypeError("evaluate_lstm_model() missing required argument: 'test_labels'")

    model_inputs = [test_sequence_data]
    if test_unsupervised_data is not None:
        model_inputs.append(test_unsupervised_data)

    results = model.evaluate(model_inputs, test_labels, batch_size=256, verbose=1)
    print(model.metrics_names)
    print(results)
    test_predictions = model.predict(model_inputs)
    test_auprc = average_precision_score(test_labels, test_predictions, average='micro', pos_label = 1)
    test_auc = roc_auc_score(test_labels, test_predictions, average='micro')
    test_f1 = f1_score(test_labels, np.where(test_predictions > 0.5, 1, 0), average='micro')

    precision, recall, thresholds = precision_recall_curve(test_labels, test_predictions)
    # Guard the 0/0 case: a NaN F-score would be picked by np.argmax.
    fscore = np.divide(2 * precision * recall, precision + recall,
                       out=np.zeros_like(precision, dtype=float),
                       where=(precision + recall) > 0)
    # precision/recall hold one more entry than thresholds; the final point
    # has no threshold, so it is excluded from the search.
    ix = int(np.argmax(fscore[:-1]))
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f ' % (thresholds[ix], fscore[ix], precision[ix], recall[ix]))

    print("test AUPRC is :: " + str(test_auprc))
    print("test AUC is :: " + str(test_auc))
    print("test F1 is ::" + str(test_f1))

    return results

In [None]:
if __name__ == "__main__":
    # End-to-end run: load the cohort, align sequence and static data by MRN,
    # pad the sequences, verify the alignment, then train and evaluate.

    ##load train test mrns
    df_mrn = pd.read_pickle(MRN_PATH)

    #load sequence data
    sequence_data = load_sequence_data(df_mrn)
    # Sorting by MRN here and below is what the alignment asserts further down rely on.
    sequence_data = sequence_data.sort_values(by=['medical_record_number'])
    
    # Keep only rows with at least NA_removal_threshold non-null values.
    # Note this happens after the sequences were merged with the unfiltered frame.
    df_mrn = df_mrn.dropna(axis=0, thresh = NA_removal_threshold)
    print("final dataframe shape after dropping NAs" + str(df_mrn.shape))
    
    MRN_train_df, MRN_test_df = train_test_split_custom(df_mrn, test_df_control_ratio)
    train_mrn = list(MRN_train_df.medical_record_number)
    test_mrn = list(MRN_test_df.medical_record_number)


    ##filter by mrns
    train_sequence_data = filter_by_mrn(sequence_data, train_mrn)
    test_sequence_data = filter_by_mrn(sequence_data, test_mrn)

    # `pop` removes the column from the frame in place and returns it.
    train_labels_seq = train_sequence_data.pop('Complication').astype('int')
    test_labels_seq = test_sequence_data.pop('Complication').astype('int')

    print(train_labels_seq.value_counts())
    print(test_labels_seq.value_counts())

    train_mrn_seq = train_sequence_data.pop('medical_record_number').astype('int')
    test_mrn_seq = test_sequence_data.pop('medical_record_number').astype('int')
    
    #matching unsupervised & sequence data 
    # MRNs are converted to strings for the lookup below
    # (presumably the static frame stores them as strings - verify).
    train_mrn_seq_match = train_mrn_seq.map(str)
    test_mrn_seq_match = test_mrn_seq.map(str)

    train_mrn_seq_match = train_mrn_seq_match.tolist()
    test_mrn_seq_match = test_mrn_seq_match.tolist()

    ##load auxiliary data
    unsupervised_data = load_unsupervised_data(DATA_UNSUPERVISED_PATH)
    unsupervised_data = unsupervised_data.sort_values(by=['medical_record_number'])
    
    # restrict the static data to the patients that have sequence data
    train_unsupervised_data = filter_by_mrn(unsupervised_data, train_mrn_seq_match)
    test_unsupervised_data = filter_by_mrn(unsupervised_data, test_mrn_seq_match)


    train_labels = train_unsupervised_data.pop('Complication').astype('int')
    test_labels = test_unsupervised_data.pop('Complication').astype('int')

    train_mrn_unsupervised = train_unsupervised_data.pop('medical_record_number').astype('int')
    test_mrn_unsupervised = test_unsupervised_data.pop('medical_record_number').astype('int')
    
    ##impute unsupervised data
    # Imputation is disabled here, so the static frames are passed on as they are.
    #train_unsupervised_data, test_unsupervised_data = impute_unsupervised_data(train_unsupervised_data,test_unsupervised_data)

    ##pad the sequences
    # Sequences are left-padded / left-truncated to a fixed length of 150.
    # NOTE(review): the model builders pass Config.MAX_REVIEW_LENGTH as the
    # Embedding input_length - confirm it equals 150.
    train_sequence_data = np.asarray(train_sequence_data['sequence'])
    test_sequence_data = np.asarray(test_sequence_data['sequence'])
    train_sequence_data = tf.keras.preprocessing.sequence.pad_sequences(train_sequence_data, maxlen=150, padding='pre', truncating='pre')
    test_sequence_data = tf.keras.preprocessing.sequence.pad_sequences(test_sequence_data, maxlen=150, padding='pre', truncating='pre')
    print('After padding the sequence with the longest length the shape is:',train_sequence_data.shape)
    print(train_sequence_data.max())
    print(test_sequence_data.max())


    ##sanity checks
    # Labels and MRNs from the sequence data and from the static data must
    # line up row by row; otherwise the two model inputs would be mismatched.
    print("check if train labels are same")
    assert np.array_equal(train_labels_seq.to_numpy(), train_labels.to_numpy())

    print("check if test labels are same")
    assert np.array_equal(test_labels_seq.to_numpy(), test_labels.to_numpy())

    print("check if train mrns are same")
    assert np.array_equal(train_mrn_seq.to_numpy(), train_mrn_unsupervised.to_numpy())

    print("check if test mrns are same")
    assert np.array_equal(test_mrn_seq.to_numpy(), test_mrn_unsupervised.to_numpy())

    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        print("Name:", gpu.name, "  Type:", gpu.device_type)

    model = train_lstm_model(train_sequence_data, train_unsupervised_data, train_labels_seq)
    # NOTE(review): the auxiliary data is passed as the third positional
    # argument here; make sure evaluate_lstm_model's signature accepts it.
    results = evaluate_lstm_model(model, test_sequence_data, test_unsupervised_data, test_labels_seq)


In [None]:
###FOR SHAP####

##############################################

# Rebuild the train/test inputs at notebook level (same steps as the
# __main__ driver) so the arrays stay available for the SHAP cells below.

##load train test mrns
df_mrn = pd.read_pickle(MRN_PATH)

##load sequence data
sequence_data = load_sequence_data(df_mrn)
# Sorting by MRN here and below is what the alignment asserts later in this cell rely on.
sequence_data = sequence_data.sort_values(by=['medical_record_number'])

# Keep only rows with at least NA_removal_threshold non-null values.
df_mrn = df_mrn.dropna(axis=0, thresh = NA_removal_threshold)
print("final dataframe shape after dropping NAs" + str(df_mrn.shape))

MRN_train_df, MRN_test_df = train_test_split_custom(df_mrn, test_df_control_ratio)
train_mrn = list(MRN_train_df.medical_record_number)
test_mrn = list(MRN_test_df.medical_record_number)


## keep only the sequence rows whose MRN belongs to the cohort split

train_features_sequence = filter_by_mrn(sequence_data, train_mrn)
test_features_sequence = filter_by_mrn(sequence_data, test_mrn)

# `pop` removes the column from the frame in place and returns it.
train_labels_seq = train_features_sequence.pop('Complication').astype('int')
test_labels_seq = test_features_sequence.pop('Complication').astype('int')

print(train_labels_seq.value_counts())
print(test_labels_seq.value_counts())

train_mrn_seq = train_features_sequence.pop('medical_record_number').astype('int')
test_mrn_seq = test_features_sequence.pop('medical_record_number').astype('int')
    
#matching unsupervised & sequence data 
# MRNs are converted to strings for the lookup below
# (presumably the static frame stores them as strings - verify).
train_mrn_seq_match = train_mrn_seq.map(str)
test_mrn_seq_match = test_mrn_seq.map(str)

train_mrn_seq_match = train_mrn_seq_match.tolist()
test_mrn_seq_match = test_mrn_seq_match.tolist()

##load auxiliary data
unsupervised_data = load_unsupervised_data(DATA_UNSUPERVISED_PATH)
unsupervised_data = unsupervised_data.sort_values(by=['medical_record_number'])
    
# restrict the static data to the patients that have sequence data
train_features_unsupervised = filter_by_mrn(unsupervised_data, train_mrn_seq_match)
test_features_unsupervised = filter_by_mrn(unsupervised_data, test_mrn_seq_match)


train_labels = train_features_unsupervised.pop('Complication').astype('int')
test_labels = test_features_unsupervised.pop('Complication').astype('int')

train_mrn_unsupervised = train_features_unsupervised.pop('medical_record_number').astype('int')
test_mrn_unsupervised = test_features_unsupervised.pop('medical_record_number').astype('int')

    # drop 'Diagnosis__ICD-9__414.00', & 'Diagnosis__ICD-9__414.01', -> are being dropped by columntransformer & pipeline
# Remove the two ICD-9 414.0x columns when they are present.
# errors='ignore' keeps this cell runnable when the static frame does not
# contain them (load_unsupervised_data currently returns only the label and
# MRN columns, so a strict drop raises KeyError).
train_features_unsupervised = train_features_unsupervised.drop(columns=['Diagnosis__ICD-9__414.00', 'Diagnosis__ICD-9__414.01'], errors='ignore')
test_features_unsupervised = test_features_unsupervised.drop(columns=['Diagnosis__ICD-9__414.00', 'Diagnosis__ICD-9__414.01'], errors='ignore')

    ##impute unsupervised data
# Fit the preprocessing on the training frame and apply it to both frames.
train_features_unsupervised, test_features_unsupervised = impute_unsupervised_data(train_features_unsupervised,test_features_unsupervised)

##pad the sequences
# Sequences are left-padded / left-truncated to Config.MAX_REVIEW_LENGTH tokens.
train_features_sequence = np.asarray(train_features_sequence['sequence'])
test_features_sequence = np.asarray(test_features_sequence['sequence'])
train_features_sequence = tf.keras.preprocessing.sequence.pad_sequences(train_features_sequence, maxlen=Config.MAX_REVIEW_LENGTH, padding='pre', truncating='pre')
test_features_sequence = tf.keras.preprocessing.sequence.pad_sequences(test_features_sequence, maxlen=Config.MAX_REVIEW_LENGTH, padding='pre', truncating='pre')
print('After padding the sequence with the longest length the shape is:',train_features_sequence.shape)
print(train_features_sequence.max())
print(test_features_sequence.max())

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

##sanity checks
# Labels and MRNs from the sequence data and from the static data must line
# up row by row; otherwise the two model inputs would be mismatched.
print("check if train labels are same")
assert np.array_equal(train_labels_seq.to_numpy(), train_labels.to_numpy())

print("check if test labels are same")
assert np.array_equal(test_labels_seq.to_numpy(), test_labels.to_numpy())

print("check if train mrns are same")
assert np.array_equal(train_mrn_seq.to_numpy(), train_mrn_unsupervised.to_numpy())

print("check if test mrns are same")
assert np.array_equal(test_mrn_seq.to_numpy(), test_mrn_unsupervised.to_numpy())



#################################
# Hold out 30% of the training rows for validation (fixed seed).
# NOTE: this reassigns train_features_sequence / train_features_unsupervised /
# train_labels, so re-running only this part splits the already reduced
# training set again; re-run the whole cell instead.
train_features_sequence, val_features_sequence, train_features_unsupervised, val_features_unsupervised, train_labels , val_labels = train_test_split(train_features_sequence, train_features_unsupervised, train_labels, test_size = 0.3, random_state = 42)

print(train_features_sequence.shape)
print(val_features_sequence.shape)
print(train_labels.shape)
print(val_labels.shape)

main_input = keras.Input(shape=(train_features_sequence.shape[1],), name='main_input') # dtype='int32'

# This embedding layer encodes the token sequence into a sequence of
# dense Config.EMBEDDING_DIM-dimensional vectors.
x = layers.Embedding(Config.VOCAB_SIZE, Config.EMBEDDING_DIM, input_length=Config.MAX_REVIEW_LENGTH, name='Embedding_1')(main_input)

# The stacked LSTMs transform the vector sequence into a single vector
# containing information about the entire sequence.
lstm_1 = layers.LSTM(100, name="lstm_1", dropout=0.3, return_sequences=True)(x)
lstm_2 = layers.LSTM(100, name='lstm_2', dropout=0.3, return_sequences=True)(lstm_1)
lstm_out = layers.LSTM(100, name='lstm_out', dropout=0.3)(lstm_2)

aux_input=keras.Input(shape=(train_features_unsupervised.shape[1],),name='aux_input')

# Concatenate the LSTM output with the auxiliary input.
concat = concatenate([lstm_out, aux_input])

dense_1 = Dense(100, activation='sigmoid',
                kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01),
                bias_regularizer=regularizers.l2(0.02),
                activity_regularizer=regularizers.l2(0.02), name='dense_1')(concat)

#drop_out2 = tf.keras.layers.Dropout(0.3) (dense_1)

#dense_2 = Dense(100, activation='sigmoid', name='dense_2')(drop_out2)

# Final logistic-regression layer.
main_output = Dense(1, activation='sigmoid', name='main_output')(dense_1)

model = Model(inputs=[main_input, aux_input], outputs=[main_output])


# summary() prints by itself; wrapping it in print() only added a stray "None".
model.summary()

# compile the model (`learning_rate` replaces the deprecated `lr` keyword)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=Config.METRICS)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=20,
    mode='max',
    restore_best_weights=True)

try:
    with tf.device('/device:GPU:2'):
        history = model.fit({'main_input': train_features_sequence, 'aux_input': train_features_unsupervised},
            {'main_output': train_labels},
            epochs=200,
            validation_data=([val_features_sequence,val_features_unsupervised],val_labels),
            batch_size=None, verbose=1, callbacks=[early_stopping], shuffle = True)
except RuntimeError as e:
    # Without a training history nothing below can run; report and re-raise
    # instead of failing later with a NameError on `history`.
    print(e)
    raise


val_predictions = model.predict([val_features_sequence,val_features_unsupervised])
val_auprc = average_precision_score(val_labels, val_predictions, average='micro', pos_label = 1)
val_auc = roc_auc_score(val_labels, val_predictions, average='micro')
val_f1 = f1_score(val_labels, np.where(val_predictions > 0.5, 1, 0), average='micro')
print("val AUPRC is :: " + str(val_auprc))
print("val AUC is :: " + str(val_auc))
print("val F1 is ::" + str(val_f1))


# Report the epoch with the best monitored value. `stopped_epoch` is 0 when
# early stopping never fires and otherwise points at the epoch where training
# stopped, not at the epoch whose weights are restored.
epch = int(np.argmax(history.history['val_auc']))

# NOTE(review): these keys presumably mirror the metric names configured in
# Config.METRICS - confirm.
validation_score = [history.history['val_loss'][epch],
    history.history['val_tp'][epch],
    history.history['val_fp'][epch],
    history.history['val_tn'][epch],
    history.history['val_fn'][epch],
    history.history['val_accuracy'][epch],
    history.history['val_precision'][epch],
    history.history['val_recall'][epch],
    history.history['val_auc'][epch],
    history.history['val_f1_score'][epch],
    history.history['val_average_precision'][epch]]

print('Validation Score:',validation_score)



##########

# Evaluate the trained two-input model on the held-out test set.
results = model.evaluate([test_features_sequence,test_features_unsupervised],
                         test_labels, batch_size=256, verbose=1)

print(model.metrics_names)
print(results)
test_predictions = model.predict([test_features_sequence,test_features_unsupervised])
test_auprc = average_precision_score(test_labels, test_predictions, average='micro', pos_label = 1)
test_auc = roc_auc_score(test_labels, test_predictions, average='micro')
test_f1 = f1_score(test_labels, np.where(test_predictions > 0.5, 1, 0), average='micro')

precision, recall, thresholds = precision_recall_curve(test_labels, test_predictions)
# Guard the 0/0 case: a NaN F-score would be picked by np.argmax.
fscore = np.divide(2 * precision * recall, precision + recall,
                   out=np.zeros_like(precision, dtype=float),
                   where=(precision + recall) > 0)
# precision/recall hold one more entry than thresholds; the final point has
# no threshold, so it is excluded from the search.
ix = int(np.argmax(fscore[:-1]))
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f ' % (thresholds[ix], fscore[ix], precision[ix], recall[ix]))

print("test AUPRC is :: " + str(test_auprc))
print("test AUC is :: " + str(test_auc))
print("test F1 is ::" + str(test_f1))

#model = train_lstm_model(train_sequence_data, train_unsupervised_data, train_labels_seq)
#results = evaluate_lstm_model(model, test_sequence_data, test_unsupervised_data, test_labels_seq)


In [None]:
# Shapes of the two model inputs and of the labels, one per line.
print(train_features_sequence.shape,
      train_features_unsupervised.shape,
      train_labels.shape,
      sep='\n')

In [None]:
#### SHAP #### 

In [None]:
#pip install h5py==2.10.0
#conda install -c conda-forge hdf5=2.10.0

# Save the baseline model to an .h5 file in the current working directory
# (the commented install hints above pin h5py / hdf5 versions for this).
model.save('Baseline_model_all3_FINAL.h5')

In [None]:
# Reload the saved model. The custom metric functions are not part of Keras,
# so they are supplied through `custom_objects` under the names
# "f1_score" and "average_precision".
loaded_model = tf.keras.models.load_model('Baseline_model_all3_FINAL.h5', 
                custom_objects={"f1_score":Config.f1_score, 
              "average_precision":Config.average_precision}
)

print(loaded_model.summary())

In [None]:
#SHAP Feature Importance#

import shap

# Background sample for the explainer: the first 1000 rows of both model
# inputs, taken from whatever the preceding cells left in the train_* names.
explainer = shap.DeepExplainer(loaded_model, [train_features_sequence[0:1000,:],np.asarray(train_features_unsupervised)[0:1000,:]])
# SHAP values for the first 1000 test rows. The cells below index the result
# as shap_values[0][0] (sequence input) and shap_values[0][1] (auxiliary input).
shap_values = explainer.shap_values([test_features_sequence[0:1000,:],np.asarray(test_features_unsupervised)[0:1000,:]])


In [None]:
# First line: sequence input; second line: unsupervised (auxiliary) input.
print(shap_values[0][0].shape,
      shap_values[0][1].shape,
      sep='\n')

In [None]:
# For sequence data
# Fetching unique concepts for sequence data
unique_concepts = pd.read_csv('/home/kiwitn01/master_thesis_hypertension-complications/Time_Series/unique_concepts_30_days_no_numeric.csv')
unique_concepts
# Rename the two CSV columns to 'name' and 'code'.
unique_concepts.columns =[ 'name', 'code'] 
# to_dict('dict') yields {column -> {row index -> value}}.
concepts_dictionary = unique_concepts.to_dict('dict')
# Keep only the 'name' column: {row index -> concept name}.
concepts_dictionary = concepts_dictionary['name']
# Invert it: {concept name -> row index}.
# NOTE(review): the 'code' column is not used; the numeric id is the CSV row
# index. Confirm that the row index matches the token ids in the sequences
# (index 0 maps to the first concept, not to a padding entry).
concepts_dictionary = dict(zip(concepts_dictionary.values(), concepts_dictionary.keys()))
concepts_dictionary 

In [None]:
# Creating the number to text mapping for the selected sequence data with '0' as 'NONE'
# Reverse lookup: numeric id -> concept name.
num2word = {number: word for word, number in concepts_dictionary.items()}
# Translate the first 100 test sequences token by token; ids that have no
# entry in num2word become "NONE".
x_test_words = np.stack([
    np.array([num2word.get(token, "NONE") for token in test_features_sequence[row]])
    for row in range(100)
])



In [None]:
# Shape of the translated sequences, then shape of the sequence SHAP values.
print(x_test_words.shape,
      shap_values[0][0].shape,
      sep='\n')

In [None]:
# Collecting feature importance based on the shap values
# Group the sequence SHAP values by concept name:
# {concept name -> list of SHAP values observed for that concept}.
shap_values_dict = {}
for i, row_words in enumerate(x_test_words):
    for j, word in enumerate(row_words):
        if word == 'NONE':
            # positions without a known concept are skipped
            continue
        shap_values_dict.setdefault(word, []).append(shap_values[0][0][i][j])

In [None]:
# Aggregate the collected SHAP values per sequence concept.
# NOTE: the stored value is the signed SUM of a concept's SHAP values, not an
# average. The previous code wrapped the sum in a one-element list, reduced
# that list and divided by its length (1), which is the sum again; the detour
# is replaced by a direct sum with the same result.
from functools import reduce  # no longer used in this cell; kept in case other cells rely on it
sequence_feature_importance_values = {
    concept: sum(values) for concept, values in shap_values_dict.items()
}

In [None]:
# Number of distinct sequence concepts that received an importance value
len(sequence_feature_importance_values)

In [None]:
from collections import OrderedDict

# Order the concepts from the largest to the smallest importance value.
# The sort is stable, so concepts with equal values keep their original
# relative order; the resulting dict preserves that ranking on iteration.
ranked_concepts = sorted(
    sequence_feature_importance_values,
    key=sequence_feature_importance_values.get,
    reverse=True,
)
sorted_dict = {
    concept: sequence_feature_importance_values[concept]
    for concept in ranked_concepts
}
sorted_dict

In [None]:
# Bar plot of the highest-ranked sequence concepts.

import seaborn as sns

# One row per concept: "name" holds the concept, "rank" its importance value.
sequence_importance = (
    pd.DataFrame.from_dict(sorted_dict, orient='index', columns=['rank'])
    .reset_index()
    .rename(columns={"index": "name"})
)
# Despite its name, top_10 holds the first 20 rows of the ranking.
top_10 = sequence_importance.head(20)

ax = sns.barplot(x="rank", y="name", data=top_10)

In [None]:
#####

In [None]:
# For aggregated data: shape of the SHAP array stored at index 1 of
# shap_values[0] (the unsupervised input)
print(shap_values[0][1].shape)


In [None]:
# Get the column names back: reload and re-split the cohort data so that the
# SHAP values of the unsupervised input can be labelled with feature names.
# NOTE(review): this cell rebinds train_/test_features_sequence and
# train_/test_features_unsupervised, which the SHAP explainer cell above also
# reads; re-running that cell after this one will pick up the objects created
# here instead -- confirm this is intended.

## load train/test MRNs (medical record numbers)
# NOTE(review): unpickling can execute arbitrary code; MRN_PATH must point to a trusted file.
df_mrn = pd.read_pickle(MRN_PATH)

## load sequence data
sequence_data = load_sequence_data(df_mrn)
sequence_data = sequence_data.sort_values(by=['medical_record_number'])

# keep only rows that have at least NA_removal_threshold non-null values
df_mrn = df_mrn.dropna(axis=0, thresh = NA_removal_threshold)
print("final dataframe shape after dropping NAs" + str(df_mrn.shape))

# split into train and test cohorts (train_test_split_custom is defined elsewhere)
MRN_train_df, MRN_test_df = train_test_split_custom(df_mrn, test_df_control_ratio)
train_mrn = list(MRN_train_df.medical_record_number)
test_mrn = list(MRN_test_df.medical_record_number)


## filter the sequence data by the train / test MRNs (filter_by_mrn is defined elsewhere)
    
train_features_sequence = filter_by_mrn(sequence_data, train_mrn)
test_features_sequence = filter_by_mrn(sequence_data, test_mrn)

# pop() removes the label column from the feature frames and returns it
train_labels_seq = train_features_sequence.pop('Complication').astype('int')
test_labels_seq = test_features_sequence.pop('Complication').astype('int')

print(train_labels_seq.value_counts())
print(test_labels_seq.value_counts())

# likewise remove the identifier column from the feature frames
train_mrn_seq = train_features_sequence.pop('medical_record_number').astype('int')
test_mrn_seq = test_features_sequence.pop('medical_record_number').astype('int')
    
# matching unsupervised & sequence data: the MRNs of the sequence rows, as
# strings, are used below to filter the unsupervised data
train_mrn_seq_match = train_mrn_seq.map(str)
test_mrn_seq_match = test_mrn_seq.map(str)

train_mrn_seq_match = train_mrn_seq_match.tolist()
test_mrn_seq_match = test_mrn_seq_match.tolist()

## load auxiliary (unsupervised) data
unsupervised_data = load_unsupervised_data(DATA_UNSUPERVISED_PATH)
unsupervised_data = unsupervised_data.sort_values(by=['medical_record_number'])
    
# filter the unsupervised data by the MRNs taken from the sequence data
train_features_unsupervised = filter_by_mrn(unsupervised_data, train_mrn_seq_match)
test_features_unsupervised = filter_by_mrn(unsupervised_data, test_mrn_seq_match)


# remove label and identifier columns from the unsupervised feature frames
train_labels = train_features_unsupervised.pop('Complication').astype('int')
test_labels = test_features_unsupervised.pop('Complication').astype('int')

train_mrn_unsupervised = train_features_unsupervised.pop('medical_record_number').astype('int')
test_mrn_unsupervised = test_features_unsupervised.pop('medical_record_number').astype('int')

# drop 'Diagnosis__ICD-9__414.00' & 'Diagnosis__ICD-9__414.01' -> these are being dropped by the columntransformer & pipeline
train_features_unsupervised = train_features_unsupervised.drop(['Diagnosis__ICD-9__414.00', 'Diagnosis__ICD-9__414.01'], axis = 1) 
test_features_unsupervised = test_features_unsupervised.drop(['Diagnosis__ICD-9__414.00', 'Diagnosis__ICD-9__414.01'], axis = 1)

# bare expression in the middle of the cell: evaluated, but nothing is displayed
len(train_features_unsupervised.columns)

# label the SHAP values of the unsupervised input with the remaining column
# names; the DataFrame constructor raises if the number of columns differs
shap_values_df = pd.DataFrame(shap_values[0][1], columns = train_features_unsupervised.columns) 

In [None]:
# Aggregate the SHAP values of the unsupervised input per feature column.
# NOTE: the stored value is the signed SUM over all rows of shap_values_df.
# The previous code additionally called .mean() on that already-summed scalar,
# which returned it unchanged; the no-op call has been removed.
shap_vals = {
    column_name: shap_values_df[column_name].sum()
    for column_name in shap_values_df.columns
}
print(shap_vals)

from collections import OrderedDict
# Rank the features from the largest to the smallest summed SHAP value.
sorted_agg_dict = dict(sorted(shap_vals.items(), key=lambda x: x[1], reverse=True))

# Plotting the aggregated feature importance
# One row per feature: "name" holds the column name, "rank" the summed value.
aggregated_data_importance = (
    pd.DataFrame.from_dict(sorted_agg_dict, orient='index', columns=['rank'])
    .reset_index()
    .rename(columns={"index": "name"})
)
# Despite its name, top_10 holds the first 15 rows of the ranking.
top_10 = aggregated_data_importance.head(15)

ax = sns.barplot(x="rank", y="name", data=top_10)


In [None]:
# Combining both importance dictionaries
def merge_two_dicts(x, y):
    """Return a new dict holding the entries of x updated with those of y.

    Neither input is modified. When a key occurs in both, the value from y is
    used and the key keeps the position it had in x.
    """
    merged = x.copy()
    merged.update(y)
    return merged

# Merge the sequence-concept ranking with the unsupervised-feature ranking.
# If a name occurs in both, the value from sorted_agg_dict is used.
both_dict = merge_two_dicts(sorted_dict, sorted_agg_dict)
both_dict_sorted = dict(
    sorted(both_dict.items(), key=lambda item: item[1], reverse=True)
)

# Plotting the combined feature importance
combined_ranking = pd.DataFrame.from_dict(
    both_dict_sorted, orient='index', columns=['rank']
).reset_index()
# The first (highest-valued) row is removed before plotting.
# NOTE(review): the row is dropped by position, not by name -- presumably the
# age feature, given the "without_age" figure filename used when saving; confirm.
aggregated_data_importance = combined_ranking.drop(combined_ranking.index[0]).rename(
    columns={"index": "name"}
)
# Despite its name, top_10 holds the first 20 remaining rows.
top_10 = aggregated_data_importance.head(20)

ax = sns.barplot(
    x="rank",
    y="name",
    data=top_10,
    palette=("PuRd"),
    alpha=0.8,
    linewidth=4,
)

In [None]:
# Resize the figure that owns the last bar plot and write it to disk.
figure = ax.get_figure()
figure.set_size_inches(20, 8)

#figure.tight_layout()
# The resolution keyword is `dpi`; it was misspelled as `dbi`, so the requested
# 300 dpi was never applied to the saved image.
figure.savefig('/home/kiwitn01/master_thesis_hypertension-complications/Deep_Learning/SHAP/Final_SHAP_DatasetA/All3_Final_Shap_sum_values_final_without_age.png', bbox_inches='tight', dpi=300)


In [None]:
# Always save under the correct name.
# Writes one line per feature (name, value) in ranking order, without a header row.
combined_ranking_frame = pd.DataFrame.from_dict(data=both_dict_sorted, orient='index')
combined_ranking_frame.to_csv(
    '/home/kiwitn01/master_thesis_hypertension-complications/Deep_Learning/SHAP/Final_SHAP_DatasetA/SHAP_lists/All3_sum_values_final.csv',
    header=False,
)
