In [71]:
import os
import warnings
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import roc_auc_score, average_precision_score

warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [65]:
# Load the admissions table, parse the ICD code lists and make a stratified
# 80/20 train/test split on the 30-day mortality label.
feature_columns = [
    'LOS', 'AGE', 'GENDER_M',
    'ETHNICITY_Asian', 'ETHNICITY_Black', 'ETHNICITY_Hispanic',
    'ETHNICITY_Native_Hawaiian', 'ETHNICITY_Other', 'ETHNICITY_White',
]


def _parse_code_list(raw):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b']."""
    unquoted = raw.replace("'", "")
    # Strip the surrounding brackets, then split on the list separator.
    return unquoted[1:-1].split(", ")


# Rows missing any tabular feature are dropped before anything else happens.
data = (
    pd.read_csv('admit_modified.csv', index_col=False)
    .dropna(subset=feature_columns)
    .reset_index(drop=True)
)
data['codes'] = data['PROCEDURE_AND_DIAGNOSIS_ICD'].apply(_parse_code_list)

y = data['MORTALITY_30_DAY']
X = data.drop(columns=['MORTALITY_30_DAY'])

# Every split gets a fresh 0..n-1 index so rows can be addressed positionally.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(28718, 49) (28718,) (7180, 49) (7180,)


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [66]:
# train skip-gram model
window_size = 71 # max(X_train['codes'].apply(lambda x: len(x))) = 71
skipgram = Word2Vec(vector_size=100, window=window_size, min_count=1, sg=1)
# The vocabulary is built from every row (train and test) so that each code
# has an entry in the embedding table, but the weights are only trained on
# the training split.
# NOTE(review): codes that occur only in test rows are presumably left at
# their initial vectors -- confirm this is acceptable.
skipgram.build_vocab(data['codes'])
# total_examples must describe the corpus actually passed to train().
# skipgram.corpus_count is the size of the corpus given to build_vocab (all
# rows), which does not match the training split used here.
skipgram.train(X_train['codes'], total_examples=len(X_train['codes']), epochs=skipgram.epochs)
# Extract embeddings
embeddings = {word: skipgram.wv[word] for word in skipgram.wv.index_to_key}
embeddings

{'diag_4019': array([-0.11404552,  0.22197641, -0.04064567, -0.33815977,  0.04780869,
        -0.07465956,  0.10358305,  0.86963004, -0.03640486, -0.05822937,
         0.19611755, -0.1107047 ,  0.09522054,  0.1799139 ,  0.19793206,
        -0.02712559, -0.0310963 , -0.00893739, -0.01856768, -0.20989007,
         0.2337316 , -0.08024856, -0.25752985, -0.06748597, -0.1558223 ,
        -0.21433808,  0.07580913, -0.14886913, -0.05446392,  0.22592215,
         0.06932551,  0.13771582, -0.04915157, -0.00223882, -0.2645313 ,
         0.20851299,  0.08895487, -0.38338983, -0.2649923 ,  0.22203778,
         0.01668598, -0.2172358 , -0.1468225 , -0.063306  ,  0.25295144,
        -0.09673244, -0.2973475 , -0.27587017,  0.16540399,  0.01803732,
         0.27931446,  0.08564819, -0.11931816, -0.18907818,  0.21836966,
         0.2059934 ,  0.39684546,  0.38958225, -0.1420959 , -0.10844444,
         0.25103742, -0.0192792 , -0.38947198,  0.40839097,  0.00342365,
         0.08260316, -0.0521316 ,  0.1

In [67]:
def codes_to_emb(codes, table=None):
    """Average the embedding vectors of a list of codes.

    Parameters
    ----------
    codes : iterable of hashable
        Codes to look up.
    table : mapping, optional
        Code -> vector mapping. When omitted, the module-level ``embeddings``
        dictionary is used, which keeps the original one-argument call working.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all codes found in the table.
        Codes missing from the table are skipped. If no code is found (or
        ``codes`` is empty) a zero vector shaped like the table's vectors is
        returned, so every row yields a vector of the same width.
    """
    lookup = embeddings if table is None else table
    # Skip unknown codes: a None entry would otherwise poison the array.
    known = [lookup[code] for code in codes if code in lookup]
    if known:
        return np.mean(np.array(known), axis=0)
    if not lookup:
        # Nothing to infer a width from.
        return np.zeros(0, dtype=np.float32)
    # Match the shape and dtype of the stored vectors.
    return np.zeros_like(next(iter(lookup.values())))
def _with_code_embeddings(frame):
    """Return the model input for ``frame``: embedding columns + tabular features.

    Each row's ``codes`` list is reduced to one vector with ``codes_to_emb``.
    The vectors are stacked into a 2-D array in a single step (building one
    ``pd.Series`` per row via ``.apply(pd.Series)`` is far slower on tens of
    thousands of rows) and placed in integer-labelled columns, followed by the
    columns listed in ``feature_columns``.
    """
    vectors = frame['codes'].apply(codes_to_emb)
    embedding_block = pd.DataFrame(np.vstack(vectors.tolist()), index=frame.index)
    return pd.concat([embedding_block, frame[feature_columns]], axis=1)


# NOTE: this rebinds X_train / X_test to frames without a 'codes' column, so
# the cell cannot be re-run without first re-running the data-loading cell.
X_train = _with_code_embeddings(X_train)
X_test = _with_code_embeddings(X_test)


In [68]:
# Sanity check: shapes of the four splits after the embedding columns were added.
print(*(split.shape for split in (X_train, y_train, X_test, y_test)))

(28718, 109) (28718,) (7180, 109) (7180,)


In [72]:


class MetricsCallback(Callback):
    """Keras callback that prints AUROC and AUPRC for two datasets every epoch.

    The callback keeps references to a training set and a test set and, at the
    end of each epoch, scores both with the current model and prints the four
    resulting metrics on one line.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        super().__init__()
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test

    def on_epoch_end(self, epoch, logs=None):
        """Score both datasets with the current weights and print the metrics.

        ``epoch`` is zero-based, so it is reported as ``epoch + 1``.
        ``logs`` is accepted for interface compatibility and is not used.
        """
        # Predict on train first, then test; pair each with its labels.
        scored = [
            (labels, self.model.predict(features))
            for features, labels in (
                (self.X_train, self.y_train),
                (self.X_test, self.y_test),
            )
        ]
        auroc_train, auroc_test = (
            roc_auc_score(labels, scores) for labels, scores in scored
        )
        auprc_train, auprc_test = (
            average_precision_score(labels, scores) for labels, scores in scored
        )
        print(f"\nEpoch {epoch+1}: AUROC Train: {auroc_train}, AUROC Test: {auroc_test}, "
              f"AUPRC Train: {auprc_train}, AUPRC Test: {auprc_test}")

def cnn(X_train, y_train, X_test, y_test, epochs=30, batch_size=32):
    """Build, compile and train the binary classifier; return the fitted model.

    Despite the name, the network is a stack of fully connected (Dense)
    layers with dropout, ending in a single sigmoid unit.

    Parameters
    ----------
    X_train, y_train : training features and binary labels.
    X_test, y_test : held-out features and labels. They are passed to Keras as
        ``validation_data`` and to ``MetricsCallback`` for per-epoch reporting.
    epochs : int, default 30
        Number of training epochs.
    batch_size : int, default 32
        Mini-batch size.

    Returns
    -------
    The trained Keras ``Sequential`` model.
    """
    # Derive the input width from the data instead of hard-coding 109, so the
    # model keeps working if the embedding size or feature list changes.
    n_features = X_train.shape[1]

    # Build the neural network
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=n_features))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    # NOTE(review): the next two layers have no activation (linear); confirm
    # that this is intentional.
    model.add(Dense(64))
    model.add(Dense(8))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))  # Suitable for binary classification

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Initialize metrics callback
    metrics_callback = MetricsCallback(X_train, y_train, X_test, y_test)

    # Train the model
    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        callbacks=[metrics_callback],
    )

    return model

# Example usage
model = cnn(X_train, y_train, X_test, y_test)


Epoch 1/30

Epoch 1: AUROC Train: 0.6906791938230163, AUROC Test: 0.6704736805770253, AUPRC Train: 0.07878583640001435, AUPRC Test: 0.07844826572229883
Epoch 2/30

Epoch 2: AUROC Train: 0.7956386592790399, AUROC Test: 0.7828900059560078, AUPRC Train: 0.13966976866157768, AUPRC Test: 0.13484912228287993
Epoch 3/30

Epoch 3: AUROC Train: 0.811230964628677, AUROC Test: 0.800442862845874, AUPRC Train: 0.16005531461053213, AUPRC Test: 0.14839528764777268
Epoch 4/30

Epoch 4: AUROC Train: 0.8169653021406944, AUROC Test: 0.8010984953897756, AUPRC Train: 0.17934860762709245, AUPRC Test: 0.15360010462669707
Epoch 5/30

Epoch 5: AUROC Train: 0.8247067677461083, AUROC Test: 0.8144491293036873, AUPRC Train: 0.186294646051831, AUPRC Test: 0.16127373588198016
Epoch 6/30

Epoch 6: AUROC Train: 0.8285434719959425, AUROC Test: 0.8161917654439198, AUPRC Train: 0.1894226425032835, AUPRC Test: 0.16565240888420052
Epoch 7/30

Epoch 7: AUROC Train: 0.8320599524766953, AUROC Test: 0.8218433265483154, AUPRC T