In [118]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import AUC
import tensorflow as tf

# Load your data
data = pd.read_csv('admit_modified.csv').dropna(subset = ['LOS', 'AGE', 'GENDER_M', 'ETHNICITY_Asian', 'ETHNICITY_Black', 'ETHNICITY_Hispanic', 'ETHNICITY_Native_Hawaiian', 'ETHNICITY_Other', 'ETHNICITY_White']).reset_index(drop=True)

# Ensure that each item in 'PROCEDURE_AND_DIAGNOSIS_ICD' is a list of strings
# Assuming the column is already in the correct format

# Define a fixed window size for Word2Vec
# You may adjust this based on your dataset's characteristics
window_size = 71

# Initialize Word2Vec model with the fixed window size
skipgram = Word2Vec(vector_size=100, window=window_size, min_count=1, sg=1)

# Build the vocabulary and train the model

all_procedures = [data['PROCEDURE_AND_DIAGNOSIS_ICD'][i].replace("'", "")[1:-1].split(", ") for i in range(len(data))]

skipgram.build_vocab(all_procedures)
skipgram.train(all_procedures, total_examples=skipgram.corpus_count, epochs=skipgram.epochs)

# Extract embeddings
embeddings = {word: skipgram.wv[word] for word in skipgram.wv.index_to_key}

# Map each 'PROCEDURE_AND_DIAGNOSIS_ICD' to its embedding
data['embedding'] = data['PROCEDURE_AND_DIAGNOSIS_ICD'].apply(lambda x: np.mean([embeddings.get(word, np.zeros(100)) for word in x], axis=0))

# Prepare other features and normalize them
feature_columns = ['LOS', 'AGE', 'GENDER_M', 'ETHNICITY_Asian', 'ETHNICITY_Black', 'ETHNICITY_Hispanic', 'ETHNICITY_Native_Hawaiian', 'ETHNICITY_Other', 'ETHNICITY_White']


# Combine embeddings with other features
X = pd.concat([data['embedding'].apply(pd.Series), data[feature_columns]], axis=1)
y = data['MORTALITY_30_DAY']  # Target column

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and compile the neural network
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=109)) # 100 embedding dims + 9 other features
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(64))
model.add(Dense(8))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Suitable for binary classification

# Compile the model for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC')])

# Train the model
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Predict probabilities on the test set
y_pred_proba = model.predict(X_test)

# Compute AUROC score
auroc_score = roc_auc_score(y_test, y_pred_proba)

print(f"AUROC Score: {auroc_score}")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
AUROC Score: 0.6694130253601619


In [115]:
# After training the skipgram model...

# Retrieve the list of words in the model's vocabulary
vocab = skipgram.wv.index_to_key

# Print the vocabulary
print("Vocabulary of the skip-gram model:")
print(vocab)
embeddings


Vocabulary of the skip-gram model:
['diag_4019', '41401', '3893', '42731', '4280', '25000', '2724', 'proc_3961', 'proc_9604', '966', 'proc_9671', '5849', '9904', '51881', '2720', '53081', '5990', '8856', '3615', '2859', '9672', '2851', '2449', '496', '486', '3891', '3722', 'proc_8872', '2762', 'V5861', '3051', 'diag_311', '3995', '5070', 'diag_412', '4240', '41071', '3723', '4513', '99592', '2875', '5859', 'V1582', '9915', '40390', '4241', 'V4581', '5119', 'diag_9971', '2761', 'V4582', '0389', '40391', '3324', '8853', '32723', '3612', 'V5867', '42789', '45829', '5180', '49390', '4111', '5856', '4168', 'proc_9920', '2749', 'proc_311', '3613', '2760', '9907', '5845', '78552', '78039', '2767', '331', '3521', 'diag_5185', '4311', '4589', '40', '4271', '4254', '99811', '73300', '27800', '7907', '3572', 'proc_3897', '3404', 'E8798', 'proc_3491', '30000', 'V1251', 'E8788', '8841', '66', '60000', '28521', '2930', '3895', 'E8782', '3607', '99812', '9390', 'diag_4439', '5491', '3606', '00845', '

{'diag_4019': array([ 0.16930084,  0.38188916,  0.14322434, -0.14960708,  0.15934736,
        -0.16756941,  0.14980908,  0.4280844 ,  0.01199308,  0.00504476,
        -0.0703456 , -0.20264682,  0.2225384 , -0.1493732 ,  0.3045713 ,
        -0.07839283,  0.02321472, -0.3445324 ,  0.09772148, -0.4606458 ,
         0.23179038, -0.10221571,  0.11208873, -0.07535817, -0.06883818,
        -0.37555793, -0.17839341,  0.04427835, -0.2159358 , -0.05193114,
         0.06393477, -0.0701663 , -0.06290389, -0.24086812, -0.22170882,
         0.29186696,  0.3294413 ,  0.05151779, -0.14852113,  0.15668678,
         0.35649258, -0.32647786, -0.08038118, -0.23785622, -0.01044213,
        -0.13299784, -0.19290237, -0.23747312, -0.2293529 ,  0.18534692,
        -0.06215005,  0.23235975,  0.0528097 ,  0.03208113,  0.10869468,
         0.2388747 ,  0.0965471 , -0.1804731 , -0.00747793,  0.13180725,
         0.28815395,  0.0277397 , -0.15764794,  0.14760384, -0.15450543,
         0.4806996 ,  0.05229494, -0.0