In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, average_precision_score

In [2]:
# 1. Load the dataset
data = pd.read_csv("merged_data.csv")

# 2. Select features and target variable
list_bp = ['avg_dbp', 'avg_diff', 'avg_sbp', 'max_sbp']
list_ed = ['age', 'sex', 'language', 'insurance_type', 'primary_care', 'ed_name',
           'bpa_response', 'htn_on_pl', 'htn_on_pmh', 'hld_on_pl', 'hld_on_pmh',
           'family_dm', 'tobacco_user', 'htn_meds', 'statin_meds', 'disposition',
           'detailed_race', 'weight', 'bmi', 'hba1c', 'height', 'sbp_1st', 'dbp_1st', 'poct_gluc']
list_lab = ['max_value_GLUCOSE', 'avg_value_GLUCOSE', 'max_value_CREATININE',
            'min_value_CREATININE', 'min_value_GLUCOSE', 'avg_value_CREATININE',
            'avg_value_HEMOGLOBIN A1C', 'max_value_HEMOGLOBIN A1C', 'min_value_HEMOGLOBIN A1C',
            'min_value_GLUCOSE, POC', 'avg_value_GLUCOSE, POC', 'max_value_GLUCOSE, POC']
list_geo = ['total_pop', 'households', 'housing_units', 'p_children', 'p_elderly',
            'p_adults', 'p_female', 'mdn_age', 'p_nhwhite', 'p_nhblack', 'p_hispanic',
            'p_nhasian', 'p_other', 'p_moved', 'p_longcommute', 'p_marriednone', 'p_marriedkids',
            'p_singlenone', 'p_malekids', 'p_femalekids', 'p_cohabitkids', 'hh_mdnincome']
list_visit = ['visit_type']

# Define features and target
X_all = data[list_bp + list_ed + list_lab + list_geo + list_visit]
y = data['pcp_followup'].map({'Yes': 1, 'No': 0}).astype(int)

In [3]:
# Data preprocessing
numeric_cols = X_all.select_dtypes(include=['number']).columns
categorical_cols = X_all.select_dtypes(exclude=['number']).columns

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

X_preprocessed = preprocessor.fit_transform(X_all)
if hasattr(X_preprocessed, "toarray"):
    X_preprocessed = X_preprocessed.toarray()

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [4]:
# Build Neural Network Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),  # First hidden layer
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),  # Second hidden layer
    BatchNormalization(),
    Dropout(0.3),

    Dense(32, activation='relu'),  # Third hidden layer
    BatchNormalization(),
    Dropout(0.2),

    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32, verbose=1)

# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Compute accuracy & classification metrics
accuracy = accuracy_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_prob)
auc_pr = average_precision_score(y_test, y_pred_prob)

print(f"Test Accuracy: {accuracy:.2f}")
print(f"AUC-ROC: {auc_roc:.2f}")
print(f"AUC-PR: {auc_pr:.2f}")
print(classification_report(y_test, y_pred))

Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.5890 - loss: 0.7416 - val_accuracy: 0.8039 - val_loss: 0.6786
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5335 - loss: 0.7985 - val_accuracy: 0.8039 - val_loss: 0.6619
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5011 - loss: 0.8047 - val_accuracy: 0.8039 - val_loss: 0.6468
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5627 - loss: 0.7446 - val_accuracy: 0.8039 - val_loss: 0.6324
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5598 - loss: 0.8018 - val_accuracy: 0.8039 - val_loss: 0.6200
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6055 - loss: 0.7615 - val_accuracy: 0.8039 - val_loss: 0.6084
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
