# Diabetes Patient Prediction using Neural Networks & Keras

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

We will be trying to predict whether a patient has diabetes using a Diabetes Health Indicators Dataset from the CDC. 

## Load and Preprocess Data

In [26]:
# Read the CDC diabetes health-indicators CSV into a DataFrame
cdc_data = pd.read_csv(r"C:\Users\broga\OneDrive\Desktop\MSBA\Adv_ML\Labs\Lab_4\diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

# Columns to treat as categorical (the target, Diabetes_binary, is included)
cat_columns = [
    'Diabetes_binary',
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk',
    'Sex', 'Age', 'Education', 'Income', 'GenHlth',
]

# Cast every listed column to pandas 'category' in one pass; columns not
# listed keep the dtype they were read with
cdc_data = cdc_data.astype(dict.fromkeys(cat_columns, 'category'))

# Sanity check: show the resulting dtypes and the first row
print(cdc_data.dtypes)
cdc_data.head(1)

Diabetes_binary         category
HighBP                  category
HighChol                category
CholCheck               category
BMI                      float64
Smoker                  category
Stroke                  category
HeartDiseaseorAttack    category
PhysActivity            category
Fruits                  category
Veggies                 category
HvyAlcoholConsump       category
AnyHealthcare           category
NoDocbcCost             category
GenHlth                 category
MentHlth                 float64
PhysHlth                 float64
DiffWalk                category
Sex                     category
Age                     category
Education               category
Income                  category
dtype: object


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0


In [8]:
# Separate the target (y) from the predictors (X)
y = cdc_data['Diabetes_binary']
X = cdc_data.drop(columns='Diabetes_binary')

# Hold out 20% of the rows as a test set; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Reusable column transformer: one-hot encode category/object columns and
# standardize numeric columns, returning a pandas DataFrame
enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
scaler = StandardScaler()

ct = ColumnTransformer(
    transformers=[
        ("enc", enc, make_column_selector(dtype_include=['category', 'object'])),
        ("scaler", scaler, make_column_selector(dtype_include=np.number)),
    ]
)
ct.set_output(transform="pandas")

# Fit the encoder/scaler on the training rows only, then apply that fitted
# transform to the test rows, so no test information leaks into training
X_train_preprocessed = ct.fit_transform(X_train)
X_test_preprocessed = ct.transform(X_test)

print(f"X train preprocessed size: {X_train_preprocessed.shape}")
print(f"X test preprocessed size: {X_test_preprocessed.shape}")

X train preprocessed size: (56553, 63)
X test preprocessed size: (14139, 63)


The CDC data target variable (Diabetes_binary) has already been balanced and therefore we will not have to rebalance the data at all.

## Predictive Modeling with Neural Networks & XGBoost

### Model 1:

Our first model is relatively simple, with two small hidden layers and a sigmoid activation function in the output layer. It achieves essentially the same result as the more complex models below, at around 75% accuracy.

In [21]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialization and
# batch shuffling repeat across runs (seed 1 matches the train/test split).
# Some GPU kernels may still be nondeterministic.
keras.utils.set_random_seed(1)

# Model definition: two 16-unit ReLU hidden layers feeding a single sigmoid
# unit that outputs one probability per row
inputs = keras.Input(shape=(X_train_preprocessed.shape[1],))
x = layers.Dense(16, activation='relu')(inputs)
x = layers.Dense(16, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model1 = keras.Model(inputs=inputs, outputs=outputs, name="model_1")

# Compilation: binary cross-entropy matches the single sigmoid output
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model1.summary()

# Fit the model; validation_split reserves 20% of the training rows for
# per-epoch validation metrics
history1 = model1.fit(X_train_preprocessed, y_train, batch_size=64, epochs=50, validation_split=0.2)

# Predict probabilities on the held-out test set and threshold at 0.5 to get
# 0/1 class labels
y_pred_proba = model1.predict(X_test_preprocessed)
y_pred1 = (y_pred_proba > 0.5).astype(int)

# Calculate test-set metrics
accuracy = accuracy_score(y_test, y_pred1)
precision = precision_score(y_test, y_pred1)
recall = recall_score(y_test, y_pred1)
f1 = f1_score(y_test, y_pred1)
conf_mat = confusion_matrix(y_test, y_pred1)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_mat)


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 63)]              0         
                                                                 
 dense_9 (Dense)             (None, 16)                1024      
                                                                 
 dense_10 (Dense)            (None, 16)                272       
                                                                 
 dense_11 (Dense)            (None, 1)                 17        
                                                                 
Total params: 1313 (5.13 KB)
Trainable params: 1313 (5.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


### Model 2:

Model 2 is similar to Model 1 but more complex, and its performance is very similar. It is interesting to note that even with more neurons, batch normalization, and dropout, I was unable to increase the accuracy past 0.75.

In [22]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialization,
# dropout masks and batch shuffling repeat across runs (seed 1 matches the
# train/test split). Some GPU kernels may still be nondeterministic.
keras.utils.set_random_seed(1)

# Model definition: two 64-unit ReLU hidden layers, each followed by batch
# normalization and 50% dropout, feeding a single sigmoid output unit
inputs = keras.Input(shape=(X_train_preprocessed.shape[1],))
x = layers.Dense(64, activation='relu')(inputs)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model2 = keras.Model(inputs=inputs, outputs=outputs, name="model_2")

# Compilation: binary cross-entropy matches the single sigmoid output
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model2.summary()

# Fit the model; validation_split reserves 20% of the training rows for
# per-epoch validation metrics
history2 = model2.fit(X_train_preprocessed, y_train, batch_size=32, epochs=50, validation_split=0.2)

# Predict probabilities on the held-out test set and threshold at 0.5 to get
# 0/1 class labels
y_pred_proba = model2.predict(X_test_preprocessed)
y_pred2 = (y_pred_proba > 0.5).astype(int)

# Calculate test-set metrics
accuracy = accuracy_score(y_test, y_pred2)
precision = precision_score(y_test, y_pred2)
recall = recall_score(y_test, y_pred2)
f1 = f1_score(y_test, y_pred2)
conf_mat = confusion_matrix(y_test, y_pred2)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_mat)

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 63)]              0         
                                                                 
 dense_12 (Dense)            (None, 64)                4096      
                                                                 
 batch_normalization_2 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_13 (Dense)            (None, 64)                4160      
                                                                 
 batch_normalization_3 (Bat  (None, 64)                256       
 chNormalization)                                          

### Model 3:

Model 3 deviates from the models above by using the 'tanh' activation function in the hidden layers and the 'rmsprop' optimizer instead of 'adam'. This made the model slightly worse, although its performance is still relatively close.

In [23]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialization and
# batch shuffling repeat across runs (seed 1 matches the train/test split).
# Some GPU kernels may still be nondeterministic.
keras.utils.set_random_seed(1)

# Model definition: two 32-unit tanh hidden layers feeding a single sigmoid
# unit that outputs one probability per row
inputs = keras.Input(shape=(X_train_preprocessed.shape[1],))
x = layers.Dense(32, activation='tanh')(inputs)
x = layers.Dense(32, activation='tanh')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model3 = keras.Model(inputs=inputs, outputs=outputs, name="model_3")

# Compilation: RMSprop optimizer here (the earlier models use Adam)
model3.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model3.summary()

# Fit the model; validation_split reserves 25% of the training rows for
# per-epoch validation metrics
history3 = model3.fit(X_train_preprocessed, y_train, batch_size=64, epochs=50, validation_split=0.25)

# Predict probabilities on the held-out test set and threshold at 0.5 to get
# 0/1 class labels
y_pred_proba = model3.predict(X_test_preprocessed)
y_pred3 = (y_pred_proba > 0.5).astype(int)

# Calculate test-set metrics
accuracy = accuracy_score(y_test, y_pred3)
precision = precision_score(y_test, y_pred3)
recall = recall_score(y_test, y_pred3)
f1 = f1_score(y_test, y_pred3)
conf_mat = confusion_matrix(y_test, y_pred3)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_mat)


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 63)]              0         
                                                                 
 dense_15 (Dense)            (None, 32)                2048      
                                                                 
 dense_16 (Dense)            (None, 32)                1056      
                                                                 
 dense_17 (Dense)            (None, 1)                 33        
                                                                 
Total params: 3137 (12.25 KB)
Trainable params: 3137 (12.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/5

### Model 4: XGBoost Classifier for comparison

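My best XGBoost Classifier model from a previous lab performs very similarly to the Neural Network models above, although with a slightly lower accuracy. It seems that XGBoost can get close to a Neural Network on this dataset, although our first 2 models above outperform the XGBoost model in terms of accuracy. Note that the XGBoost scores below are 5-fold cross-validation averages over the full dataset, whereas the Neural Networks were scored on a single 20% hold-out test split, so the comparison is approximate.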

In [20]:
# Preprocessing step for the XGBoost pipeline: one-hot encode category/object
# columns and standardize numeric columns.
# NOTE: this rebinds `ct`, replacing the transformer fitted in the
# preprocessing cell; re-run that cell before reusing `ct` for the networks.
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
         make_column_selector(dtype_include=['category','object'])),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ],
    remainder="passthrough"
)

# Pipeline setup with XGBoost classifier. Preprocessing sits inside the
# pipeline, so each cross-validation fold fits it on that fold's training rows.
# The deprecated `use_label_encoder` argument is dropped: current XGBoost
# releases ignore it and only warn that the parameter is unused.
xgb_pipeline = Pipeline([
    ("preprocessing", ct),
    ("xgboost", XGBClassifier(
        learning_rate=0.2,
        max_depth=4,
        n_estimators=200,
        eval_metric='logloss',
        random_state=1,
    ))
])

# Scorers to collect; zero_division=0 returns 0 instead of warning when a
# fold has no predicted (or no actual) positives
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0)
}

# 5-fold cross-validation over the FULL dataset (X, y), not just the training
# split, so these averages are not measured on the same hold-out rows as the
# neural networks' test metrics.
results = cross_validate(xgb_pipeline, X, y, cv=5, scoring=scoring)

# Display the mean of each metric across the folds
print("Accuracy: ", np.mean(results['test_accuracy']))
print("Precision: ", np.mean(results['test_precision']))
print("Recall: ", np.mean(results['test_recall']))
print("F1 Score: ", np.mean(results['test_f1']))

Accuracy:  0.7516552424435878
Precision:  0.73228841479165
Recall:  0.7933288940316137
F1 Score:  0.7615351078685288
