**This hybrid approach averages the predicted probabilities of two models: an XGBoost classifier and a Keras MLP**

In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from imblearn.over_sampling import SMOTE

In [22]:
# Read the processed diabetes CSV (one-hot encoded, per its file name) into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
data_filepath = '../data/processed/cleaned_diabetes_one_hot_encoding.csv'
df = pd.read_csv(filepath_or_buffer=data_filepath)

In [23]:
# Separate features and target
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [24]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [25]:
# Standardize the features for the MLP.
# The scaler's statistics are learned from the training split only and then
# applied unchanged to both splits, so the test data never influences them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
# XGBoost Model Training
# Hyperparameters are gathered in one mapping and unpacked into the classifier.
xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 329,
    'learning_rate': 0.04840129264965742,
    # Per the original author's note: raised from an original max_depth of 1
    # by adding 3, following their adjustment logic.
    'max_depth': 4,
    'min_child_weight': 1.0,
    'subsample': 0.9160065601766973,
    'colsample_bytree': 0.8647783037718892,
    'seed': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# The tree model is trained on the unscaled training features.
xgb_model.fit(X_train, y_train)

# Keep only the second probability column (the score for class 1).
xgb_test_proba = xgb_model.predict_proba(X_test)
y_pred_xgb = xgb_test_proba[:, 1]

In [28]:
# MLP Model Training
# Every other stochastic step in this notebook is seeded with 42; the MLP was
# not, so weight initialisation, dropout masks and batch shuffling changed on
# each run and the reported metrics could not be reproduced.
# set_random_seed seeds the Python, NumPy and TensorFlow generators in one
# call. Side effect: it reseeds the global `random` and `np.random` state.
# NOTE(review): bit-exact GPU runs may additionally need
# tf.config.experimental.enable_op_determinism() - confirm if required.
tf.keras.utils.set_random_seed(42)

# Two hidden ReLU layers (128 and 64 units), each followed by 50% dropout,
# and a single sigmoid unit that outputs a probability.
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# validation_split=0.2 holds the last 20% of the training rows out of the
# gradient updates; verbose=0 silences the per-epoch log.
mlp_model.fit(X_train_scaled, y_train, epochs=100, validation_split=0.2, batch_size=64, verbose=0)

# predict returns one column per output unit; flatten it to a 1-D vector.
y_pred_mlp = mlp_model.predict(X_test_scaled).ravel()



In [29]:
# Ensemble step: take the unweighted mean of the two models' probability scores,
# then label a sample 1 when the mean is strictly above 0.5, otherwise 0.
combined_pred_prob = 0.5 * (y_pred_xgb + y_pred_mlp)
above_threshold = combined_pred_prob > 0.5
combined_pred_final = np.where(above_threshold, 1, 0)

# Evaluation
# ROC AUC is computed from the averaged probabilities; the other metrics use
# the thresholded labels.
roc_auc_combined = roc_auc_score(y_test, combined_pred_prob)
accuracy_combined = accuracy_score(y_test, combined_pred_final)
conf_matrix_combined = confusion_matrix(y_test, combined_pred_final)
class_report_combined = classification_report(y_test, combined_pred_final)

# Displaying the combined model results
print(f"Combined Model Accuracy: {accuracy_combined}")
print(f"Combined Model ROC AUC: {roc_auc_combined}")
print("Combined Model Confusion Matrix:\n", conf_matrix_combined)
print("Combined Model Classification Report:\n", class_report_combined)

Combined Model Accuracy: 0.9733369650747533
Combined Model ROC AUC: 0.9969812659083899
Combined Model Confusion Matrix:
 [[16175   223]
 [  658 15986]]
Combined Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     16398
           1       0.99      0.96      0.97     16644

    accuracy                           0.97     33042
   macro avg       0.97      0.97      0.97     33042
weighted avg       0.97      0.97      0.97     33042



In [30]:
# Reference snapshot of the metrics printed by an earlier run, kept as a bare
# string literal. It computes nothing; the cell simply echoes the string's repr.
'''
Results from previous run:
Combined Model Accuracy: 0.9733369650747533
Combined Model ROC AUC: 0.9969812659083899
Combined Model Confusion Matrix:
 [[16175   223]
 [  658 15986]]
Combined Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     16398
           1       0.99      0.96      0.97     16644

    accuracy                           0.97     33042
   macro avg       0.97      0.97      0.97     33042
weighted avg       0.97      0.97      0.97     33042

'''

'\nResults from previous run:\nCombined Model Accuracy: 0.9733369650747533\nCombined Model ROC AUC: 0.9969812659083899\nCombined Model Confusion Matrix:\n [[16175   223]\n [  658 15986]]\nCombined Model Classification Report:\n               precision    recall  f1-score   support\n\n           0       0.96      0.99      0.97     16398\n           1       0.99      0.96      0.97     16644\n\n    accuracy                           0.97     33042\n   macro avg       0.97      0.97      0.97     33042\nweighted avg       0.97      0.97      0.97     33042\n\n'