**This hybrid approach will average predictions from two models**

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from imblearn.over_sampling import SMOTE

In [2]:
# Load and preprocess dataset
data_filepath = '../data/processed/cleaned_diabetes_one_hot_encoding.csv'
df = pd.read_csv(data_filepath)

In [3]:
# Separate features and target
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Address class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [4]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [5]:
# Normalize the features for MLP
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# XGBoost Model Training
xgb_model = xgb.XGBClassifier(
    colsample_bytree=0.8647783037718892,
    learning_rate=0.04840129264965742,
    max_depth=4,  # Adjusted from original 'max_depth': 1, adding 3 as per adjustment logic
    min_child_weight=1.0,
    n_estimators=329,
    subsample=0.9160065601766973,
    objective='binary:logistic',
    seed=42
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict_proba(X_test)[:, 1]

In [7]:
# MLP Model Training
mlp_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
mlp_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
mlp_model.fit(X_train_scaled, y_train, epochs=100, validation_split=0.2, batch_size=64, verbose=0)
y_pred_mlp = mlp_model.predict(X_test_scaled).ravel()



In [29]:
# Combine predictions by averaging the probability scores
combined_pred_prob = (y_pred_xgb + y_pred_mlp) / 2
combined_pred_final = np.where(combined_pred_prob > 0.5, 1, 0)

# Evaluation
accuracy_combined = accuracy_score(y_test, combined_pred_final)
roc_auc_combined = roc_auc_score(y_test, combined_pred_prob)
conf_matrix_combined = confusion_matrix(y_test, combined_pred_final)
class_report_combined = classification_report(y_test, combined_pred_final)

# Displaying the combined model results
print(f"Combined Model Accuracy: {accuracy_combined}")
print(f"Combined Model ROC AUC: {roc_auc_combined}")
print("Combined Model Confusion Matrix:\n", conf_matrix_combined)
print("Combined Model Classification Report:\n", class_report_combined)

Combined Model Accuracy: 0.9733369650747533
Combined Model ROC AUC: 0.9969812659083899
Combined Model Confusion Matrix:
 [[16175   223]
 [  658 15986]]
Combined Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     16398
           1       0.99      0.96      0.97     16644

    accuracy                           0.97     33042
   macro avg       0.97      0.97      0.97     33042
weighted avg       0.97      0.97      0.97     33042



In [30]:
'''
Results from previous run:
Combined Model Accuracy: 0.9733369650747533
Combined Model ROC AUC: 0.9969812659083899
Combined Model Confusion Matrix:
 [[16175   223]
 [  658 15986]]
Combined Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     16398
           1       0.99      0.96      0.97     16644

    accuracy                           0.97     33042
   macro avg       0.97      0.97      0.97     33042
weighted avg       0.97      0.97      0.97     33042

'''

'\nResults from previous run:\nCombined Model Accuracy: 0.9733369650747533\nCombined Model ROC AUC: 0.9969812659083899\nCombined Model Confusion Matrix:\n [[16175   223]\n [  658 15986]]\nCombined Model Classification Report:\n               precision    recall  f1-score   support\n\n           0       0.96      0.99      0.97     16398\n           1       0.99      0.96      0.97     16644\n\n    accuracy                           0.97     33042\n   macro avg       0.97      0.97      0.97     33042\nweighted avg       0.97      0.97      0.97     33042\n\n'

**Stacking Hybrid Approach**

In [8]:
from sklearn.linear_model import LogisticRegression

# Generate predictions on the training set for stacking
y_pred_xgb_train = xgb_model.predict_proba(X_train)[:, 1]
y_pred_mlp_train = mlp_model.predict(X_train_scaled).ravel()

# Stack predictions to create a new training dataset for the meta-model
stacked_predictions_train = np.column_stack((y_pred_xgb_train, y_pred_mlp_train))

# Train the meta-model (Logistic Regression) on the stacked predictions
meta_model = LogisticRegression()
meta_model.fit(stacked_predictions_train, y_train)

# Generate stacked predictions on the test set
stacked_predictions_test = np.column_stack((y_pred_xgb, y_pred_mlp))

# Use the meta-model to make the final prediction
final_pred_prob = meta_model.predict_proba(stacked_predictions_test)[:, 1]
final_pred = np.where(final_pred_prob > 0.5, 1, 0)

# Evaluate the stacking ensemble model
accuracy_stacking = accuracy_score(y_test, final_pred)
roc_auc_stacking = roc_auc_score(y_test, final_pred_prob)
conf_matrix_stacking = confusion_matrix(y_test, final_pred)
class_report_stacking = classification_report(y_test, final_pred)

# Display the stacking ensemble model results
print(f"Stacking Ensemble Model Accuracy: {accuracy_stacking}")
print(f"Stacking Ensemble Model ROC AUC: {roc_auc_stacking}")
print("Stacking Ensemble Model Confusion Matrix:\n", conf_matrix_stacking)
print("Stacking Ensemble Model Classification Report:\n", class_report_stacking)

Stacking Ensemble Model Accuracy: 0.9763028872344289
Stacking Ensemble Model ROC AUC: 0.9974446458306605
Stacking Ensemble Model Confusion Matrix:
 [[16131   267]
 [  516 16128]]
Stacking Ensemble Model Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98     16398
           1       0.98      0.97      0.98     16644

    accuracy                           0.98     33042
   macro avg       0.98      0.98      0.98     33042
weighted avg       0.98      0.98      0.98     33042



**Weighted Averaging**

In [15]:
# Assign weights to predictions (assuming MLP is more reliable)
weight_xgb = 1
weight_mlp = 2  # Giving more weight to the MLP model's predictions

# Calculate weighted average of probability scores
weighted_avg_pred_prob = (y_pred_xgb * weight_xgb + y_pred_mlp * weight_mlp) / (weight_xgb + weight_mlp)
weighted_avg_pred_final = np.where(weighted_avg_pred_prob > 0.5, 1, 0)

# Evaluate the weighted averaging hybrid model
accuracy_weighted_avg = accuracy_score(y_test, weighted_avg_pred_final)
roc_auc_weighted_avg = roc_auc_score(y_test, weighted_avg_pred_prob)
conf_matrix_weighted_avg = confusion_matrix(y_test, weighted_avg_pred_final)
class_report_weighted_avg = classification_report(y_test, weighted_avg_pred_final)

# Display the weighted averaging hybrid model results
print(f"Weighted Averaging Hybrid Model Accuracy: {accuracy_weighted_avg}")
print(f"Weighted Averaging Hybrid Model ROC AUC: {roc_auc_weighted_avg}")
print("Weighted Averaging Hybrid Model Confusion Matrix:\n", conf_matrix_weighted_avg)
print("Weighted Averaging Hybrid Model Classification Report:\n", class_report_weighted_avg)

Weighted Averaging Hybrid Model Accuracy: 0.9673445917317354
Weighted Averaging Hybrid Model ROC AUC: 0.9962761631706424
Weighted Averaging Hybrid Model Confusion Matrix:
 [[16093   305]
 [  774 15870]]
Weighted Averaging Hybrid Model Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97     16398
           1       0.98      0.95      0.97     16644

    accuracy                           0.97     33042
   macro avg       0.97      0.97      0.97     33042
weighted avg       0.97      0.97      0.97     33042



**Feature Level Fusion**

In [16]:
# Generate XGBoost predictions on the training data for fusion
y_pred_xgb_train = xgb_model.predict_proba(X_train)[:, 1]

# Scale XGBoost predictions (feature scaling is important for neural networks)
scaler_xgb_pred = StandardScaler()
y_pred_xgb_train_scaled = scaler_xgb_pred.fit_transform(y_pred_xgb_train.reshape(-1, 1))

# Combine XGBoost predictions with original scaled features for the MLP training set
X_train_scaled_with_xgb = np.hstack((X_train_scaled, y_pred_xgb_train_scaled))

# Adjust the MLP model to accommodate the additional XGBoost prediction feature
mlp_model_with_xgb = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled_with_xgb.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
mlp_model_with_xgb.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the MLP model with the combined dataset
mlp_model_with_xgb.fit(X_train_scaled_with_xgb, y_train, epochs=100, validation_split=0.2, batch_size=64, verbose=0)

# Prepare the test data: Generate XGBoost predictions and combine with scaled features
y_pred_xgb_test = xgb_model.predict_proba(X_test)[:, 1]
y_pred_xgb_test_scaled = scaler_xgb_pred.transform(y_pred_xgb_test.reshape(-1, 1))
X_test_scaled_with_xgb = np.hstack((X_test_scaled, y_pred_xgb_test_scaled))

# Predict with the MLP model using the combined test data
y_pred_mlp_with_xgb = mlp_model_with_xgb.predict(X_test_scaled_with_xgb).ravel()

# Final evaluation
final_pred_with_xgb = np.where(y_pred_mlp_with_xgb > 0.5, 1, 0)
accuracy_with_xgb = accuracy_score(y_test, final_pred_with_xgb)
roc_auc_with_xgb = roc_auc_score(y_test, y_pred_mlp_with_xgb)
conf_matrix_with_xgb = confusion_matrix(y_test, final_pred_with_xgb)
class_report_with_xgb = classification_report(y_test, final_pred_with_xgb)

# Display the results
print(f"Feature-Level Fusion Model Accuracy: {accuracy_with_xgb}")
print(f"Feature-Level Fusion Model ROC AUC: {roc_auc_with_xgb}")
print("Feature-Level Fusion Model Confusion Matrix:\n", conf_matrix_with_xgb)
print("Feature-Level Fusion Model Classification Report:\n", class_report_with_xgb)

Feature-Level Fusion Model Accuracy: 0.979450396465105
Feature-Level Fusion Model ROC AUC: 0.9975672586140495
Feature-Level Fusion Model Confusion Matrix:
 [[16350    48]
 [  631 16013]]
Feature-Level Fusion Model Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     16398
           1       1.00      0.96      0.98     16644

    accuracy                           0.98     33042
   macro avg       0.98      0.98      0.98     33042
weighted avg       0.98      0.98      0.98     33042

