In [1]:
from google.colab import drive
drive.mount('/content/drive/')
data_path = "/content/drive/MyDrive/Colab Notebooks/Diatebes_Prediction/"

Mounted at /content/drive/


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

df = pd.read_csv(data_path + "clean_data.csv")
df.head()

Unnamed: 0,Outcome,Glucose,BMI,Age,Pregnancies,DiabetesPedigreeFunction,BloodPressure
0,1,0.960533,0.237764,1.496973,0.66766,0.695034,-0.024713
1,0,-1.168393,-0.847429,-0.174926,-0.868097,-0.341721,-0.546843
2,1,2.14327,-1.35902,-0.086931,1.281963,0.864069,-0.720886
3,0,-1.033223,-0.614888,-1.054873,-0.868097,-1.03289,-0.546843
4,0,-0.120826,-1.002457,-0.262921,0.360509,-0.905174,0.14933


In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import numpy as np
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split

In [5]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Use SMOTE to oversample minority classes
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# View the balanced distribution of categories
print(y_res.value_counts())

X_train_res, X_test, y_train_res, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

print(y_train_res.value_counts())
print(y_test.value_counts())
print(X_train_res)

Outcome
1    449
0    449
Name: count, dtype: int64
Outcome
0    359
1    359
Name: count, dtype: int64
Outcome
0    90
1    90
Name: count, dtype: int64
      Glucose       BMI       Age  Pregnancies  DiabetesPedigreeFunction  \
350  0.791571 -0.909440 -1.054873    -0.868097                 -0.698574   
743  0.279759  2.313475 -0.294529    -1.070878                  0.163202   
456 -1.438732  0.191255 -0.790889    -0.868097                  3.038999   
82   1.467420  1.074912  0.001063    -0.868097                  2.930065   
730 -0.822020 -0.541250  0.826014     2.593755                  1.349189   
..        ...       ...       ...          ...                       ...   
335  0.386061  0.346283 -0.966878    -1.175249                 -0.923956   
487 -0.931845 -0.010281  1.144994     0.667660                 -1.340911   
863  0.977922 -0.923401  1.398685     1.819030                 -0.754063   
234 -0.391166 -1.048965 -0.966878    -0.560946                  1.649148   
61  -1.945

In [6]:
# Model Selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier, log_evaluation
from lightgbm.basic import LightGBMError
from sklearn.metrics import roc_auc_score, make_scorer

In [8]:
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, roc_auc_score,
    RocCurveDisplay
)
import matplotlib.pyplot as plt

def evaluate_model(model, X, y):
  # Evaluate the model and return the indicator dictionary
  y_pred = model.predict(X)
  y_proba = model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X)

  return {
    "Accuracy": accuracy_score(y, y_pred),
    "Precision": precision_score(y, y_pred),
    "Recall": recall_score(y, y_pred),
    "F1": f1_score(y, y_pred),
    "AUC-ROC": roc_auc_score(y, y_proba)
  }

In [9]:
lr = LogisticRegression(
    max_iter = 100,
    C = 0.01,
    solver = "lbfgs",
    class_weight = "balanced",
    random_state = 42
)
lr.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lr, X_train_res, y_train_res)
test_metrics = evaluate_model(lr, X_test, y_test)

print("Logistic Regression")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

#RandomForest
rf = RandomForestClassifier(
    n_estimators = 50,
    max_depth = 5,
    min_samples_leaf = 10,
    min_samples_split = 10,
    class_weight = "balanced",
    random_state = 42
)
rf.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(rf, X_train_res, y_train_res)
test_metrics = evaluate_model(rf, X_test, y_test)

print("Random Forest")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

#XGBoost
xgb = XGBClassifier(
    colsample_bytree = 0.9,
    min_child_weight = 5,
    learning_rate = 0.1,
    max_depth = 15,
    n_estimators = 25,
    subsample = 0.8,
    eval_metric = "logloss",
)
xgb.fit(X_train_res, y_train_res,)

train_metrics = evaluate_model(xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(xgb, X_test, y_test)

print("XGBoost")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

#LightGBM
lgb = LGBMClassifier(
    colsample_bytree=1.0,
    min_child_samples=15,
    reg_alpha=0.2,
    reg_lambda=0.3,
    subsample=0.7,
    num_leaves=18,
    learning_rate=0.05,
    n_estimators=500
)
lgb.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lgb, X_train_res, y_train_res)
test_metrics = evaluate_model(lgb, X_test, y_test)

print("LightGBM")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Logistic Regression
Train Metrics: {'Accuracy': 0.7479108635097493, 'Precision': 0.7587209302325582, 'Recall': 0.7270194986072424, 'F1': 0.7425320056899004, 'AUC-ROC': np.float64(0.8486122857519728)}
Test Metrics: {'Accuracy': 0.7722222222222223, 'Precision': 0.7951807228915663, 'Recall': 0.7333333333333333, 'F1': 0.7630057803468208, 'AUC-ROC': np.float64(0.8702469135802469)}
Random Forest
Train Metrics: {'Accuracy': 0.8426183844011143, 'Precision': 0.8186528497409327, 'Recall': 0.8802228412256268, 'F1': 0.8483221476510067, 'AUC-ROC': np.float64(0.9244341679533834)}
Test Metrics: {'Accuracy': 0.85, 'Precision': 0.8539325842696629, 'Recall': 0.8444444444444444, 'F1': 0.8491620111731844, 'AUC-ROC': np.float64(0.9135802469135803)}
XGBoost
Train Metrics: {'Accuracy': 0.883008356545961, 'Precision': 0.8608923884514436, 'Recall': 0.9136490250696379, 'F1': 0.8864864864864865, 'AUC-ROC': np.float64(0.9522815620611261)}
Test Metrics: {'Accuracy': 0.8555555555555555, 'Precision': 0.8636363636363

In [10]:
# Hyperparameter tuning (RandomForest)

# Define parameter network
param_grid_rf = {
    "n_estimators" : [20, 50],
    "max_depth" : [5, 10],
    "min_samples_split" : [5, 10],
    "min_samples_leaf" : [5, 10]
}


rf_v = RandomForestClassifier(class_weight = "balanced", random_state = 42)

# Grid Search
grid_search_rf = GridSearchCV(
    estimator = rf_v,
    param_grid = param_grid_rf,
    cv = 10,
    scoring = "accuracy"
)
grid_search_rf.fit(X_train_res, y_train_res)

# Output optimal parameters and scores
print("Best Parameters(lr):",grid_search_rf.best_params_)

rf = grid_search_rf.best_estimator_

train_metrics = evaluate_model(rf, X_train_res, y_train_res)
test_metrics = evaluate_model(rf, X_test, y_test)

print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Best Parameters(lr): {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 20}
Train Metrics: {'Accuracy': 0.8955431754874652, 'Precision': 0.8717277486910995, 'Recall': 0.9275766016713092, 'F1': 0.8987854251012146, 'AUC-ROC': np.float64(0.9720827740318589)}
Test Metrics: {'Accuracy': 0.8555555555555555, 'Precision': 0.8478260869565217, 'Recall': 0.8666666666666667, 'F1': 0.8571428571428571, 'AUC-ROC': np.float64(0.9293827160493827)}


In [11]:
# Define a meta model (using LogisticRegression)
meta_model = LogisticRegression(random_state=42)

# Define Stacking Ensemble Model
rflgb_st = StackingClassifier(
    estimators=[('rf', rf), ('lgb', lgb)],
    final_estimator=meta_model,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

rflgb_st.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(rflgb_st, X_train_res, y_train_res)
test_metrics = evaluate_model(rflgb_st, X_test, y_test)

print("Stacking(meta:LR): Random Forest + LightGBM")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

lrxgb_st = StackingClassifier(
    estimators=[('lr', lr), ('xgb', xgb)],
    final_estimator=meta_model,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

lrxgb_st.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lrxgb_st, X_train_res, y_train_res)
test_metrics = evaluate_model(lrxgb_st, X_test, y_test)

print("Stacking(meta:LR): Logistic Regression + XGBoost")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Stacking(meta:LR): Random Forest + LightGBM
Train Metrics: {'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1': 1.0, 'AUC-ROC': np.float64(1.0)}
Test Metrics: {'Accuracy': 0.8888888888888888, 'Precision': 0.8977272727272727, 'Recall': 0.8777777777777778, 'F1': 0.8876404494382022, 'AUC-ROC': np.float64(0.9406172839506173)}
Stacking(meta:LR): Logistic Regression + XGBoost
Train Metrics: {'Accuracy': 0.8649025069637883, 'Precision': 0.8502673796791443, 'Recall': 0.8857938718662952, 'F1': 0.8676671214188267, 'AUC-ROC': np.float64(0.934955501586735)}
Test Metrics: {'Accuracy': 0.8444444444444444, 'Precision': 0.8604651162790697, 'Recall': 0.8222222222222222, 'F1': 0.8409090909090909, 'AUC-ROC': np.float64(0.9187654320987655)}


In [12]:
# Define meta model (using RandomForest)
meta_model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=5,
    min_samples_split=10,
    random_state=42
)

rflgb_st_rf = StackingClassifier(
    estimators=[('rf', rf), ('lgb', lgb)],
    final_estimator=meta_model_rf,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

rflgb_st_rf.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(rflgb_st_rf, X_train_res, y_train_res)
test_metrics = evaluate_model(rflgb_st_rf, X_test, y_test)

print("Stacking(meta:RF): Random Forest + LightGBM")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

lrxgb_st_rf = StackingClassifier(
    estimators=[('lr', lr), ('xgb', xgb)],
    final_estimator=meta_model_rf,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1
)

lrxgb_st_rf.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(lrxgb_st_rf, X_train_res, y_train_res)
test_metrics = evaluate_model(lrxgb_st_rf, X_test, y_test)

print("Stacking(meta:RF): Logistic Regression + XGBoost")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Stacking(meta:RF): Random Forest + LightGBM
Train Metrics: {'Accuracy': 0.9944289693593314, 'Precision': 1.0, 'Recall': 0.9888579387186629, 'F1': 0.9943977591036415, 'AUC-ROC': np.float64(0.9999999999999999)}
Test Metrics: {'Accuracy': 0.8777777777777778, 'Precision': 0.9047619047619048, 'Recall': 0.8444444444444444, 'F1': 0.8735632183908046, 'AUC-ROC': np.float64(0.9303086419753086)}
Stacking(meta:RF): Logistic Regression + XGBoost
Train Metrics: {'Accuracy': 0.8537604456824512, 'Precision': 0.8342105263157895, 'Recall': 0.883008356545961, 'F1': 0.857916102841678, 'AUC-ROC': np.float64(0.933135993668578)}
Test Metrics: {'Accuracy': 0.8611111111111112, 'Precision': 0.8651685393258427, 'Recall': 0.8555555555555555, 'F1': 0.8603351955307262, 'AUC-ROC': np.float64(0.9207407407407406)}


In [13]:
from sklearn.ensemble import VotingClassifier

# Voting
en_lr_rf_xgb = VotingClassifier(
    estimators = [
        ("lr", lr),
        ("rf", rf),
        ("xgb", xgb)
    ],
    voting = "soft"
)

en_lr_xgb = VotingClassifier(
    estimators = [
        ("lf", lr),
        ("rf", rf)
    ],
    voting = "soft"
)

en_lr_rf_xgb.fit(X_train_res, y_train_res)
en_lr_xgb.fit(X_train_res, y_train_res)

train_metrics = evaluate_model(en_lr_rf_xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(en_lr_rf_xgb, X_test, y_test)

print("Voting: Logistic Regression + Random Forest + XGBoost")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

train_metrics = evaluate_model(en_lr_xgb, X_train_res, y_train_res)
test_metrics = evaluate_model(en_lr_xgb, X_test, y_test)

print("Voting:Logistic Regression + XGBoost")
print("Train Metrics:", train_metrics)
print("Test Metrics:", test_metrics)

Voting: Logistic Regression + Random Forest + XGBoost
Train Metrics: {'Accuracy': 0.871866295264624, 'Precision': 0.8485639686684073, 'Recall': 0.9052924791086351, 'F1': 0.876010781671159, 'AUC-ROC': np.float64(0.9490382601004027)}
Test Metrics: {'Accuracy': 0.8555555555555555, 'Precision': 0.8636363636363636, 'Recall': 0.8444444444444444, 'F1': 0.8539325842696629, 'AUC-ROC': np.float64(0.9245679012345679)}
Voting:Logistic Regression + XGBoost
Train Metrics: {'Accuracy': 0.8635097493036211, 'Precision': 0.8407310704960835, 'Recall': 0.8969359331476323, 'F1': 0.8679245283018868, 'AUC-ROC': np.float64(0.9416283238025777)}
Test Metrics: {'Accuracy': 0.85, 'Precision': 0.8387096774193549, 'Recall': 0.8666666666666667, 'F1': 0.8524590163934426, 'AUC-ROC': np.float64(0.9197530864197531)}


In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [15]:
# Definition model
nn = Sequential()

# Input layer, two hidden layers, and output layer
nn.add(Dense(units=64, activation='relu', input_shape=(X_train_res.shape[1],)))
nn.add(Dense(units=32, activation='relu'))
nn.add(Dense(units=1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.01)

# Compilation Model
nn.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Print Model Summary
nn.summary()

# Training model
history = nn.fit(X_train_res, y_train_res, epochs=50, batch_size=32, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.6929 - loss: 0.5889 - val_accuracy: 0.7431 - val_loss: 0.5050
Epoch 2/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7925 - loss: 0.4440 - val_accuracy: 0.7569 - val_loss: 0.4873
Epoch 3/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.7800 - loss: 0.4415 - val_accuracy: 0.7639 - val_loss: 0.4868
Epoch 4/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7840 - loss: 0.4514 - val_accuracy: 0.7917 - val_loss: 0.4860
Epoch 5/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8117 - loss: 0.4098 - val_accuracy: 0.7917 - val_loss: 0.5060
Epoch 6/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8023 - loss: 0.4317 - val_accuracy: 0.7708 - val_loss: 0.4738
Epoch 7/50
[1m18/18[0m [32m━━━━━

In [17]:
# Evaluate the model on the test set
loss, accuracy = nn.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Predictive testing set
y_pred = nn.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8336 - loss: 0.5174
Test Loss: 0.5164456963539124
Test Accuracy: 0.8333333134651184
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.81      0.83        90
           1       0.82      0.86      0.84        90

    accuracy                           0.83       180
   macro avg       0.83      0.83      0.83       180
weighted avg       0.83      0.83      0.83       180

Confusion Matrix:
[[73 17]
 [13 77]]


In [18]:
# Evaluate all models
models = {
    "Logistic Regression": lr,
    "Random Forest": rf,
    "XGBoost": xgb,
    "LightGBM":lgb,
    #"Neural Network": nn,
    "L_R_X_Voting": en_lr_rf_xgb,
    "L_X_Voting" : en_lr_xgb,
    "Stacking(LR)_RF_LGB": rflgb_st,
    "Stacking(LR)_LR_XGB": lrxgb_st,
    "Stacking(RF)_RF_LGB" : rflgb_st_rf,
    "Stacking(RF)_LR_XGB": lrxgb_st_rf
}

results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_test, y_test)

pd.DataFrame(results).T.round(3)

Unnamed: 0,Accuracy,Precision,Recall,F1,AUC-ROC
Logistic Regression,0.772,0.795,0.733,0.763,0.87
Random Forest,0.856,0.848,0.867,0.857,0.929
XGBoost,0.856,0.864,0.844,0.854,0.927
LightGBM,0.889,0.889,0.889,0.889,0.942
L_R_X_Voting,0.856,0.864,0.844,0.854,0.925
L_X_Voting,0.85,0.839,0.867,0.852,0.92
Stacking(LR)_RF_LGB,0.889,0.898,0.878,0.888,0.941
Stacking(LR)_LR_XGB,0.844,0.86,0.822,0.841,0.919
Stacking(RF)_RF_LGB,0.878,0.905,0.844,0.874,0.93
Stacking(RF)_LR_XGB,0.861,0.865,0.856,0.86,0.921


In [19]:
import joblib
from google.colab import files

joblib.dump(rf,"rf_model.pkl")
files.download("rf_model.pkl")

joblib.dump(lgb,"lgb_model.pkl")
files.download("lgb_model.pkl")

joblib.dump(rflgb_st,"stacking_model.pkl")
files.download("stacking_model.pkl")

joblib.dump(en_lr_rf_xgb,"voting_model.pkl")
files.download("voting_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>