In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [None]:
# Load dataset (replace with your actual path if needed)
data_path = 'diabetes_balanced.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [None]:
# Feature columns and target
X = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

In [None]:
feature_order = X.columns.tolist()
with open("feature_order.json", "w") as f:
    json.dump(feature_order, f)

In [None]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

Random Forest

In [None]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the hyperparameter grid to search over.
param_grid = {
    'n_estimators': [100, 200],         # Number of trees in the forest.
    'max_depth': [None, 10, 20],        # Maximum depth of the tree.
    'min_samples_split': [2, 5],        # Minimum number of samples required to split an internal node.
    'min_samples_leaf': [1, 2],         # Minimum number of samples required to be at a leaf node.
    'max_features': ['sqrt', 'log2']    # Number of features to consider when looking for the best split.
}

# Instantiate a base Random Forest classifier.
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV to find the best combination of hyperparameters using 3-fold cross-validation.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,                  # 3-fold cross-validation.
    scoring='accuracy',    # Use accuracy as the evaluation metric.
    verbose=2,             # Print detailed logs during training.
    n_jobs=-1              # Use all available CPU cores.
)

# Fit the model to the training data. This may take some time depending on the parameter grid size.
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters found by GridSearchCV.
print("Best Parameters:", grid_search.best_params_)

# Make predictions on the test set using the best estimator.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the tuned model using accuracy and classification report.
print("Tuned Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Tuned Accuracy: 0.7504774029280713
              precision    recall  f1-score   support

         0.0       0.78      0.71      0.74      7090
         1.0       0.73      0.80      0.76      7049

    accuracy                           0.75     14139
   macro avg       0.75      0.75      0.75     14139
weighted avg       0.75      0.75      0.75     14139



In [None]:
# Save model to file
joblib.dump(best_model, 'RandomForest_model.pkl')
print("Model saved to 'RandomForest_model.pkl'")

Model saved to 'RandomForest_model.pkl'


Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Train Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_lr = lr_model.predict(X_test)
print("🔹 Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

# Save model
joblib.dump(lr_model, "logistic_regression_model.pkl")
print("Saved as logistic_regression_model.pkl")

🔹 Logistic Regression Accuracy: 0.7484263384963576
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.74      7090
         1.0       0.74      0.77      0.75      7049

    accuracy                           0.75     14139
   macro avg       0.75      0.75      0.75     14139
weighted avg       0.75      0.75      0.75     14139

Saved as logistic_regression_model.pkl


K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import joblib

# Define hyperparameter grid for KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan']
}

# Perform grid search with 3-fold CV
grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit model
grid_knn.fit(X_train, y_train)

# Evaluate best model
print("🔹 Best KNN Parameters:", grid_knn.best_params_)
best_knn = grid_knn.best_estimator_
y_pred_knn = best_knn.predict(X_test)

# Print metrics
print("KNN Accuracy (tuned):", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

# Save model
joblib.dump(best_knn, "knn_model.pkl")
print("Saved as knn_model.pkl")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
🔹 Best KNN Parameters: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'uniform'}
KNN Accuracy (tuned): 0.7231770280783648
              precision    recall  f1-score   support

         0.0       0.74      0.69      0.71      7090
         1.0       0.71      0.75      0.73      7049

    accuracy                           0.72     14139
   macro avg       0.72      0.72      0.72     14139
weighted avg       0.72      0.72      0.72     14139

Saved as knn_model.pkl


XGB

In [None]:
from xgboost import XGBClassifier

# Define hyperparameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Perform grid search with 3-fold CV
grid_xgb = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid_xgb,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit model
grid_xgb.fit(X_train, y_train)

# Evaluate best model
print("🔹 Best XGBoost Parameters:", grid_xgb.best_params_)
best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Print metrics
print("XGBoost Accuracy (tuned):", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

# Save model
joblib.dump(best_xgb, "xgb_model.pkl")
print("Saved as xgb_model.pkl")

Fitting 3 folds for each of 48 candidates, totalling 144 fits


Parameters: { "use_label_encoder" } are not used.



🔹 Best XGBoost Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
XGBoost Accuracy (tuned): 0.7564891435037838
              precision    recall  f1-score   support

         0.0       0.78      0.71      0.75      7090
         1.0       0.73      0.80      0.77      7049

    accuracy                           0.76     14139
   macro avg       0.76      0.76      0.76     14139
weighted avg       0.76      0.76      0.76     14139

Saved as xgb_model.pkl


Currently, XGBoost performs better.

Add scaler.


In [12]:
from sklearn.metrics import roc_auc_score

# Define hyperparameter grid
param_grid_xgb = {
    'n_estimators': [100, 150],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.8, 0.9]
}

# Grid search with AUC as scoring
grid_xgb = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid_xgb,
    cv=3,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

grid_xgb.fit(X_train, y_train)

# Best model
best_xgb = grid_xgb.best_estimator_
print("🔹 Best XGBoost Parameters:", grid_xgb.best_params_)

# Predict probabilities
y_proba = best_xgb.predict_proba(X_test)[:, 1]

# Use custom threshold
threshold = 0.45  # <--- You can adjust this based on precision/recall tradeoff
y_pred_thresh = (y_proba >= threshold).astype(int)

# Evaluate
print(f"\n🔸 Custom Threshold = {threshold}")
print("Accuracy:", accuracy_score(y_test, y_pred_thresh))
print("AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred_thresh))

# Save model and scaler
joblib.dump(best_xgb, "xgb_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("✅ Saved model as xgb_model.pkl and scaler as scaler.pkl")

Fitting 3 folds for each of 48 candidates, totalling 144 fits


Parameters: { "use_label_encoder" } are not used.



🔹 Best XGBoost Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 150, 'subsample': 1.0}

🔸 Custom Threshold = 0.45
Accuracy: 0.7530235518777848
AUC: 0.8325549483256534
              precision    recall  f1-score   support

         0.0       0.81      0.67      0.73      7090
         1.0       0.71      0.84      0.77      7049

    accuracy                           0.75     14139
   macro avg       0.76      0.75      0.75     14139
weighted avg       0.76      0.75      0.75     14139

✅ Saved model as xgb_model.pkl and scaler as scaler.pkl
