The **preprocessing** steps (removing outliers and one-hot encoding categorical variables) have already been accomplished in the Random Forest Model Training so we'll just load in the cleaned dataset

In [1]:
# Making sure XGBoost is installed in the environment
!pip install xgboost

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.
Defaulting to user installation because normal site-packages is not writeable


In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Load the preprocessed and cleaned dataset
data_filepath = '../data/processed/cleaned_diabetes_one_hot_encoding.csv'
df = pd.read_csv(data_filepath)
print("\nThese are the number of people with and without diabetes")
print(df['diabetes'].value_counts())

    age  hypertension  heart_disease    bmi  HbA1c_level  blood_glucose_level  \
0  80.0             0              1  25.19          6.6                  140   
1  54.0             0              0  27.32          6.6                   80   
2  28.0             0              0  27.32          5.7                  158   
3  36.0             0              0  23.45          5.0                  155   
4  76.0             1              1  20.14          4.8                  155   

   diabetes  gender_Female  gender_Male  gender_Other  \
0         0              1            0             0   
1         0              1            0             0   
2         0              0            1             0   
3         0              1            0             0   
4         0              0            1             0   

   smoking_history_No Info  smoking_history_current  smoking_history_former  \
0                        0                        0                       0   
1           

In [6]:
print("This is the Data distribution of all variables in the dataset")
print(df.describe()) # Data distribution of variables

This is the Data distribution of all variables in the dataset
                age  hypertension  heart_disease           bmi   HbA1c_level  \
count  89237.000000  89237.000000   89237.000000  89237.000000  89237.000000   
mean      41.665855      0.069086       0.037484     26.357026      5.511976   
std       22.777219      0.253601       0.189947      4.901841      1.060805   
min        0.080000      0.000000       0.000000     14.710000      3.500000   
25%       23.000000      0.000000       0.000000     23.420000      4.800000   
50%       42.000000      0.000000       0.000000     27.320000      5.800000   
75%       60.000000      0.000000       0.000000     28.350000      6.200000   
max       80.000000      1.000000       1.000000     38.500000      9.000000   

       blood_glucose_level      diabetes  gender_Female   gender_Male  \
count         89237.000000  89237.000000   89237.000000  89237.000000   
mean            137.445096      0.074319       0.580062      0.419759  

In [8]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_history_No Info,smoking_history_current,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,0,1,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,0,1,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,0,1,0,0,0


## Model Building - Initial

In [9]:
# Separating the features (X) and the target variable (y)
X = df.drop('diabetes', axis=1)
y = df['diabetes']

In [11]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Converting the dataset into XGBoost's DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [12]:
# Specifying the XGBoost model parameters
params = {
    'objective': 'binary:logistic',  # Binary classification
    'eval_metric': 'logloss',  # Evaluation metric
    'max_depth': 6,  # Depth of the trees
    'eta': 0.3,  # Learning rate
    'seed': 42  # Random seed for reproducibility
}
num_rounds = 100  # Number of boosting rounds

# Training the XGBoost model
bst = xgb.train(params, dtrain, num_rounds)

In [13]:
# Making predictions on the test set
y_pred_prob = bst.predict(dtest)
y_pred = np.where(y_pred_prob > 0.5, 1, 0)  # Thresholding to get binary predictions

In [14]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)  # Use probabilities to compute ROC AUC
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Displaying the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.972658000896459
ROC AUC: 0.9749866407052354
Confusion Matrix:
[[16467    38]
 [  450   893]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     16505
           1       0.96      0.66      0.79      1343

    accuracy                           0.97     17848
   macro avg       0.97      0.83      0.89     17848
weighted avg       0.97      0.97      0.97     17848

