## Diabetes Prediction App

In [None]:
# Libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_curve,
    auc,
    confusion_matrix,
)
from xgboost import XGBClassifier
import joblib
import plotly.graph_objects as go
from tqdm import tqdm

In [2]:
# Read the diabetes dataset from the project's data folder

DATASET_CSV = "../data/diabetes_prediction_dataset.csv"
data = pd.read_csv(DATASET_CSV)

# Preview the first rows to sanity-check the columns

data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
# Class distribution of the target column ("diabetes"): number of rows per label

data["diabetes"].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [5]:
# Keep only the rows whose age is at least 18

filter_age = data["age"].ge(18)

data = data.loc[filter_age]

# Age distribution after filtering

data["age"].describe()

count    82781.000000
mean        48.856682
std         18.014698
min         18.000000
25%         34.000000
50%         49.000000
75%         63.000000
max         80.000000
Name: age, dtype: float64

In [6]:
# Cast the age column to integer (the other columns keep their dtypes)

data = data.astype({"age": int})

In [7]:
# Inspect column names, non-null counts, dtypes and memory usage

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82781 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               82781 non-null  object 
 1   age                  82781 non-null  int64  
 2   hypertension         82781 non-null  int64  
 3   heart_disease        82781 non-null  int64  
 4   smoking_history      82781 non-null  object 
 5   bmi                  82781 non-null  float64
 6   HbA1c_level          82781 non-null  float64
 7   blood_glucose_level  82781 non-null  int64  
 8   diabetes             82781 non-null  int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 6.3+ MB


In [8]:
# Count missing values per column

data.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [9]:
# Count fully duplicated rows before cleaning

duplicate_count = data.duplicated().sum()
print(duplicate_count)

# Remove the duplicated rows (the first occurrence of each is kept)

data = data.drop_duplicates()

print("---Removed Duplicate---")
print(data.duplicated().sum())

3323
---Removed Duplicate---
0


In [10]:
# Encode the categorical columns as integer codes.
# One fitted encoder is kept per column so the same mapping can be reused later.

le_gender = LabelEncoder().fit(data["gender"])
le_smoking = LabelEncoder().fit(data["smoking_history"])
data["gender"] = le_gender.transform(data["gender"])
data["smoking_history"] = le_smoking.transform(data["smoking_history"])

In [11]:
# Preview the dataset after encoding the categorical columns

data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80,0,1,4,25.19,6.6,140,0
1,0,54,0,0,0,27.32,6.6,80,0
2,1,28,0,0,4,27.32,5.7,158,0
3,0,36,0,0,1,23.45,5.0,155,0
4,1,76,1,1,1,20.14,4.8,155,0


In [12]:
# Re-check row count and dtypes after de-duplication and encoding

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 79458 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               79458 non-null  int64  
 1   age                  79458 non-null  int64  
 2   hypertension         79458 non-null  int64  
 3   heart_disease        79458 non-null  int64  
 4   smoking_history      79458 non-null  int64  
 5   bmi                  79458 non-null  float64
 6   HbA1c_level          79458 non-null  float64
 7   blood_glucose_level  79458 non-null  int64  
 8   diabetes             79458 non-null  int64  
dtypes: float64(2), int64(7)
memory usage: 6.1 MB


In [13]:
# Separate the target column ("diabetes") from the predictor columns

y = data["diabetes"]
features = data.drop(columns="diabetes")

X = features

In [14]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Standardize the features; the scaling statistics are learned from the
# training split only and then applied unchanged to the test split

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Applying PCA: the projection is fitted on the scaled training split only,
# then the same projection is applied to the test split

pca = PCA(n_components=0.95)  # Keep enough components to retain 95% of the variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [17]:
# Initializing and training the model on the PCA-projected training split

model = XGBClassifier(random_state=42)
model.fit(X_train_pca, y_train)

# Making hard class predictions on the test set

y_pred = model.predict(X_test_pca)

# Evaluating the model: overall accuracy plus per-class precision/recall/F1
# (the per-class view matters because the target classes are unbalanced)

accuracy = accuracy_score(y_test, y_pred)
# Fixed label: the original f-string printed a stray comma ("Accuracy:, ...")
print(f"Accuracy: {accuracy:.4f}")
# No padding around the newline, so the report's header row keeps its alignment
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Accuracy:, 0.9566
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     14209
           1       0.90      0.67      0.77      1683

    accuracy                           0.96     15892
   macro avg       0.93      0.83      0.87     15892
weighted avg       0.95      0.96      0.95     15892



In [None]:
# Bootstrap resampling of the training set to estimate the mean test accuracy and its standard deviation


def bootstrapping_accuracy(
    X_train_pca,
    y_train,
    n_iterations=1000,
    X_eval=None,
    y_eval=None,
    sample_fraction=0.5,
    seed=42,
):
    """Estimate hold-out accuracy and its spread by bootstrapping the training set.

    Each iteration draws a sample (with replacement) from the training data,
    fits a fresh ``XGBClassifier`` on it and scores that model on a fixed
    evaluation set.

    Parameters
    ----------
    X_train_pca : array indexable by an integer array
        Training features.
    y_train : pandas Series (indexed positionally via ``.iloc``)
        Training labels aligned with ``X_train_pca``.
    n_iterations : int, default 1000
        Number of bootstrap resamples / model fits.
    X_eval, y_eval : optional
        Evaluation features and labels. When omitted, the notebook-level
        ``X_test_pca`` / ``y_test`` are used, as in the original version.
    sample_fraction : float, default 0.5
        Size of each resample as a fraction of the training set.
    seed : int, default 42
        Seed of the local random generator used to draw the resamples.

    Returns
    -------
    tuple
        ``(mean_accuracy, std_accuracy)`` over all iterations.

    Raises
    ------
    ValueError
        If the requested resample would contain no rows.
    """
    # Fall back to the notebook's hold-out split when no evaluation set is given
    if X_eval is None:
        X_eval = X_test_pca
    if y_eval is None:
        y_eval = y_test

    n_rows = len(X_train_pca)
    n_size = int(n_rows * sample_fraction)
    if n_size < 1:
        raise ValueError("bootstrap sample is empty: check the training data and sample_fraction")

    # Local generator: produces the same draws as seeding the global NumPy RNG,
    # but without changing random state for the rest of the notebook
    rng = np.random.RandomState(seed)
    accuracies = []

    for _ in tqdm(range(n_iterations)):
        # Row positions sampled with replacement
        indices = rng.randint(0, n_rows, n_size)
        X_sample = X_train_pca[indices]
        y_sample = y_train.iloc[indices]

        model_sample = XGBClassifier(random_state=42)
        model_sample.fit(X_sample, y_sample)

        y_pred_sample = model_sample.predict(X_eval)
        accuracies.append(accuracy_score(y_eval, y_pred_sample))

    return np.mean(accuracies), np.std(accuracies)


# Run the bootstrap with its default settings and report mean ± standard deviation
bootstrap_stats = bootstrapping_accuracy(X_train_pca, y_train)
mean_acc, std_acc = bootstrap_stats
print(f"Bootstrapped Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")

100%|██████████| 1000/1000 [04:33<00:00,  3.65it/s]

Bootstrapped Accuracy: 0.9548 ± 0.0009





In [24]:
# Bootstrap the test-set predictions to get a 95% confidence interval for the ROC AUC

y_pred_proba = model.predict_proba(X_test_pca)[:, 1]
n_bootstraps = 10000
rng_seed = 42  # fixed seed so the resampling is reproducible
rng = np.random.RandomState(rng_seed)
n_samples = len(y_pred_proba)
bootstrapped_scores = []

for i in range(n_bootstraps):
    # Resample test rows (positions) with replacement
    indices = rng.randint(0, n_samples, n_samples)
    y_true_sample = y_test.iloc[indices]
    # Only resamples that contain both classes are scored
    if np.unique(y_true_sample).size >= 2:
        sample_fpr, sample_tpr = roc_curve(y_true_sample, y_pred_proba[indices])[:2]
        score = auc(sample_fpr, sample_tpr)
        bootstrapped_scores.append(score)

bootstrapped_scores = np.array(bootstrapped_scores)
sorted_scores = np.sort(bootstrapped_scores)
n_scores = len(sorted_scores)

# Lower bound of the 95% confidence interval (2.5th percentile of the sorted scores)

confidence_lower = sorted_scores[int(0.025 * n_scores)]

# Upper bound of the 95% confidence interval (97.5th percentile of the sorted scores)

confidence_upper = sorted_scores[int(0.975 * n_scores)]

print(
    f"95% confidence interval for the AUC score: [{confidence_lower:.3f} - {confidence_upper:.3f}]"
)

95% confidence interval for the AUC score: [0.960 - 0.967]


In [18]:
# Predicted probability of the positive class for every test sample

y_pred_proba = model.predict_proba(X_test_pca)[:, 1]

# ROC points (false/true positive rate per threshold) and the area under the curve

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Build the two traces: the model's ROC curve and the chance diagonal

roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode="lines",
    name="ROC curve (AUC = %0.2f)" % roc_auc,
)
chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode="lines",
    name="Random Classifier",
    line={"dash": "dash"},
)

# Assemble and show the figure

fig = go.Figure(data=[roc_trace, chance_trace])
fig.update_layout(
    title="Receiver Operating Characteristic (ROC) Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    width=500,
    height=500,
    legend={"x": 0, "y": 0, "traceorder": "normal"},
)
fig.show()

# Confusion matrix of the hard predictions against the true test labels

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Heatmap of the confusion matrix (x: predicted class, y: true class)

heatmap = go.Heatmap(
    z=cm,
    x=["Previsto 0", "Previsto 1"],
    y=["Real 0", "Real 1"],
    colorscale="Blues",
    hoverongaps=False,
)
fig = go.Figure(data=heatmap)

# Write each count on its cell; cells above half the maximum count get white
# text, the others black

max_val = cm.max()
annotations = [
    dict(
        x=col_idx,
        y=row_idx,
        text=str(count),
        showarrow=False,
        font=dict(color="white" if count > max_val / 2 else "black"),
    )
    for row_idx, row in enumerate(cm)
    for col_idx, count in enumerate(row)
]

fig.update_layout(
    title="Matriz de Confusão",
    width=500,
    height=500,
    xaxis_title="Classe Prevista",
    yaxis_title="Classe Real",
    annotations=annotations,
)

fig.show()

[[14078   131]
 [  559  1124]]


In [19]:
# Save the names (and order) of the feature columns used for training

feature_names = X.columns.tolist()
joblib.dump(feature_names, "../model/feature_names.pkl")

['../model/feature_names.pkl']

In [20]:
# Persist the fitted model and every fitted preprocessing object

artifacts = [
    (model, "../model/diabetes_model.pkl"),
    (scaler, "../model/scaler.pkl"),
    (pca, "../model/pca.pkl"),
    (le_gender, "../model/gender_encoder.pkl"),
]
for artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)

# The last dump stays as the cell's final expression so its return value is still displayed
joblib.dump(le_smoking, "../model/smoking_encoder.pkl")

['../model/smoking_encoder.pkl']