# Diabetes AI Model

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb
import joblib


# Pre processing of the data

In [2]:
df = pd.read_csv('data/diabetes_dataset.csv')

In [3]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [3]:
df['smoking_history'] = df['smoking_history'].replace("No Info", "unknown")


In [4]:
label_encoders = {}
categorical_cols = ['gender', 'smoking_history']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Very Important - save the encoder

In [20]:
joblib.dump(label_encoders, "diabetes_label_encoders.pkl")

['diabetes_label_encoders.pkl']

In [8]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,3,25.19,6.6,140,0
1,0,54.0,0,0,5,27.32,6.6,80,0
2,1,28.0,0,0,3,27.32,5.7,158,0
3,0,36.0,0,0,0,23.45,5.0,155,0
4,1,76.0,1,1,0,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,0,80.0,0,0,5,27.32,6.2,90,0
99996,0,2.0,0,0,5,17.37,6.5,100,0
99997,1,66.0,0,0,2,27.83,5.7,155,0
99998,0,24.0,0,0,3,35.42,4.0,100,0


In [8]:
scaler = StandardScaler()
numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Save the standard scaler

In [21]:
joblib.dump(scaler, "diabetes_scaler.pkl")

['diabetes_scaler.pkl']

In [10]:
df.to_csv('data/processed_diabetes_data.csv', index=False)

## saved the preprocessed data

In [11]:
df = pd.read_csv('data/processed_diabetes_data.csv')

In [25]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,1.692704,0,1,3,-0.321056,1.001706,0.047704,0
1,0,0.538006,0,0,5,-0.000116,1.001706,-1.426210,0
2,1,-0.616691,0,0,3,-0.000116,0.161108,0.489878,0
3,0,-0.261399,0,0,0,-0.583232,-0.492690,0.416183,0
4,1,1.515058,1,1,0,-1.081970,-0.679490,0.416183,0
...,...,...,...,...,...,...,...,...,...
99995,0,1.692704,0,0,5,-0.000116,0.628107,-1.180558,0
99996,0,-1.771388,0,0,5,-1.499343,0.908306,-0.934905,0
99997,1,1.070944,0,0,2,0.076729,0.161108,0.416183,0
99998,0,-0.794336,0,0,3,1.220361,-1.426688,-0.934905,0


In [26]:
label_encoders["smoking_history"].classes_

array(['current', 'ever', 'former', 'never', 'not current', 'unknown'],
      dtype=object)

In [12]:
X = df.drop(columns=['diabetes'])  # Features
y = df['diabetes']  # Target (0/1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# The Model - XGBoost

In [15]:
xgb_model = xgb.XGBClassifier(
    n_estimators=100,  # Number of trees
    learning_rate=0.05,  # Step size
    max_depth=6,  # Tree depth
    subsample=0.8,  # Prevent overfitting
    colsample_bytree=0.8,  # Feature selection
    eval_metric="logloss",  # Loss function
    use_label_encoder=False
)


In [16]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [17]:
# Predictions
y_pred = xgb_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print metrics
print(f"🔹 Accuracy: {accuracy:.4f}")
print(f"🔹 Precision: {precision:.4f}")
print(f"🔹 Recall: {recall:.4f}")
print(f"🔹 F1-score: {f1:.4f}")
print(f"🔹 AUC-ROC: {roc_auc:.4f}")

🔹 Accuracy: 0.9723
🔹 Precision: 0.9848
🔹 Recall: 0.6847
🔹 F1-score: 0.8078
🔹 AUC-ROC: 0.8419


## Satisfactory good results from XGBoost

## Saving the model

In [22]:
import joblib

# Save the model
joblib.dump(xgb_model, "diabetes_xgboost.pkl")

print("✅ Model saved successfully!")


✅ Model saved successfully!


In [23]:
# Load the model
xgb_model = joblib.load("diabetes_xgboost.pkl")

print("✅ Model loaded successfully!")


✅ Model loaded successfully!


## Trying out some other models

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define models
models = {
    "XGBoost": xgb.XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=8, 
                                 n_estimators=200, subsample=0.8, eval_metric="logloss"),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=8, random_state=42),
    "LightGBM": lgb.LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=8),
    "Logistic Regression": LogisticRegression(max_iter=500, solver="lbfgs")
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    results[name] = {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1-score": f1, "AUC-ROC": roc_auc}

# Display results
for model, metrics in results.items():
    print(f"\n🚀 **{model}**")
    print(f"🔹 Accuracy: {metrics['Accuracy']:.4f}")
    print(f"🔹 Precision: {metrics['Precision']:.4f}")
    print(f"🔹 Recall: {metrics['Recall']:.4f}")
    print(f"🔹 F1-score: {metrics['F1-score']:.4f}")
    print(f"🔹 AUC-ROC: {metrics['AUC-ROC']:.4f}")


[LightGBM] [Info] Number of positive: 6800, number of negative: 73200
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085000 -> initscore=-2.376273
[LightGBM] [Info] Start training from score -2.376273

🚀 **XGBoost**
🔹 Accuracy: 0.9705
🔹 Precision: 0.9405
🔹 Recall: 0.6971
🔹 F1-score: 0.8007
🔹 AUC-ROC: 0.8465

🚀 **Random Forest**
🔹 Accuracy: 0.9723
🔹 Precision: 1.0000
🔹 Recall: 0.6741
🔹 F1-score: 0.8053
🔹 AUC-ROC: 0.8371

🚀 **LightGBM**
🔹 Accuracy: 0.9713
🔹 Precision: 0.9548
🔹 Recall: 0.6953
🔹 F1-score: 0.8046
🔹 AUC-ROC: 0.8461

🚀 **Logistic Regression**
🔹 Accuracy: 0.9601
🔹 Precision: 0.8558
🔹 Recall: 0.6388
🔹 F1-score: 0.7316
🔹 AUC-ROC: 0