### Read Dataset

In [2]:
import pandas as pd
df=pd.read_csv("diabetes_prediction_dataset.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
df.shape

(100000, 9)

### LabelEncoding for Categorical Varaibles

In [5]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['gender']=le.fit_transform(df['gender'])
print(le.inverse_transform([0,1,2]))
df['smoking_history']=le.fit_transform(df['smoking_history'])
print(le.inverse_transform([0,1,2,3,4,5]))

['Female' 'Male' 'Other']
['No Info' 'current' 'ever' 'former' 'never' 'not current']


### Identifying X and Y

In [7]:
x=df.iloc[:,0:8]
y=df.diabetes

### Variable Importance

In [9]:
from sklearn.tree import DecisionTreeClassifier
clf=DecisionTreeClassifier()
clf.fit(x,y)
for feat, importance in zip(df.columns, clf.feature_importances_):
    print(feat,"importance:",importance*100)

gender importance: 1.1416627754932012
age importance: 6.811544330892582
hypertension importance: 0.6095298214696825
heart_disease importance: 0.49796277522071253
smoking_history importance: 2.6338484228896655
bmi importance: 12.32533286143054
HbA1c_level importance: 49.080475052901484
blood_glucose_level importance: 26.899643959702125


### Train Test Split

In [11]:
from sklearn.model_selection import train_test_split
x=df[['gender', 'age', 'smoking_history','bmi', 'HbA1c_level', 'blood_glucose_level']]
y=df['diabetes']
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1,shuffle=True)

In [12]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(X_train,y_train)
print(lr.score(X_test,y_test))

0.9602


In [13]:
!pip install xgboost catboost lightgbm 

Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting lightgbm
  Using cached lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Using cached xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB 281.8 kB/s eta 0:06:01
   ---------------------------------------- 0.1/101.7 MB 853.3 kB/s eta 0:02:00
   ---------------------------------------- 0.6/101.7 MB 3.2 MB/s eta 0:00:32
   ---------------------------------------- 1.0/101

In [14]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Step 1: Define the base models
xgb = XGBClassifier()
catboost = CatBoostClassifier(silent=True)
lgbm = LGBMClassifier()

# Step 2: Define the meta-model (Logistic Regression)
meta_model = AdaBoostClassifier()

# Step 3: Create the stacking ensemble
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('catboost', catboost),
        ('lgbm', lgbm)
    ],
    final_estimator=meta_model,
    cv=5
)

# Step 4: Train the stacking model
stacking_model.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = stacking_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Stacking Model Accuracy: {accuracy:.4f}")


[LightGBM] [Info] Number of positive: 6799, number of negative: 73201
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001469 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 405
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.084987 -> initscore=-2.376434
[LightGBM] [Info] Start training from score -2.376434
[LightGBM] [Info] Number of positive: 5440, number of negative: 58560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 405
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085000 -> initscore=-2.376273
[LightGB

In [16]:

ypred=stacking_model.predict(X_test)
from sklearn.metrics import precision_score
precision_score(y_test,ypred)

1.0

In [18]:
from sklearn.metrics import roc_auc_score,classification_report
roc_auc_score(y_test,ypred)

0.8383303938859494

In [22]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99     18299
           1       1.00      0.68      0.81      1701

    accuracy                           0.97     20000
   macro avg       0.99      0.84      0.90     20000
weighted avg       0.97      0.97      0.97     20000

