<a href="https://colab.research.google.com/github/Vivek-ML001/Diabetes-Prediction-Challenge-Playground-Series-S5E12/blob/main/diabetes_prediction_challenge_s5e12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diabetes Prediction Challenge — Playground Series S5E12

## 🎯 Goal:
In this notebook, we explore the dataset provided in the Diabetes Prediction Challenge and build machine learning models to predict whether a patient is likely to be diagnosed with diabetes.  
The focus is on clean preprocessing, proper validation, and improving performance using tree-based models.


## Import Libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")


  if entities is not ():


## Load the Data

In [None]:
# Read the competition CSVs from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/playground-series-s5e12/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e12/test.csv")

# Show (rows, columns) for the training frame, then for the test frame.
print(train.shape, test.shape, sep="\n")


(700000, 26)
(300000, 25)


## Understand the Columns

In [None]:
# Preview the first five training rows (features plus the diagnosed_diabetes target).
train.head()


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


In [None]:
# Preview the first five test rows; the test frame has no diagnosed_diabetes column.
test.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,...,111,Female,White,Highschool,Middle,Former,Employed,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,...,145,Female,White,Highschool,Middle,Never,Unemployed,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,...,184,Male,White,Highschool,Low,Never,Employed,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,...,128,Male,White,Graduate,Middle,Former,Employed,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,...,133,Male,White,Graduate,Low,Current,Unemployed,0,0,0


## Separate Features and Target

In [None]:
# Build the modelling inputs from the raw frames: `y` is the target column,
# `X` is every training column except the target and `id`, and `X_test` is
# the test frame without `id`.
y = train["diagnosed_diabetes"]
X = train.drop(["id", "diagnosed_diabetes"], axis=1)

X_test = test.drop("id", axis=1)



In [None]:
# Label-encode every object-dtype feature so the model receives numeric input.
# LabelEncoder is already imported in the imports cell at the top of the notebook.
cat_cols = X.select_dtypes(include="object").columns
print("Categorical columns:", cat_cols)

for col in cat_cols:
    train_values = X[col].astype(str)
    test_values = X_test[col].astype(str)

    # Fit on the union of train and test labels: LabelEncoder.transform raises
    # ValueError for a label it has not seen, so fitting on train alone would
    # crash if the test set contained a category absent from train. Classes are
    # sorted, so when the test labels are a subset of the train labels the
    # resulting codes are the same as fitting on train only.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], ignore_index=True))

    X[col] = le.transform(train_values)
    X_test[col] = le.transform(test_values)


Categorical columns: Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'smoking_status', 'employment_status'],
      dtype='object')


In [None]:
# List the dtype of every feature column to confirm the encoded columns are now numeric.
X.dtypes


age                                     int64
alcohol_consumption_per_week            int64
physical_activity_minutes_per_week      int64
diet_score                            float64
sleep_hours_per_day                   float64
screen_time_hours_per_day             float64
bmi                                   float64
waist_to_hip_ratio                    float64
systolic_bp                             int64
diastolic_bp                            int64
heart_rate                              int64
cholesterol_total                       int64
hdl_cholesterol                         int64
ldl_cholesterol                         int64
triglycerides                           int64
gender                                  int64
ethnicity                               int64
education_level                         int64
income_level                            int64
smoking_status                          int64
employment_status                       int64
family_history_diabetes           

## Verify Categorical Features Are Encoded

In [None]:
# Re-scan the feature matrix for object-dtype columns; an empty Index here
# means no string-typed feature is left in X.
cat_cols = X.select_dtypes(include=["object"]).columns
cat_cols


Index([], dtype='object')

## Set Cross-Validation Strategy

In [None]:
# Five shuffled, stratified folds; the fixed seed keeps the splits reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


## Train LightGBM with Cross-Validation

In [None]:
# Cross-validate LightGBM with early stopping.
#
# Each fold trains on four fifths of the data and monitors AUC on the held-out
# fifth. Previously the validation set was passed to `fit` but never used:
# every fold ran all 2500 rounds and was scored at the final round. The
# early-stopping callback now halts a fold once validation AUC stops improving,
# and the best round of each fold is recorded.
#
# NOTE: the full-data model cell further down still trains a fixed 2500 rounds;
# compare that against the best iterations reported here when choosing its
# n_estimators.

# Patience: stop a fold after this many consecutive rounds without AUC improvement.
EARLY_STOPPING_ROUNDS = 100

lgb_auc = []          # validation AUC of each fold
lgb_best_iters = []   # boosting round selected by early stopping in each fold

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=2500,  # upper bound; early stopping decides the actual count
        learning_rate=0.03,
        num_leaves=64,
        # NOTE(review): LightGBM appears to apply row subsampling only when
        # subsample_freq > 0 (default 0), so this setting is likely inert — confirm.
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary",
        random_state=42 + fold,  # a different seed per fold
        verbosity=-1
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[
            lgb.early_stopping(
                stopping_rounds=EARLY_STOPPING_ROUNDS,
                # Stop on AUC only, not on the objective's default metric.
                first_metric_only=True,
                verbose=False,
            )
        ],
    )

    # With early stopping active, prediction defaults to the best iteration.
    preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds)

    lgb_auc.append(auc)
    lgb_best_iters.append(model.best_iteration_)
    print(f"Fold {fold+1} AUC: {auc:.5f}")

print("Mean CV AUC:", np.mean(lgb_auc))
print("Best iteration per fold:", lgb_best_iters)


Fold 1 AUC: 0.72702
Fold 2 AUC: 0.72541
Fold 3 AUC: 0.72667
Fold 4 AUC: 0.72728
Fold 5 AUC: 0.72780
Mean CV AUC: 0.7268351395804421


## Train Final Model on Full Data

In [None]:
# Fit one LightGBM classifier on the complete training set with fixed
# hyperparameters and seed 42.
final_params = {
    "objective": "binary",
    "n_estimators": 2500,
    "learning_rate": 0.03,
    "num_leaves": 64,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "verbosity": -1,
}
final_model = lgb.LGBMClassifier(**final_params)

final_model.fit(X, y)


## Predict on Test Data

In [None]:
# Score the test features, then keep the probability column at index 1
# (presumably the positive, diabetes = 1, class — verify against final_model.classes_).
test_proba = final_model.predict_proba(X_test)
test_preds = test_proba[:, 1]


## Create the Submission File

In [None]:
# Pair each test id with its predicted probability, then write the table to
# submission.csv without the DataFrame index.
submission = test[["id"]].assign(diagnosed_diabetes=test_preds)

submission.to_csv("submission.csv", index=False)


In [None]:
# Spot-check the first five rows of the submission table.
submission.head()

Unnamed: 0,id,diagnosed_diabetes
0,700000,0.481989
1,700001,0.677276
2,700002,0.78833
3,700003,0.362906
4,700004,0.90738


In [None]:
# Spot-check the last five rows of the submission table.
submission.tail()

Unnamed: 0,id,diagnosed_diabetes
299995,999995,0.732859
299996,999996,0.612048
299997,999997,0.577998
299998,999998,0.650334
299999,999999,0.629441
