<a href="https://colab.research.google.com/github/Sornambal/Diabetes-Prediction-Challenge/blob/main/Diabetes_Prediction_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===================================
# IMPORTS
# ===================================
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

import lightgbm as lgb

# ===================================
# LOAD DATA
# ===================================
# Names of the label column and the row-identifier column in the CSV files.
TARGET = "diagnosed_diabetes"
ID_COL = "id"

train = pd.read_csv("/content/train (1).csv")
test = pd.read_csv("/content/test (1).csv")

# Rows with a missing label cannot be used for supervised training; drop them
# and rebuild a clean 0..n-1 index so later positional indexing is unambiguous.
train = train.dropna(subset=[TARGET])
train = train.reset_index(drop=True)

# Feature matrices exclude the identifier (and, for train, the label).
y = train[TARGET]
X = train.drop(columns=[TARGET, ID_COL])
X_test = test.drop(columns=[ID_COL])

# ===================================
# CATEGORICAL ENCODING (TOP METHOD)
# ===================================
# Columns that pandas stores with object dtype are treated as categorical.
cat_cols = list(X.select_dtypes(include="object").columns)

# Categories that were not seen during fit are mapped to -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Fit on train and test together so both share one category-to-code mapping.
encoder.fit(pd.concat([X[cat_cols], X_test[cat_cols]], axis=0))

# Replace the categorical columns with their ordinal codes in both frames.
for frame in (X, X_test):
    frame[cat_cols] = encoder.transform(frame[cat_cols])

# ===================================
# STRATIFIED CV
# ===================================
# Number of folds; higher = more stable LB.
N_SPLITS = 10
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=42,
)

# Per-fold validation AUCs and the running fold-averaged test prediction.
auc_scores = []
test_preds = np.zeros(X_test.shape[0])

# ===================================
# TRAIN
# ===================================
# For every stratified fold: fit a LightGBM classifier with early stopping on
# the held-out part, record its validation AUC, and add 1/N_SPLITS of its test
# probabilities to the fold-averaged test prediction.
#
# NOTE: the same validation fold is used both to pick the stopping iteration
# and to compute the reported AUC, so the CV score is slightly optimistic.
for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n🚀 Fold {fold+1}/{N_SPLITS}")

    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=5000,
        learning_rate=0.01,
        num_leaves=96,
        max_depth=-1,
        min_child_samples=50,
        subsample=0.85,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this, `subsample` above is silently ignored.
        subsample_freq=1,
        colsample_bytree=0.85,
        reg_alpha=0.3,
        reg_lambda=0.7,
        objective="binary",
        random_state=42,
        force_row_wise=True,
    )

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        # binary_logloss is evaluated alongside the requested AUC; restrict
        # early stopping to the first metric (AUC) so that a stalled logloss
        # cannot end training while AUC is still improving.
        callbacks=[lgb.early_stopping(300, first_metric_only=True)],
    )

    # Probability of the positive class on the held-out fold.
    val_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, val_pred)
    auc_scores.append(auc)

    print(f"Fold AUC: {auc:.5f}")

    # Each fold contributes an equal share to the averaged test prediction.
    test_preds += model.predict_proba(X_test)[:, 1] / N_SPLITS

print("\n✅ Mean CV AUC:", np.mean(auc_scores))

# ===================================
# SUBMISSION
# ===================================
# Pair each test identifier with its fold-averaged predicted probability.
# Column names come from the ID_COL / TARGET constants so the submission
# header cannot drift from the columns used when loading the data.
submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET: test_preds,
})

submission.to_csv("submission.csv", index=False)
print("🏁 submission.csv created")



🚀 Fold 1/10
[LightGBM] [Info] Number of positive: 392676, number of negative: 237324
[LightGBM] [Info] Total Bins 1643
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503559
[LightGBM] [Info] Start training from score 0.503559
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[4327]	valid_0's auc: 0.727816	valid_0's binary_logloss: 0.582205
Fold AUC: 0.72782

🚀 Fold 2/10
[LightGBM] [Info] Number of positive: 392676, number of negative: 237324
[LightGBM] [Info] Total Bins 1642
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623295 -> initscore=0.503559
[LightGBM] [Info] Start training from score 0.503559
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[3773]	valid_0's auc: 0.