In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold,GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler,OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix,roc_auc_score, roc_curve
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [7]:
train_df = pd.read_csv('train_diab.csv')
test_df = pd.read_csv('test_diab.csv')
sub = pd.read_csv('sample_submission.csv')

In [8]:
TARGET = "diagnosed_diabetes"
X = train_df.drop(columns=[TARGET])
y = train_df[TARGET]

test_ids = test_df["id"]

In [9]:
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)


X[cat_cols] = enc.fit_transform(X[cat_cols])
test_df[cat_cols] = enc.transform(test_df[cat_cols])

In [10]:
KF = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

oof_lgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
oof_xgb = np.zeros(len(X))

pred_lgb = np.zeros(len(test_df))
pred_cat = np.zeros(len(test_df))
pred_xgb = np.zeros(len(test_df))

In [11]:
for fold, (trn_idx, val_idx) in enumerate(KF.split(X, y)):
    print(f"\n===== FOLD {fold+1} / 5 =====")

    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    # LightGBM
    lgb = LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.02,
        num_leaves=64,
        colsample_bytree=0.8,
        subsample=0.8,
        random_state=42,
        class_weight="balanced"
    )
    lgb.fit(X_train, y_train)

    oof_lgb[val_idx] = lgb.predict_proba(X_valid)[:, 1]
    pred_lgb += lgb.predict_proba(test_df)[:, 1] / KF.n_splits

    # CatBoost
    cat = CatBoostClassifier(
    iterations=2000,
    depth=6,
    learning_rate=0.03,
    l2_leaf_reg=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=False)
    cat.fit(X_train, y_train)
    oof_cat[val_idx] = cat.predict_proba(X_valid)[:, 1]
    pred_cat += cat.predict_proba(test_df)[:, 1] / KF.n_splits

    # XGBoost
    xgb = XGBClassifier(
        n_estimators=2000,
        learning_rate=0.02,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="auc",
        random_state=42,
        tree_method="hist"
    )
    xgb.fit(X_train, y_train)

    oof_xgb[val_idx] = xgb.predict_proba(X_valid)[:, 1]
    pred_xgb += xgb.predict_proba(test_df)[:, 1] / KF.n_splits


===== FOLD 1 / 5 =====
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1895
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

===== FOLD 2 / 5 =====
[LightGBM] [Info] Number of positive: 349045, number of negative: 210955
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015719 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1896
[LightGBM] [Info] Number of data points in the train set: 

In [24]:
oof_blend = 0.4 * oof_lgb + 0.35 * oof_cat + 0.25 * oof_xgb
pred_blend = 0.4 * pred_lgb + 0.35 * pred_cat + 0.25 * pred_xgb

In [25]:
print("\nLightGBM ROC:", roc_auc_score(y, oof_lgb))
print("CatBoost ROC:", roc_auc_score(y, oof_cat))
print("XGBoost ROC:", roc_auc_score(y, oof_xgb))
print("Blended ROC:", roc_auc_score(y, oof_blend))


LightGBM ROC: 0.7271809204666989
CatBoost ROC: 0.725814100943837
XGBoost ROC: 0.7264871649112523
Blended ROC: 0.7273611194537963


In [26]:
# Stacking

stack_train = np.vstack([oof_lgb, oof_cat, oof_xgb]).T
stack_test  = np.vstack([pred_lgb, pred_cat, pred_xgb]).T

lvl2 = LogisticRegression(max_iter=2000)
lvl2.fit(stack_train, y)

pred_final = lvl2.predict_proba(stack_test)[:, 1]

print("\nFinal Stacked ROC:",
      roc_auc_score(y, lvl2.predict_proba(stack_train)[:, 1]))


Final Stacked ROC: 0.7274533389878863


In [15]:
submission = pd.DataFrame({
    "id": test_ids,
    "diagnosed_diabetes": pred_final
})

submission.to_csv("submission3013251626.csv", index=False)
print("submission.csv saved successfully!")


submission.csv saved successfully!
