<a href="https://www.kaggle.com/code/sutariyasmit01/playground-s6-ep2?scriptVersionId=296509191" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the read-only Kaggle input
# tree, then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import lightgbm as lgb
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the competition's train and test tables into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s6e2/train.csv",
        "/kaggle/input/playground-series-s6e2/test.csv",
    )
)


In [None]:
# Encode the string target as 0/1. Series.map turns every label that is not a
# key of the mapping into NaN, so the result is checked right away.
label_to_int = {
    "Presence": 1,
    "Absence": 0
}
y = train_df["Heart Disease"].map(label_to_int)

# Fail fast with a readable message instead of letting NaN targets surface
# later as an obscure error inside the split or the model fit.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(
        train_df.loc[unmapped_rows, "Heart Disease"].astype(str).unique()
    )
    raise ValueError(
        f"Unexpected 'Heart Disease' labels (expected {sorted(label_to_int)}): "
        f"{unexpected_labels}"
    )

# Feature matrices: everything except the row identifier (and, for the
# training table, the target column).
X = train_df.drop(columns=["id", "Heart Disease"])
X_test = test_df.drop(columns=["id"])


In [None]:
def feature_engineering(df):
    """Return a new frame made of ``df`` plus five derived columns.

    Columns appended, in this order:
      * ``Age_sq``      - ``Age`` squared
      * ``Chol_BP``     - ``Cholesterol`` times ``BP``
      * ``Chol_by_Age`` - ``Cholesterol`` divided by ``Age + 1``
      * ``BP_by_HR``    - ``BP`` divided by ``Max HR + 1``
      * ``High_Risk``   - 1 where ``Age > 55``, ``Cholesterol > 240`` and
        ``BP > 140`` all hold, otherwise 0

    The frame passed in is left untouched.
    """
    age, chol, bp = df["Age"], df["Cholesterol"], df["BP"]

    # All three strict thresholds must be exceeded for the flag to be set.
    all_thresholds_exceeded = (age > 55) & (chol > 240) & (bp > 140)

    # assign() works on a copy and appends the columns in keyword order.
    return df.assign(
        Age_sq=age ** 2,
        Chol_BP=chol * bp,
        Chol_by_Age=chol / (age + 1),
        BP_by_HR=bp / (df["Max HR"] + 1),
        High_Risk=all_thresholds_exceeded.astype(int),
    )


In [None]:
# Run the same derived-feature step over the training and the test features.
X_fe, X_test_fe = (feature_engineering(frame) for frame in (X, X_test))


In [None]:
# Hold out 15% of the rows for validation, stratified on the target; the fixed
# random_state keeps the split repeatable.
holdout_split = train_test_split(
    X_fe, y, test_size=0.15, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = holdout_split


In [1]:
# LightGBM training parameters for the binary target, evaluated with AUC.
lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,

    # Tree shape / leaf-size regularisation.
    "num_leaves": 31,
    "max_depth": 10,
    "min_data_in_leaf": 50,

    # Row subsampling. LightGBM only applies bagging_fraction when
    # bagging_freq is non-zero, so the frequency is set explicitly
    # (re-sample the rows at every iteration).
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    # Column subsampling per tree.
    "feature_fraction": 0.8,

    "verbosity": -1,
    "seed": 42
}


In [None]:
# Wrap the split frames in LightGBM Datasets. The validation set names the
# training set as its reference so that it is binned with the training set's
# feature discretisation, as the LightGBM docs prescribe for validation data.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Train for at most 1000 boosting rounds; the early-stopping callback halts
# training once the validation metric has gone 50 rounds without improving.
lgb_model = lgb.train(
    params=lgb_params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(50)],
)


In [None]:
# Predict on the hold-out rows with the trained booster, then report the
# hold-out ROC-AUC.
lgb_val_preds = lgb_model.predict(X_val)
lgb_auc = roc_auc_score(y_true=y_val, y_score=lgb_val_preds)
print(f"LightGBM Validation ROC-AUC: {lgb_auc:.6f}")


In [None]:
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Base CatBoost estimator handed to the hyperparameter search: a fixed
# 400-iteration budget, Logloss objective, AUC as the evaluation metric,
# a fixed seed and no training log output.
cat_base = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    iterations=400,
    random_seed=42,
    verbose=0,
)


In [None]:
# Candidate values the randomized search samples from for each CatBoost
# hyperparameter.
param_dist_cat = dict(
    depth=list(range(4, 11, 2)),            # 4, 6, 8, 10
    learning_rate=[0.03, 0.05, 0.08, 0.1],
    l2_leaf_reg=list(range(1, 10, 2)),      # 1, 3, 5, 7, 9
    bagging_temperature=[0, 0.5, 1, 2],
)


In [None]:
# Randomized search over param_dist_cat: 12 sampled configurations, each
# scored by 3-fold cross-validated ROC-AUC, run on all available cores.
cat_search = RandomizedSearchCV(
    cat_base,
    param_dist_cat,
    n_iter=12,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=42,
    verbose=2,
)


In [2]:
# Tune on the training split only. Fitting the search on X_fe / y would let
# the rows later held out as X_val influence the chosen hyperparameters,
# which would make the validation and ensemble ROC-AUC optimistic.
cat_search.fit(X_train, y_train)


NameError: name 'cat_search' is not defined

In [None]:
# Report the best cross-validated score and the winning hyperparameters,
# one indented "name: value" line each.
print("Best CatBoost CV ROC-AUC:", cat_search.best_score_)
print("Best CatBoost parameters:")
for param_name, param_value in cat_search.best_params_.items():
    print(f"  {param_name}: {param_value}")


In [None]:
from sklearn.model_selection import train_test_split

# Same stratified 85/15 split as before the LightGBM step (identical inputs
# and seed), recomputed here ahead of the CatBoost fit.
X_train, X_val, y_train, y_val = train_test_split(
    X_fe, y,
    stratify=y,
    test_size=0.15,
    random_state=42,
)


In [None]:
best_cat_params = cat_search.best_params_

# Carry over exactly the four hyperparameters selected by the search.
tuned_kwargs = {
    name: best_cat_params[name]
    for name in ("learning_rate", "depth", "l2_leaf_reg", "bagging_temperature")
}

# Final CatBoost model: a much larger iteration budget than the search used,
# capped by early stopping (100 rounds), logging every 200 iterations.
cat_model = CatBoostClassifier(
    iterations=3000,
    early_stopping_rounds=100,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=200,
    **tuned_kwargs,
)


In [None]:
# Fit on the training split; the hold-out split is the evaluation set that
# early stopping monitors.
cat_model.fit(X=X_train, y=y_train, eval_set=(X_val, y_val))


In [None]:
# Take column 1 of predict_proba (the score for class 1) for the hold-out
# rows, then report the hold-out ROC-AUC.
cat_val_proba = cat_model.predict_proba(X_val)
cat_val_preds = cat_val_proba[:, 1]
cat_auc = roc_auc_score(y_true=y_val, y_score=cat_val_preds)
print(f"Final CatBoost Validation ROC-AUC: {cat_auc:.6f}")


In [None]:
# Fixed-weight blend of the two models' hold-out scores
# (LightGBM 0.4, CatBoost 0.6), scored with ROC-AUC.
ensemble_val_preds = 0.4 * lgb_val_preds + 0.6 * cat_val_preds
ensemble_auc = roc_auc_score(y_true=y_val, y_score=ensemble_val_preds)
print(f"CatBoost + LGBM Ensemble ROC-AUC: {ensemble_auc:.6f}")


In [None]:

# Score the engineered test features with both fitted models. For CatBoost,
# keep column 1 of predict_proba (the score for class 1).
cat_test_preds = cat_model.predict_proba(X_test_fe)[:, 1]
lgb_test_preds = lgb_model.predict(X_test_fe)


In [None]:
# Blend the test scores with the same fixed weights used on the hold-out set
# (CatBoost 0.6, LightGBM 0.4).
ensemble_test_preds = 0.6 * cat_test_preds + 0.4 * lgb_test_preds


In [None]:
# Pair each test id with its blended score and write the two-column
# submission file without the DataFrame index.
submission = test_df[["id"]].assign(**{"Heart Disease": ensemble_test_preds})
submission.to_csv("submission.csv", index=False)
