In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s6e2/sample_submission.csv
/kaggle/input/playground-series-s6e2/train.csv
/kaggle/input/playground-series-s6e2/test.csv


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [3]:
train_df = pd.read_csv("/kaggle/input/playground-series-s6e2/train.csv")
test_df  = pd.read_csv("/kaggle/input/playground-series-s6e2/test.csv")

test_df.head()

train_df.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [4]:
TARGET = "Heart Disease"
ID_COL = "id"

X = train_df.drop([TARGET, ID_COL], axis=1)
y = train_df[TARGET]

X_test = test_df.drop(ID_COL, axis=1)

In [5]:
le = LabelEncoder()
y = le.fit_transform(y)

y = pd.Series(y, name=TARGET)

dict(zip(le.classes_, le.transform(le.classes_)))

{'Absence': np.int64(0), 'Presence': np.int64(1)}

In [6]:
cat_cols = [
    "Sex",
    "Chest pain type",
    "FBS over 120",
    "EKG results",
    "Exercise angina",
    "Slope of ST",
    "Thallium"
]

cat_idx = [X.columns.get_loc(col) for col in cat_cols]

In [7]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

lgb = LGBMClassifier(
    n_estimators=300,
    num_leaves=31,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

cat = CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.05,
    cat_features=cat_idx,
    loss_function="Logloss",
    verbose=0,
    random_state=42
)

models = {
    "XGBoost": xgb,
    "LightGBM": lgb,
    "CatBoost": cat
}


In [8]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = {}

for name, model in models.items():
    fold_losses = []

    for tr_idx, val_idx in skf.split(X, y):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        model.fit(X_tr, y_tr)
        val_prob = model.predict_proba(X_val)[:, 1]
        fold_losses.append(log_loss(y_val, val_prob))

    cv_results[name] = np.mean(fold_losses)

cv_results_df = pd.DataFrame.from_dict(
    cv_results, orient="index", columns=["CV Log Loss"]
)

cv_results_df


[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 422
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
[LightGB

Unnamed: 0,CV Log Loss
XGBoost,0.2688
LightGBM,0.268977
CatBoost,0.269734


In [9]:
for model in models.values():
    model.fit(X, y)

[LightGBM] [Info] Number of positive: 282454, number of negative: 347546
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070825 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448340 -> initscore=-0.207381
[LightGBM] [Info] Start training from score -0.207381


In [10]:
test_probs = {}

for name, model in models.items():
    test_probs[name] = model.predict_proba(X_test)[:, 1]

In [11]:
ensemble_avg_prob = np.mean(
    np.column_stack(list(test_probs.values())),
    axis=1
)

In [12]:
weights = [0.3, 0.3, 0.4]

ensemble_weighted_prob = (
    weights[0]*test_probs["XGBoost"] +
    weights[1]*test_probs["LightGBM"] +
    weights[2]*test_probs["CatBoost"]
)


In [13]:
blend_train = np.column_stack([
    models["XGBoost"].predict_proba(X)[:,1],
    models["LightGBM"].predict_proba(X)[:,1],
    models["CatBoost"].predict_proba(X)[:,1]
])

blend_test = np.column_stack([
    test_probs["XGBoost"],
    test_probs["LightGBM"],
    test_probs["CatBoost"]
])


In [14]:
meta = LogisticRegression()
meta.fit(blend_train, y)

blend_prob = meta.predict_proba(blend_test)[:,1]


In [15]:
final_prob = blend_prob   # best generalization
# final_prob = ensemble_avg_prob
# final_prob = ensemble_weighted_prob

final_pred = (final_prob > 0.5).astype(int)

In [16]:
final_label = le.inverse_transform(final_pred)

In [17]:
submission = pd.DataFrame({
    "id": test_df["id"],
    "Heart Disease": final_label
})

submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,id,Heart Disease
0,630000,Presence
1,630001,Absence
2,630002,Presence
3,630003,Absence
4,630004,Absence
