In [10]:
!pip install catboost



In [11]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")


In [12]:
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

TARGET = "Heart Disease"   # change if target column name differs
ID_COL = "id"       # change if id column differs

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (630000, 15)
Test shape: (270000, 14)


In [13]:
print("Before cleaning:", train[TARGET].unique())

train[TARGET] = train[TARGET].replace({
    "Presence": 1,
    "Absence": 0,
    "Ab": 0
})

train[TARGET] = train[TARGET].astype(int)

print("After cleaning:", train[TARGET].unique())

y = train[TARGET]

Before cleaning: ['Presence' 'Absence']
After cleaning: [1 0]


In [14]:
test_ids = test[ID_COL]

In [15]:
train.drop(columns=[TARGET, ID_COL], inplace=True)
test.drop(columns=[ID_COL], inplace=True)

In [16]:
full = pd.concat([train, test], axis=0)

for col in full.columns:
    if full[col].dtype == "object":
        le = LabelEncoder()
        full[col] = le.fit_transform(full[col].astype(str))

train = full.iloc[:len(y)].reset_index(drop=True)
test = full.iloc[len(y):].reset_index(drop=True)

X = train.copy()

print("Preprocessing Done.")

Preprocessing Done.


In [17]:
print("X shape:", X.shape)
print("Test shape:", test.shape)
print("y unique values:", y.unique())
print("y dtype:", y.dtype)
print("Any NaNs in X?", X.isnull().sum().sum())

X shape: (630000, 13)
Test shape: (270000, 13)
y unique values: [1 0]
y dtype: int64
Any NaNs in X? 0


In [19]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_xgb = np.zeros(len(X))
test_xgb = np.zeros(len(test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    print(f"\n====== Fold {fold+1} ======")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = XGBClassifier(
        n_estimators=3000,
        learning_rate=0.02,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=200
    )

    oof_xgb[val_idx] = model.predict_proba(X_val)[:, 1]
    test_xgb += model.predict_proba(test)[:, 1] / 5

print("CV LogLoss:", log_loss(y, oof_xgb))


[0]	validation_0-logloss:0.67562
[200]	validation_0-logloss:0.27680
[400]	validation_0-logloss:0.26968
[600]	validation_0-logloss:0.26824
[800]	validation_0-logloss:0.26754
[1000]	validation_0-logloss:0.26729
[1200]	validation_0-logloss:0.26721
[1400]	validation_0-logloss:0.26719
[1600]	validation_0-logloss:0.26726
[1800]	validation_0-logloss:0.26735
[2000]	validation_0-logloss:0.26747
[2200]	validation_0-logloss:0.26759
[2400]	validation_0-logloss:0.26773
[2600]	validation_0-logloss:0.26788
[2800]	validation_0-logloss:0.26803
[2999]	validation_0-logloss:0.26818

[0]	validation_0-logloss:0.67566
[200]	validation_0-logloss:0.27931
[400]	validation_0-logloss:0.27241
[600]	validation_0-logloss:0.27107
[800]	validation_0-logloss:0.27051
[1000]	validation_0-logloss:0.27034
[1200]	validation_0-logloss:0.27035
[1400]	validation_0-logloss:0.27040
[1600]	validation_0-logloss:0.27050
[1800]	validation_0-logloss:0.27061
[2000]	validation_0-logloss:0.27074
[2200]	validation_0-logloss:0.27087
[240

In [21]:
submission = pd.DataFrame({
    ID_COL: test_ids,
    TARGET: test_xgb
})

submission.to_csv("submission.csv", index=False)

print("submission.csv created successfully!")

submission.csv created successfully!


In [22]:
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>