In [1]:
!pip install -q lightgbm catboost xgboost optuna

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

# Models
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [3]:
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

print("Train shape:", train.shape)
print("Test shape:", test.shape)

train.head()

Train shape: (630000, 15)
Test shape: (270000, 14)


Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [4]:
TARGET = 'Heart Disease'
ID_COL = 'id'

y = train[TARGET]
X = train.drop(columns=[TARGET])
X_test = test.copy()

In [5]:
train.isnull().sum()

Unnamed: 0,0
id,0
Age,0
Sex,0
Chest pain type,0
BP,0
Cholesterol,0
FBS over 120,0
EKG results,0
Max HR,0
Exercise angina,0


In [14]:
#missing values

#  Drop rows where target is missing
train = train.dropna(subset=['Heart Disease']).reset_index(drop=True)

# separate again after drop
TARGET = 'Heart Disease'
ID_COL = 'id'

y = train[TARGET]
X = train.drop(columns=[TARGET])
X_test = test.copy()

# Fill numeric missing values using MEDIAN (best for medical data)
num_cols = X.select_dtypes(include=np.number).columns

for col in num_cols:
    median_value = X[col].median()
    X[col].fillna(median_value, inplace=True)
    X_test[col].fillna(median_value, inplace=True)

print("After handling missing values:")
print(X.isnull().sum())


After handling missing values:
id                         0
Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
dtype: int64


In [7]:
#Handling categorical columns
cat_cols = X.select_dtypes(include=['object']).columns
print("Categorical columns:", list(cat_cols))

le_dict = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    X_test[col] = le.transform(X_test[col])
    le_dict[col] = le

Categorical columns: []


In [8]:
#Feature Engineering
def feature_engineering(df):

    # Example interactions
    if 'age' in df.columns and 'cholesterol' in df.columns:
        df['age_chol'] = df['age'] * df['cholesterol']

    if 'max_heart_rate' in df.columns and 'age' in df.columns:
        df['hr_age_ratio'] = df['max_heart_rate'] / (df['age'] + 1)

    if 'resting_bp' in df.columns and 'cholesterol' in df.columns:
        df['bp_chol_ratio'] = df['resting_bp'] / (df['cholesterol'] + 1)

    return df

X = feature_engineering(X)
X_test = feature_engineering(X_test)

In [9]:
#Stratified K fold setup
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

In [10]:
# remove broken preinstalled xgboost
!pip uninstall -y xgboost

# install stable version used on Kaggle
!pip install xgboost==1.7.6

Found existing installation: xgboost 1.7.6
Uninstalling xgboost-1.7.6:
  Successfully uninstalled xgboost-1.7.6
Collecting xgboost==1.7.6
  Using cached xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Using cached xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [16]:
#XGBoost Model
from xgboost.callback import EarlyStopping

xgb_oof = np.zeros(len(X))
xgb_test = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    print(f"\n--- XGBoost Fold {fold+1} ---")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.02,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='logloss',
        tree_method='hist',
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=200,
        early_stopping_rounds=100
    )

    xgb_oof[val_idx] = model.predict_proba(X_val)[:,1]
    xgb_test += model.predict_proba(X_test)[:,1] / N_SPLITS

print("XGB Log Loss:", log_loss(y, xgb_oof))





--- XGBoost Fold 1 ---
[0]	validation_0-logloss:0.68129
[200]	validation_0-logloss:0.27962
[400]	validation_0-logloss:0.27045
[600]	validation_0-logloss:0.26877
[800]	validation_0-logloss:0.26784
[999]	validation_0-logloss:0.26735

--- XGBoost Fold 2 ---
[0]	validation_0-logloss:0.68132
[200]	validation_0-logloss:0.28202
[400]	validation_0-logloss:0.27304
[600]	validation_0-logloss:0.27149
[800]	validation_0-logloss:0.27069
[999]	validation_0-logloss:0.27026

--- XGBoost Fold 3 ---
[0]	validation_0-logloss:0.68134
[200]	validation_0-logloss:0.28080
[400]	validation_0-logloss:0.27112
[600]	validation_0-logloss:0.26940
[800]	validation_0-logloss:0.26845
[999]	validation_0-logloss:0.26804

--- XGBoost Fold 4 ---
[0]	validation_0-logloss:0.68135
[200]	validation_0-logloss:0.28188
[400]	validation_0-logloss:0.27255
[600]	validation_0-logloss:0.27091
[800]	validation_0-logloss:0.27002
[999]	validation_0-logloss:0.26947

--- XGBoost Fold 5 ---
[0]	validation_0-logloss:0.68132
[200]	validatio

In [13]:
# convert target labels to numeric
train['Heart Disease'] = train['Heart Disease'].map({
    'Absence': 0,
    'Presence': 1
})

# verify
print(train['Heart Disease'].value_counts())


Heart Disease
0    347546
1    282454
Name: count, dtype: int64


In [17]:
#LightGBM Model
lgb_oof = np.zeros(len(X))
lgb_test = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    print(f"\n--- LightGBM Fold {fold+1} ---")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = lgb.LGBMClassifier(
        n_estimators=1500,
        learning_rate=0.02,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='logloss',
        callbacks=[lgb.early_stopping(100)]
    )

    lgb_oof[val_idx] = model.predict_proba(X_val)[:,1]
    lgb_test += model.predict_proba(X_test)[:,1] / N_SPLITS

print("LightGBM Log Loss:", log_loss(y, lgb_oof))



--- LightGBM Fold 1 ---
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.247175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 677
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1500]	valid_0's binary_logloss: 0.267357

--- LightGBM Fold 2 ---
[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.168962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 672
[LightGBM] [Info] Number of data p

In [18]:
#CatBoost Model
cat_oof = np.zeros(len(X))
cat_test = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):

    print(f"\n--- CatBoost Fold {fold+1} ---")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=2000,
        learning_rate=0.02,
        depth=6,
        loss_function='Logloss',
        eval_metric='Logloss',
        random_state=42,
        verbose=0
    )

    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    cat_oof[val_idx] = model.predict_proba(X_val)[:,1]
    cat_test += model.predict_proba(X_test)[:,1] / N_SPLITS

print("CatBoost Log Loss:", log_loss(y, cat_oof))



--- CatBoost Fold 1 ---

--- CatBoost Fold 2 ---

--- CatBoost Fold 3 ---

--- CatBoost Fold 4 ---

--- CatBoost Fold 5 ---
CatBoost Log Loss: 0.2678204280330791


In [19]:
final_oof = (0.4 * lgb_oof) + (0.3 * xgb_oof) + (0.3 * cat_oof)
print("Final Ensemble LogLoss:", log_loss(y, final_oof))

final_test = (0.4 * lgb_test) + (0.3 * xgb_test) + (0.3 * cat_test)

Final Ensemble LogLoss: 0.26802612050097196


In [20]:
submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET: final_test
})

submission.to_csv('submission.csv', index=False)

submission.head()

Unnamed: 0,id,Heart Disease
0,630000,0.937078
1,630001,0.008342
2,630002,0.982876
3,630003,0.005371
4,630004,0.192711
