## Step 1: Install and Import Libraries

In [1]:
# Install LightGBM
!pip install lightgbm

import numpy as np
import pandas as pd

import lightgbm as lgb
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split



## Step 2: Load Data

In [2]:
# Read the labelled training rows and the unlabelled test rows from CSV.
train = pd.read_csv("train_heart.csv")
test = pd.read_csv("test_Heart.csv")

# Quick sanity check: dimensions first, then the leading rows of each frame.
for label, frame in (("Train shape : ", train), ("Test shape : ", test)):
    print(label, frame.shape)
for frame in (train, test):
    print(frame.head())

Train shape :  (630000, 15)
Test shape :  (270000, 14)
   id  Age  Sex  Chest pain type   BP  Cholesterol  FBS over 120  EKG results  \
0   0   58    1                4  152          239             0            0   
1   1   52    1                1  125          325             0            2   
2   2   56    0                2  160          188             0            2   
3   3   44    0                3  134          229             0            2   
4   4   58    1                4  140          234             0            2   

   Max HR  Exercise angina  ST depression  Slope of ST  \
0     158                1            3.6            2   
1     171                0            0.0            1   
2     151                0            0.0            1   
3     150                0            1.0            2   
4     125                1            3.8            2   

   Number of vessels fluro  Thallium Heart Disease  
0                        2         7      Presence  
1  

## Step 3: Separate Features and Target

In [4]:
# Column holding the label, and the row-identifier column (not a feature).
target = "Heart Disease"
id_col = "id"

# Training features are everything except the label and the identifier.
y = train[target]
X = train.drop(columns=[target, id_col])

# Keep the test identifiers aside for the submission file, then drop them
# from the test feature matrix.
test_ids = test[id_col]
X_test = test.drop(columns=[id_col])

## Step 4: Encode Categorical Variables

In [5]:
# Encode train and test together so both end up with the same dummy columns.
n_train_rows = len(X)  # remember where the training rows end in the stack

combined = pd.concat([X, X_test], axis=0)
combined_encoded = pd.get_dummies(combined)

# Split the stacked frame back apart by position: training rows first,
# test rows after.
X, X_test = (
    combined_encoded.iloc[:n_train_rows],
    combined_encoded.iloc[n_train_rows:],
)

## Step 5: Train-Validation Split

In [6]:
# Hold out 20% of the labelled rows for validation. The split is stratified on
# the label and seeded so it is reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

## Step 6: Train Random Forest

In [7]:
# Random-forest baseline: fit on the training split, then score the held-out
# validation split by ROC-AUC.
rf_params = {
    "n_estimators": 500,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,  # use every available core
}
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of rf_model.classes_[1].
rf_val_proba = rf_model.predict_proba(X_val)
rf_val_pred = rf_val_proba[:, 1]
rf_auc = roc_auc_score(y_val, rf_val_pred)

print("Random Forest ROC-AUC:", rf_auc)

Random Forest ROC-AUC: 0.9528819792130918


## Step 7: Train LightGBM

In [8]:
# Gradient-boosted trees: fit on the training split, then score the held-out
# validation split by ROC-AUC (same protocol as the random forest).
lgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": 6,
    "random_state": 42,
}
lgb_model = lgb.LGBMClassifier(**lgb_params).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of lgb_model.classes_[1].
lgb_val_proba = lgb_model.predict_proba(X_val)
lgb_val_pred = lgb_val_proba[:, 1]
lgb_auc = roc_auc_score(y_val, lgb_val_pred)

print("LightGBM ROC-AUC:", lgb_auc)

[LightGBM] [Info] Number of positive: 225963, number of negative: 278037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448339 -> initscore=-0.207383
[LightGBM] [Info] Start training from score -0.207383
LightGBM ROC-AUC: 0.9559739103282345


## Step 8: Compare Models and Select Best

In [10]:
# Keep whichever model scored the higher validation ROC-AUC; on a tie (or an
# undefined comparison) the random forest is retained.
lgb_wins = lgb_auc > rf_auc
best_model = lgb_model if lgb_wins else rf_model
print("Best Model: LightGBM" if lgb_wins else "Best Model: Random Forest")

Best Model: LightGBM


## Step 9: Train Best Model on Full Data

In [11]:
# Refit the winning configuration on every labelled row (train + validation)
# before predicting on the test set.
#
# clone() builds a new, unfitted estimator with the same hyperparameters, so
# the rf_model / lgb_model objects that produced the validation scores are not
# overwritten by a model that has already seen the validation rows. Re-running
# this cell is also safe: it always starts from an unfitted copy.
best_model = clone(best_model).fit(X, y)
best_model

[LightGBM] [Info] Number of positive: 282454, number of negative: 347546
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 417
[LightGBM] [Info] Number of data points in the train set: 630000, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448340 -> initscore=-0.207381
[LightGBM] [Info] Start training from score -0.207381


LGBMClassifier(learning_rate=0.05, max_depth=6, n_estimators=1000,
               random_state=42)

## Step 10: Predict Test Data

In [12]:
# predict_proba returns one column per class; column 1 is the probability of
# best_model.classes_[1], which is what gets submitted.
test_proba = best_model.predict_proba(X_test)
test_pred = test_proba[:, 1]



## Step 11: Create Submission File

In [13]:
# Build the submission table: one row per test id with its predicted
# probability.
#
# The prediction column reuses `target` ("Heart Disease") rather than a
# separately typed "HeartDisease" literal, so the header cannot drift from the
# label column name used in the training data.
# NOTE(review): assumes the expected submission header matches the training
# target column exactly — confirm against the sample submission file.
submission = pd.DataFrame({
    id_col: test_ids,
    target: test_pred,
})

# index=False keeps the DataFrame's positional index out of the CSV.
submission.to_csv("submission.csv", index=False)

print("Submission file created successfully!")

Submission file created successfully!
