In [1]:
!pip install xgboost lightgbm tensorflow scikit-learn pandas numpy matplotlib seaborn joblib




In [2]:
from google.colab import files
uploaded = files.upload()


Saving CVD_Cleaned.txt to CVD_Cleaned.txt


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

print("üìÇ Loading cleaned dataset...")

df = pd.read_csv("CVD_Cleaned.txt")
print(f"Loaded: {df.shape}")

# Detect target column
target_col = None
for col in df.columns:
    if "heart" in col.lower() and "disease" in col.lower():
        target_col = col
        break

print("üéØ Target column:", target_col)

# Encode target Yes/No ‚Üí 1/0
df[target_col] = df[target_col].map({"No": 0, "Yes": 1})

X = df.drop(columns=[target_col])
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


üìÇ Loading cleaned dataset...
Loaded: (567606, 28)
üéØ Target column: Heart_Disease


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.base import BaseEstimator, ClassifierMixin

class TabularMLP(BaseEstimator, ClassifierMixin):
    def __init__(
        self,
        input_dim=None,
        hidden_units=128,
        hidden_layers=2,
        dropout=0.3,
        batch_size=2048,
        epochs=20,
        verbose=0
    ):
        self.input_dim = input_dim
        self.hidden_units = hidden_units
        self.hidden_layers = hidden_layers
        self.dropout = dropout
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose
        self.model_ = None

    def _build_model(self):
        model = Sequential()
        model.add(Dense(self.hidden_units, activation='relu',
                        input_shape=(self.input_dim,)))
        model.add(Dropout(self.dropout))

        for _ in range(self.hidden_layers - 1):
            model.add(Dense(self.hidden_units, activation='relu'))
            model.add(Dropout(self.dropout))

        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

    def fit(self, X, y):
        self.model_ = self._build_model()
        es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

        self.model_.fit(
            X, y,
            validation_split=0.1,
            epochs=self.epochs,
            batch_size=self.batch_size,
            callbacks=[es],
            verbose=self.verbose
        )
        return self

    def predict_proba(self, X):
        prob = self.model_.predict(X, verbose=0).flatten()
        return np.vstack([1 - prob, prob]).T

    def predict(self, X):
        return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "Random Forest": RandomForestClassifier(n_estimators=100),

    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss"
    ),

    "LightGBM": LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31
    ),

    "Neural Network MLP": TabularMLP(
        input_dim=X_train.shape[1],
        hidden_units=128,
        hidden_layers=2,
        dropout=0.3,
        epochs=20,
        batch_size=2048,
        verbose=1
    )
}

models


{'Logistic Regression': LogisticRegression(max_iter=1000),
 'Decision Tree': DecisionTreeClassifier(max_depth=10),
 'Random Forest': RandomForestClassifier(),
 'XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.8, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, feature_weights=None, gamma=None,
               grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.05, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=6, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=300, n_jobs=None,
               num_parallel_tree=None, ...),
 'LightGBM': LGBMClassifier(learning_rate=

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

results = {}

for name, model in models.items():
    print(f"\nüöÄ Training {name}...")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)

    results[name] = [acc, prec, rec, f1, roc]

    print(f"   Accuracy  : {acc:.4f}")
    print(f"   Precision : {prec:.4f}")
    print(f"   Recall    : {rec:.4f}")
    print(f"   F1-Score  : {f1:.4f}")
    print(f"   ROC-AUC   : {roc:.4f}")



üöÄ Training Logistic Regression...
   Accuracy  : 0.8014
   Precision : 0.7944
   Recall    : 0.8134
   F1-Score  : 0.8038
   ROC-AUC   : 0.8874

üöÄ Training Decision Tree...
   Accuracy  : 0.8777
   Precision : 0.8985
   Recall    : 0.8516
   F1-Score  : 0.8744
   ROC-AUC   : 0.9518

üöÄ Training Random Forest...
   Accuracy  : 0.9589
   Precision : 0.9699
   Recall    : 0.9471
   F1-Score  : 0.9584
   ROC-AUC   : 0.9923

üöÄ Training XGBoost...
   Accuracy  : 0.9497
   Precision : 0.9884
   Recall    : 0.9101
   F1-Score  : 0.9476
   ROC-AUC   : 0.9845

üöÄ Training LightGBM...
[LightGBM] [Info] Number of positive: 227042, number of negative: 227042
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089817 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2601
[LightGBM] [Info] Number of data points in the train set: 454084, number



   Accuracy  : 0.9545
   Precision : 0.9943
   Recall    : 0.9142
   F1-Score  : 0.9526
   ROC-AUC   : 0.9857

üöÄ Training Neural Network MLP...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m200/200[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.7216 - loss: 0.5381 - val_accuracy: 0.7968 - val_loss: 0.4413
Epoch 2/20
[1m200/200[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.7910 - loss: 0.4475 - val_accuracy: 0.8066 - val_loss: 0.4191
Epoch 3/20
[1m200/200[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.8005 - loss: 0.4273 - val_accuracy: 0.8099 - val_loss: 0.4096
Epoch 4/20
[1m200/200[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.8027 - loss: 0.4192 - val_accuracy: 0.8100 - val_loss: 0.4059
Epoch 5/20
[1m200/200[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.8065 - loss: 0.4130 - val_accuracy: 0.8125 - va

In [7]:
import pandas as pd

results_df = pd.DataFrame(results, index=["Accuracy", "Precision", "Recall", "F1", "ROC-AUC"]).T
results_df


Unnamed: 0,Accuracy,Precision,Recall,F1,ROC-AUC
Logistic Regression,0.801404,0.794357,0.813375,0.803753,0.887383
Decision Tree,0.877724,0.898513,0.851641,0.874449,0.951826
Random Forest,0.958863,0.969923,0.947094,0.958373,0.992297
XGBoost,0.949675,0.988367,0.910061,0.947599,0.984503
LightGBM,0.954502,0.994309,0.914237,0.952593,0.98567
Neural Network MLP,0.824519,0.813724,0.841722,0.827486,0.907706
