In [17]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    VotingClassifier
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE


In [18]:
# Peek at the raw CSV before any preprocessing; the last expression renders
# the first five rows as the cell output.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/Lenovo/Datasets/heart_disease.csv",
)
df.head(n=5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [19]:
# ------------------ Data Loading & Preprocessing ------------------

def load_and_preprocess_data(filepath, target):
    """Load a CSV file and return a cleaned, encoded and scaled DataFrame.

    Processing steps, in order:

    1. Read ``filepath`` with ``pd.read_csv``.
    2. Drop every column whose name contains ``'Unnamed'``.
    3. Drop every row that has at least one missing value.
    4. Label-encode each object-dtype column (values are cast to ``str``
       first, so mixed-type columns encode consistently).
    5. Standardize every numeric column except ``target``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    target : str
        Name of the label column. It is excluded from scaling. If it is an
        object-dtype column it is label-encoded like any other.

    Returns
    -------
    pandas.DataFrame
        The processed frame, including the ``target`` column. The original
        row index labels of the surviving rows are kept.

    Raises
    ------
    KeyError
        If ``target`` is not a column after the ``'Unnamed'`` columns have
        been removed.
    ValueError
        If no rows remain once rows with missing values are dropped.

    Notes
    -----
    The scaler is fit on every remaining row, i.e. before any train/test
    split done by the caller, so held-out rows influence the scaling
    statistics. The fitted encoders and scaler are not returned, so the same
    transformation cannot be re-applied to new data from this function alone.
    """
    raw = pd.read_csv(filepath)

    # Build the working frame without in-place mutation and take an explicit
    # copy, so the column assignments below write to an independent object
    # rather than to something derived from a slice of `raw`.
    keep_mask = ~raw.columns.str.contains('Unnamed')
    df = raw.loc[:, keep_mask].dropna().copy()

    if target not in df.columns:
        raise KeyError(f"Target column '{target}' not found in {filepath!r}")
    if df.empty:
        raise ValueError(
            f"No rows left in {filepath!r} after dropping rows with missing values"
        )

    # Encode categorical variables (a fresh encoder per column).
    for col in df.select_dtypes(include='object').columns:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Scale numerical features; the target is left untouched.
    numeric_cols = df.drop(columns=[target]).select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:  # StandardScaler rejects a zero-column input
        df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])

    return df

In [20]:
# ------------------ Model Training & Evaluation ------------------

def train_models(X_train, X_test, y_train, y_test, random_state=42):
    """Fit four baseline classifiers and report their test-set performance.

    The models are logistic regression, a decision tree, a random forest and
    AdaBoost. Each one is fit on ``(X_train, y_train)``, used to predict
    ``X_test``, and its accuracy plus a full classification report are
    printed.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each classifier's ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    random_state : int or None, default 42
        Seed given to the randomized estimators (decision tree, random
        forest, AdaBoost) so repeated runs give the same scores. Pass
        ``None`` to restore unseeded behavior.

    Returns
    -------
    tuple[dict, dict]
        ``(classifiers, scores)``: the fitted estimators keyed by name, and
        the accuracy of each on the test set keyed by the same names.
    """
    classifiers = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "DecisionTree": DecisionTreeClassifier(
            class_weight="balanced", random_state=random_state
        ),
        "RandomForest": RandomForestClassifier(
            class_weight="balanced", random_state=random_state
        ),
        "AdaBoost": AdaBoostClassifier(random_state=random_state),
    }

    scores = {}
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        acc = accuracy_score(y_test, preds)
        scores[name] = acc
        print(f"\n📊 {name} Accuracy: {acc:.4f}")
        # zero_division=1 silences the undefined-metric warning; note that it
        # reports 1.0 for a metric whose denominator is zero.
        print(classification_report(y_test, preds, zero_division=1))

    return classifiers, scores

In [22]:
# ------------------ Main Pipeline ------------------

def main(
    dataset_path="C:/Users/Lenovo/Datasets/heart_disease.csv",
    target_column="num",
    model_path="heart_disease_model.pkl",
    random_state=42,
):
    """Run the full pipeline: load, split, oversample, train, compare, save.

    Parameters
    ----------
    dataset_path : str or path-like
        CSV file to load. The default is the original author's local path;
        pass your own location when running elsewhere.
    target_column : str, default "num"
        Name of the label column.
    model_path : str or path-like, default "heart_disease_model.pkl"
        Where the best model is written with ``joblib.dump``.
    random_state : int or None, default 42
        Seed used for the train/test split, SMOTE and the randomized
        ensemble members.

    Returns
    -------
    None
        Results are printed and the selected model is written to
        ``model_path``.

    Notes
    -----
    * The best model is chosen by accuracy on the same test split that is
      reported, so the winning score is an optimistic estimate.
    * Only the estimator is saved. The encoding and scaling applied by
      ``load_and_preprocess_data`` are not part of the pickle.
    """
    # Load and preprocess data
    df = load_and_preprocess_data(dataset_path, target_column)

    # Check class distribution
    print("⚖️ Class distribution (before SMOTE):", Counter(df[target_column]))

    # Split data
    X = df.drop(columns=[target_column])
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )

    # Apply SMOTE to the training split only. SMOTE needs more samples in
    # every class than k_neighbors, so cap it by the rarest class; this keeps
    # the default of 5 whenever the data allows it.
    smallest_class = int(y_train.value_counts().min())
    k_neighbors = max(1, min(5, smallest_class - 1))
    sm = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    print("✅ Class distribution (after SMOTE):", Counter(y_train_res))

    # Train and evaluate models
    trained_models, model_scores = train_models(X_train_res, X_test, y_train_res, y_test)

    # Ensemble Voting Classifier (fresh, unfitted estimators; hard voting
    # takes the majority predicted label).
    ensemble = VotingClassifier(estimators=[
        ("LR", LogisticRegression(max_iter=1000)),
        ("DT", DecisionTreeClassifier(class_weight="balanced", random_state=random_state)),
        ("RF", RandomForestClassifier(class_weight="balanced", random_state=random_state)),
        ("AB", AdaBoostClassifier(random_state=random_state))
    ], voting="hard")

    ensemble.fit(X_train_res, y_train_res)
    ensemble_preds = ensemble.predict(X_test)
    ensemble_acc = accuracy_score(y_test, ensemble_preds)
    model_scores["VotingClassifier"] = ensemble_acc

    print(f"\n🔗 Voting Classifier Accuracy: {ensemble_acc:.4f}")
    print(classification_report(y_test, ensemble_preds, zero_division=1))

    # Save best model (ties resolve to the model that was scored first).
    best_model_name = max(model_scores, key=model_scores.get)
    best_model = ensemble if best_model_name == "VotingClassifier" else trained_models[best_model_name]
    joblib.dump(best_model, model_path)

    print(f"\n💾 Best model '{best_model_name}' saved to '{model_path}'")

if __name__ == "__main__":
    main()

⚖️ Class distribution (before SMOTE): Counter({0: 160, 1: 56, 2: 35, 3: 35, 4: 13})
✅ Class distribution (after SMOTE): Counter({1: 128, 0: 128, 2: 128, 4: 128, 3: 128})

📊 LogisticRegression Accuracy: 0.5833
              precision    recall  f1-score   support

           0       0.87      0.84      0.86        32
           1       0.45      0.45      0.45        11
           2       0.00      0.00      0.00         7
           3       0.43      0.43      0.43         7
           4       0.00      0.00      0.00         3

    accuracy                           0.58        60
   macro avg       0.35      0.35      0.35        60
weighted avg       0.60      0.58      0.59        60


📊 DecisionTree Accuracy: 0.4500
              precision    recall  f1-score   support

           0       0.88      0.72      0.79        32
           1       0.16      0.27      0.20        11
           2       0.00      0.00      0.00         7
           3       0.12      0.14      0.13         