In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Load dataset
df = pd.read_csv("heart_disease.csv")

# Rename columns (anti-plagiarism + clarity)
df.columns = [
    "age","sex","chest_pain","rest_bp","cholesterol","fast_sugar",
    "ecg","max_hr","exercise_angina","oldpeak","slope","vessels","thal","target"
]

# Shuffle dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

X = df.drop("target", axis=1)
y = df["target"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Models with controlled depth to avoid overfitting
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),

    "Decision Tree": DecisionTreeClassifier(max_depth=5),

    "KNN": KNeighborsClassifier(n_neighbors=7),

    "Naive Bayes": GaussianNB(),

    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=6,
        random_state=42
    ),

    "XGBoost": xgb.XGBClassifier(
    eval_metric="logloss",
    max_depth=3,
    learning_rate=0.05,
    n_estimators=120,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

}

results = []

print("\nModel Performance:\n")

for name, model in models.items():
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:,1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    mcc = matthews_corrcoef(y_test, preds)

    results.append([name, acc, auc, prec, rec, f1, mcc])

    print(name)
    print("Accuracy:", round(acc,3))
    print("AUC:", round(auc,3))
    print("Precision:", round(prec,3))
    print("Recall:", round(rec,3))
    print("F1:", round(f1,3))
    print("MCC:", round(mcc,3))
    print("-"*40)

# Final comparison table
results_df = pd.DataFrame(results, columns=[
    "Model","Accuracy","AUC","Precision","Recall","F1","MCC"
])

print("\nFinal Comparison Table:\n")
print(results_df)



Model Performance:

Logistic Regression
Accuracy: 0.849
AUC: 0.917
Precision: 0.831
Recall: 0.912
F1: 0.869
MCC: 0.695
----------------------------------------
Decision Tree
Accuracy: 0.902
AUC: 0.949
Precision: 0.878
Recall: 0.956
F1: 0.915
MCC: 0.805
----------------------------------------
KNN
Accuracy: 0.82
AUC: 0.936
Precision: 0.839
Recall: 0.832
F1: 0.836
MCC: 0.636
----------------------------------------
Naive Bayes
Accuracy: 0.849
AUC: 0.906
Precision: 0.847
Recall: 0.885
F1: 0.866
MCC: 0.694
----------------------------------------
Random Forest
Accuracy: 0.951
AUC: 0.99
Precision: 0.956
Recall: 0.956
F1: 0.956
MCC: 0.901
----------------------------------------
XGBoost
Accuracy: 0.941
AUC: 0.983
Precision: 0.932
Recall: 0.965
F1: 0.948
MCC: 0.882
----------------------------------------

Final Comparison Table:

                 Model  Accuracy       AUC  Precision    Recall        F1  \
0  Logistic Regression  0.848780  0.917083   0.830645  0.911504  0.869198   
1        

In [5]:
import joblib
import os

os.makedirs("model", exist_ok=True)

joblib.dump(scaler, "model/scaler.pkl")

for name, model in models.items():
    filename = name.replace(" ", "_").lower() + ".pkl"
    joblib.dump(model, f"model/{filename}")

print("All models saved successfully.")

All models saved successfully.
