In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [2]:
#Load DataSet
df=pd.read_csv("../data/heart.csv")
# print("Data Loaded Successfully",df)

X= df.iloc[:,:-1]
y= df.iloc[:,-1]

#train test split

X_train, X_test, y_train, y_test = train_test_split(
    X,y,test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [3]:
#Model

models={
    "LogisticRegression":LogisticRegression(),
    "DecisionTree":DecisionTreeClassifier(),
    "KNeighbors":KNeighborsClassifier(),
    "GaussianNB":GaussianNB(),
    "RandomForest":RandomForestClassifier(),
    "XGBoost":XGBClassifier(use_label_encoder=False, eval_metric='logloss')     
    
}

In [4]:
#Metrics FUnctions

def evaluate(model, X_test,y_test):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    return{
        "Accuracy": accuracy_score(y_test, y_pred),
        "ROC_AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1_Score": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

In [5]:
# ======================
# Train + Save
# ======================
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f"{name.replace(' ', '_').lower()}.pkl")
    results[name] = evaluate(model, X_test, y_test)

# ======================
# Results Table
# ======================
results_df = pd.DataFrame(results).T
results_df.to_csv("model_results.csv")
print(results_df)

                    Accuracy   ROC_AUC  Precision    Recall  F1_Score  \
LogisticRegression  0.795122  0.878736   0.756303  0.873786  0.810811   
DecisionTree        0.985366  0.985437   1.000000  0.970874  0.985222   
KNeighbors          0.834146  0.948553   0.800000  0.893204  0.844037   
GaussianNB          0.800000  0.870550   0.754098  0.893204  0.817778   
RandomForest        0.985366  1.000000   1.000000  0.970874  0.985222   
XGBoost             0.985366  0.989435   1.000000  0.970874  0.985222   

                         MCC  
LogisticRegression  0.597255  
DecisionTree        0.971151  
KNeighbors          0.672727  
GaussianNB          0.610224  
RandomForest        0.971151  
XGBoost             0.971151  


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
