In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef, confusion_matrix, classification_report

import joblib
import os

In [3]:
# Read heart.csv from the working directory into a DataFrame and preview
# the first five rows as the cell output.
df = pd.read_csv("heart.csv")
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
# Show the column labels; "target" is the column used as the label below.
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [5]:
# Split the frame into the label vector (the "target" column) and the
# feature matrix (every other column). `df` itself is left untouched.
y = df["target"]
X = df.drop(columns=["target"])


In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition the same on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

In [7]:
# Standardise the features: fit on the training rows only, then apply the
# same transform to the held-out rows.
#
# The raw rows are re-derived from `X` using the index labels that the split
# left on y_train / y_test, instead of reading X_train / X_test. This cell
# rebinds X_train / X_test to scaled arrays, so reading them back would make
# a second execution fit the scaler on already-standardised data (and the
# scaler saved later would then be useless on raw input). Selecting from `X`
# makes the cell idempotent while producing the same result on a first run.
# Relies on df having unique index labels (the default index from read_csv).
scaler = StandardScaler()
X_train = scaler.fit_transform(X.loc[y_train.index])
X_test = scaler.transform(X.loc[y_test.index])

In [13]:
# Train every candidate classifier on the scaled training set and collect
# hold-out metrics into one comparison table.

# Seed for the estimators that use randomness internally, so the metrics
# table is reproducible across kernel restarts.
SEED = 42

models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=SEED),
}
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # AUC is computed from the predicted probability of class 1. If an
    # estimator cannot produce probabilities, record NaN (not a string) so
    # the AUC column stays numeric and can be sorted/aggregated.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_prob)
    else:
        auc = np.nan

    # One row per model, in the same order as the column list below.
    results.append([
        name,
        accuracy_score(y_test, y_pred),
        auc,
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred)
    ])

results_df = pd.DataFrame(
    results,
    columns=["Model","Accuracy","AUC","Precision","Recall","F1","MCC"]
)

results_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.795122,0.878736,0.756303,0.873786,0.810811,0.597255
1,Decision Tree,0.985366,0.985437,1.0,0.970874,0.985222,0.971151
2,kNN,0.834146,0.948553,0.8,0.893204,0.844037,0.672727
3,Naive Bayes,0.8,0.87055,0.754098,0.893204,0.817778,0.610224
4,Random Forest,0.985366,1.0,1.0,0.970874,0.985222,0.971151
5,XGBoost,0.985366,0.989435,1.0,0.970874,0.985222,0.971151


In [14]:
# Persist each fitted estimator and the fitted scaler as joblib pickles
# under ./model, one file per object. File names are the dictionary keys
# of `models` (spaces included) plus ".pkl".
model_dir = "model"
os.makedirs(model_dir, exist_ok=True)

for label, estimator in models.items():
    target_path = f"{model_dir}/{label}.pkl"
    joblib.dump(estimator, target_path)

# The scaler is saved alongside the models in the same directory.
joblib.dump(scaler, f"{model_dir}/scaler.pkl")

print("Models saved successfully")

Models saved successfully


In [15]:
# Bundle the whole model/ directory into models.zip via the shell `zip`
# tool (recursive). Requires `zip` to be available on the host.
!zip -r models.zip model

  adding: model/ (stored 0%)
  adding: model/XGBoost.pkl (deflated 75%)
  adding: model/Naive Bayes.pkl (deflated 21%)
  adding: model/kNN.pkl (deflated 88%)
  adding: model/Logistic Regression.pkl (deflated 31%)
  adding: model/Random Forest.pkl (deflated 81%)
  adding: model/Decision Tree.pkl (deflated 70%)
  adding: model/scaler.pkl (deflated 29%)


In [16]:
# Export a small CSV of 20 rows for trying out the saved models.
# Rows are drawn only from the held-out split (looked up in `df` through the
# index labels kept on y_test), not from the full frame: most of the full
# frame was used for training, so sampling it would mostly yield rows the
# models have already seen. The "target" column is kept in the export.
held_out_rows = df.loc[y_test.index]
test_sample = held_out_rows.sample(20, random_state=42)

test_sample.to_csv("test_data.csv", index=False)

print("Test CSV created successfully")

Test CSV created successfully
