# In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, classification_report


# Load the dataset; "HeartDisease" is the prediction target and every other
# column is used as a feature.
data = pd.read_csv("heart.csv")

# Integer-encode the categorical columns. LabelEncoder assigns codes in sorted
# class order. The single encoder is refit for every column, so after the loop
# `le` only holds the classes of the last column ('ST_Slope').
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
le = LabelEncoder()
for column in categorical_features:
    data[column] = le.fit_transform(data[column])

X = data.drop(columns=["HeartDisease"])
# Fill missing target values with the target median so the stratified split
# below never sees NaN labels.
# NOTE(review): imputing labels is questionable; consider dropping such rows.
y = data["HeartDisease"]
y = y.fillna(y.median())

# Split BEFORE fitting any statistics: medians and scaling parameters must be
# learned from the training rows only, otherwise information about the test
# set leaks into training and the reported test score is optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing feature values with the training-set medians (applied to both
# partitions so they are treated consistently).
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardise the continuous features: fit on the training rows, then reuse the
# same fitted scaler for the test rows.
scaler = StandardScaler()
num_features = ['Age', 'Cholesterol', 'RestingBP', "MaxHR"]
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

# Candidate classifiers keyed by display name. Insertion order matters: it is
# the order in which the models are tuned. Every estimator uses the same seed
# so that runs are reproducible.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)
models["MLP Classifier"] = MLPClassifier(max_iter=1000, random_state=42)

# Hyper-parameter search space for each candidate, keyed by the same display
# names used in `models`. Each inner mapping is passed to GridSearchCV as the
# parameter grid for that estimator.
param_grids = {
    "Logistic Regression": dict(
        C=[0.1, 1, 10],
        penalty=["l2"],
    ),
    "Random Forest": dict(
        n_estimators=[100, 200],
        max_depth=[5, 10, 20],
    ),
    "Gradient Boosting": dict(
        n_estimators=[100, 200],
        learning_rate=[0.01, 0.1],
        max_depth=[3, 5],
    ),
    "MLP Classifier": dict(
        hidden_layer_sizes=[(20,), (40,), (10,)],
        activation=["relu", "tanh"],
        alpha=[0.0001, 0.001, 0.01],
    ),
}

# Column-oriented accumulator for the tuning results: one (initially empty)
# list per result column, filled row by row and later turned into a DataFrame.
best_models = {
    column: []
    for column in (
        "model_name",
        "model",
        "best_params",
        "avg_cross_val_f1 (%)",
        "test_f1 (%)",
    )
}

# Tune every candidate with a 5-fold, F1-scored grid search on the training
# split and record its cross-validated and held-out performance.
for i, (model_name, model) in enumerate(models.items()):
    print(f"Running model : {i+1}")
    grid_search = GridSearchCV(model, param_grids[model_name], scoring="f1", cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ has already been retrained
    # on the whole training split using the winning parameters, and best_score_
    # is the mean 5-fold F1 of exactly that configuration. A separate
    # cross_val_score pass or another .fit() would only repeat the same work.
    best_estimator = grid_search.best_estimator_
    y_pred = best_estimator.predict(X_test)

    best_models['model_name'].append(model_name)
    best_models['model'].append(best_estimator)
    best_models['best_params'].append(grid_search.best_params_)
    best_models['avg_cross_val_f1 (%)'].append(round(grid_search.best_score_ * 100, 2))
    best_models['test_f1 (%)'].append(round(f1_score(y_test, y_pred) * 100, 2))

# Rank the tuned models by cross-validated F1 (best first). The test score is
# reported for information only and is deliberately not used for selection.
best_models = pd.DataFrame(best_models)
best_models = best_models.sort_values(by='avg_cross_val_f1 (%)', ascending=False, ignore_index=True)

# ignore_index=True renumbers the rows, so label 0 is the top-ranked model.
final_model = best_models['model'][0]


import joblib

# Persist the selected classifier together with the fitted feature scaler.
# The scaler file name is spelled "scalar.pkl"; it is kept as-is because any
# code that loads these artifacts presumably looks for that exact name.
# NOTE(review): the LabelEncoder mappings used for the categorical columns are
# not saved here, so inference code has to reproduce that encoding itself.
for artifact, path in ((final_model, "model.pkl"), (scaler, 'scalar.pkl')):
    joblib.dump(artifact, path)
print("Model and Scaler Saved!")