In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib

In [2]:
df = pd.read_csv("../data/heart_disease_clean.csv")
X = df.drop(columns=['target'])
y = df['target']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [4]:
rf = Pipeline([("scaler", StandardScaler()), ("clf", RandomForestClassifier(random_state=42))])
rf_params = {"clf__n_estimators":[100,200], "clf__max_depth":[None,6,10], "clf__min_samples_split":[2,5]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring="f1", n_jobs=-1)
rf_grid.fit(X_train, y_train)
print("Best RF params:", rf_grid.best_params_)

Best RF params: {'clf__max_depth': None, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}


In [5]:
svm = Pipeline([("scaler", StandardScaler()), ("clf", SVC(probability=True, random_state=42))])
svm_params = {"clf__C":[0.1,1,10], "clf__kernel":["linear","rbf"], "clf__gamma":["scale","auto"]}
svm_search = RandomizedSearchCV(svm, svm_params, n_iter=5, cv=5, scoring="f1", n_jobs=-1, random_state=42)
svm_search.fit(X_train, y_train)
print("Best SVM params:", svm_search.best_params_)

Best SVM params: {'clf__kernel': 'linear', 'clf__gamma': 'auto', 'clf__C': 10}


In [6]:
best_model = rf_grid.best_estimator_ if rf_grid.best_score_ >= svm_search.best_score_ else svm_search.best_estimator_
joblib.dump(best_model, "../models/final_model.pkl")
print("Best model saved -> models/final_model.pkl")

Best model saved -> models/final_model.pkl
