In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
import joblib
import os

# 1. Load data
# Read the cleaned dataset, then separate the 'num' column (used as the
# prediction target) from the remaining columns (used as features).
df = pd.read_csv('../data/heart_disease_clean.csv')
y = df['num']
X = df.drop(columns='num')
# Hold out 20% of the rows as a test split; the fixed seed makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. GridSearchCV
# Exhaustive search over 3 * 3 * 2 * 2 = 36 hyper-parameter combinations,
# each scored by 5-fold cross-validation on the training split only.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # use every available core
    verbose=1,
)
grid_search.fit(X_train, y_train)
print("Best parameters (GridSearchCV):", grid_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not a score on a separate validation set.
print("Best accuracy on validation set:", grid_search.best_score_)

# 3. RandomizedSearchCV
# Sample 20 random configurations from a wider space than the grid above,
# each scored by 5-fold cross-validation on the training split only.
param_dist = dict(
    n_estimators=randint(50, 200),      # integers in [50, 200)
    max_depth=[None, 5, 10, 15],
    min_samples_split=randint(2, 11),   # integers in [2, 11)
    min_samples_leaf=randint(1, 5),     # integers in [1, 5)
)
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    random_state=42,  # fixes which configurations get sampled
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)
print("Best parameters (RandomizedSearchCV):", random_search.best_params_)
# NOTE: best_score_ is the mean cross-validated accuracy of the best
# candidate, not a score on a separate validation set.
print("Best accuracy on validation set:", random_search.best_score_)

# 4. Save best model
# Pick whichever search reached the higher mean CV accuracy instead of always
# taking the randomized search. Both searches use the same scoring ('accuracy')
# and cv=5 on the same training data, so their best_score_ values are
# comparable. RandomizedSearchCV is listed first so that, on an exact tie,
# max() keeps the randomized-search model.
candidates = [
    ("RandomizedSearchCV", random_search),
    ("GridSearchCV", grid_search),
]
best_name, best_search = max(candidates, key=lambda item: item[1].best_score_)
# best_estimator_ is the winning configuration refit on the whole training
# split (refit=True is the default for both search classes).
best_model = best_search.best_estimator_

# Report accuracy on the held-out split, which neither search saw during
# tuning, so the saved model comes with a generalization estimate.
print(f"Selected model from {best_name}; held-out test accuracy:",
      best_model.score(X_test, y_test))

os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/final_model.pkl')
print("✅ Model saved as final_model.pkl")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters (GridSearchCV): {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
Best accuracy on validation set: 0.8223639455782313
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters (RandomizedSearchCV): {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 9, 'n_estimators': 70}
Best accuracy on validation set: 0.8304421768707483
✅ Model saved as final_model.pkl
