In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Read the dataset CSV and the JSON file listing numeric / categorical column names
df = pd.read_csv('../data/heart_disease_clean.csv')
with open('../models/cols.json', 'r') as cols_file:
    cols = json.load(cols_file)
num_cols, cat_cols = cols['num_cols'], cols['cat_cols']

# Label column is 'target'; every other column is a feature
y = df['target']
X = df.drop(columns='target')

# Hold out 20% for testing; stratify on the label and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise numeric columns; one-hot encode categorical columns
# (handle_unknown='ignore' keeps prediction from failing on unseen categories)
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    cat_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Candidate estimators and their hyperparameter search spaces, keyed by display name
def _classifier_grid(**options):
    """Return a param grid whose keys carry the 'classifier__' prefix.

    The prefix routes each hyperparameter to the pipeline step named
    'classifier' when the grid is passed to a search over a Pipeline.
    """
    return {f'classifier__{option}': values for option, values in options.items()}


models_params = {
    'Logistic Regression': dict(
        model=LogisticRegression(max_iter=1000, random_state=42),
        params=_classifier_grid(
            C=[0.01, 0.1, 1, 10, 100],
            penalty=['l2'],
            solver=['lbfgs'],
        ),
    ),
    'Decision Tree': dict(
        model=DecisionTreeClassifier(random_state=42),
        params=_classifier_grid(
            max_depth=[None, 5, 10, 20],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(random_state=42),
        params=_classifier_grid(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5],
            min_samples_leaf=[1, 2],
        ),
    ),
    'SVM': dict(
        # probability=True enables predict_proba on the fitted SVC
        model=SVC(probability=True, random_state=42),
        params=_classifier_grid(
            C=[0.1, 1, 10],
            kernel=['linear', 'rbf'],
            gamma=['scale', 'auto'],
        ),
    ),
}

# Tune each candidate with a 5-fold grid search and keep the refitted best pipeline
best_models = {}
for name in models_params:
    mp = models_params[name]
    print(f"Starting GridSearchCV for {name}...")
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', mp['model']),
    ])
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=mp['params'],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid.fit(X_train, y_train)
    print(f"Best params for {name}: {grid.best_params_}")
    best_models[name] = grid.best_estimator_

# Report accuracy of every tuned pipeline on the held-out test split
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} Test Accuracy after tuning: {acc:.4f}")


Starting GridSearchCV for Logistic Regression...
Best params for Logistic Regression: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Starting GridSearchCV for Decision Tree...
Best params for Decision Tree: {'classifier__max_depth': 5, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 2}
Starting GridSearchCV for Random Forest...
Best params for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Starting GridSearchCV for SVM...
Best params for SVM: {'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
Logistic Regression Test Accuracy after tuning: 0.6230
Decision Tree Test Accuracy after tuning: 0.4918
Random Forest Test Accuracy after tuning: 0.5410
SVM Test Accuracy after tuning: 0.5574


In [2]:
import joblib
from sklearn.model_selection import cross_val_score

# Choose the model to persist from the data instead of hard-coding a name:
# a fixed choice silently goes stale whenever the tuning results change.
# Selection uses mean 5-fold cross-validated accuracy on the training split
# (same cv/scoring settings as the grid search) so the test split stays
# untouched by model selection and its accuracy remains an honest estimate.
cv_accuracy = {
    candidate_name: cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()
    for candidate_name, candidate in best_models.items()
}
best_name = max(cv_accuracy, key=cv_accuracy.get)
best_model = best_models[best_name]
print(f"Selected {best_name} (mean CV accuracy: {cv_accuracy[best_name]:.4f})")

# Persist the already-fitted pipeline (preprocessing + classifier) as one artifact
joblib.dump(best_model, '../models/final_model.pkl')
print("Model saved to ../models/final_model.pkl")


Model saved to ../models/final_model.pkl
