In [5]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from supervised_learning import X_train, X_test, y_train, y_test, models, y_test_bin
from pathlib import Path
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import os
from joblib import dump
from pathlib import Path

In [None]:
###Hyperparameter Tuning###

# Define parameter grids for GridSearchCV/RandomizedSearchCV
param_grids = {
    "Logistic Regression": {"C": [0.01, 0.1, 1, 10], "solver": ["lbfgs", "liblinear"], "max_iter": [1000]},
    "Decision Tree": {"max_depth": [None, 5, 10, 15], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 4]},
    "Random Forest": {"n_estimators": [50, 100, 200], "max_depth": [None, 5, 10], "min_samples_split": [2, 5], "min_samples_leaf": [1, 2]},
    "Support Vector Machine": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale", "auto"]}
}

# Store baseline results (from previous classification)
baseline_results={}
for name, model in  models.items():
    y_pred= model.predict(X_test)
    baseline_results[name]= accuracy_score(y_test,y_pred)

# Tune models and compare# Tune models and compare
optimized_models={}
optimized_results={}

for name,model in models.items():
    print(f"\nTuning {name}...")

    grid= GridSearchCV(model, param_grids[name], cv=3, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)

    random= RandomizedSearchCV(model, param_grids[name], cv=3, scoring='accuracy', n_jobs=-1, n_iter=5, random_state=42)
    random.fit(X_train, y_train)

    # Choose the best between grid and random
    best_model= grid.best_estimator_ if grid.best_score_>= random.best_score_ else random.best_estimator_
    optimized_models[name]= best_model

    # Evaluate optimized model
    y_pred_opt= best_model.predict(X_test)
    optimized_results[name]= accuracy_score(y_test,y_pred)

# Compare baseline vs optimized
print("\nBaseline vs Optimized Accuracy:")
for name in models.keys():
     print(f"{name} Basline= {baseline_results[name]:.4f}, Optimized= {optimized_results[name]:.4f}")

# Identify best performing optimized model    
best_model_name = max(optimized_results, key=optimized_results.get)
best_model = optimized_models[best_model_name]
y_pred_best = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)

print(f"\nBest Performing Model after Hyperparameter Tuning: {best_model_name}")
print(f"Accuracy of Best Model on Test Set: {best_accuracy:.4f}")

# Save evaluation metrics

# Make sure results folder exists
os.makedirs('results', exist_ok=True)

metrics_lines = []

# Re-run metrics collection for saving
for name, model in models.items():
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    auc = roc_auc_score(y_test_bin, y_prob, multi_class='ovr')

    metrics_lines.append(f"{name}:\n"
                         f"Accuracy: {acc:.4f}\n"
                         f"Precision: {prec:.4f}\n"
                         f"Recall: {rec:.4f}\n"
                         f"F1-score: {f1:.4f}\n"
                         f"ROC AUC: {auc:.4f}\n\n")

# Write metrics to file
with open("results/evaluation_metrics.txt", "w") as f:
    f.writelines(metrics_lines)

print("\nMetrics saved to results/evaluation_metrics.txt")




Tuning Logistic Regression...

Tuning Decision Tree...

Tuning Random Forest...

Tuning Support Vector Machine...

Baseline vs Optimized Accuracy:
Logistic Regression Basline= 0.5738, Optimized= 0.5738
Decision Tree Basline= 0.3770, Optimized= 0.5738
Random Forest Basline= 0.4918, Optimized= 0.5738
Support Vector Machine Basline= 0.5738, Optimized= 0.5738

Best Performing Model after Hyperparameter Tuning: Logistic Regression
Accuracy of Best Model on Test Set: 0.5738

Metrics saved to results/evaluation_metrics.txt


In [None]:
###Model Export & Deployment###

# Dynamically find project folder
notebook_folder = Path().resolve()
project_root = notebook_folder / ".." / "Heart_Disease_Project"  # go into project folder
project_root = project_root.resolve()  # normalize path

# Make sure models folder exists
models_folder = project_root / 'models'
models_folder.mkdir(exist_ok=True)

# Create and fit pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', best_model)
])

# Fit pipeline on training data
pipeline.fit(X_train, y_train)

# Save the pipeline to a .pkl file 
pipeline_file = models_folder / 'final_model.pkl'
dump(pipeline, pipeline_file)
print(f"\nModel pipeline saved as '{pipeline_file}'")



Model pipeline saved as 'C:\Users\Tasneem\OneDrive\Desktop\Heart_Disease_Project\models\final_model.pkl'
