### Hyperparameter Tuning

In [None]:
# Cell 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score


In [None]:
# Cell 2: Load the dataset and carve out a stratified hold-out test set
df = pd.read_csv("heart_disease_reduced.csv")
print("Shape:", df.shape)

# "target" is the label column; every other column is used as a feature.
y = df["target"]
X = df.drop(columns=["target"])

# 80/20 split, stratified on the label so both parts keep the same class ratio;
# the fixed seed makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


# Cell 3: GridSearchCV for Random Forest with Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Two-step pipeline; the step names ("scaler", "rf") are the prefixes used
# by the "<step>__<param>" keys of the search grid below.
rf_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),  # included for consistency
        ("rf", RandomForestClassifier(class_weight="balanced", random_state=42)),
    ]
)

# 3 x 3 x 3 = 27 candidate settings.
rf_params = dict(
    rf__n_estimators=[50, 100, 200],
    rf__max_depth=[None, 5, 10],
    rf__min_samples_split=[2, 5, 10],
)

# Exhaustive search: every candidate is scored by 5-fold CV ROC AUC on the
# training split only, using all available cores.
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

print("Best RF Params:", grid_rf.best_params_)
print("Best RF AUC (CV):", grid_rf.best_score_)

# Evaluate on test set
rf_best = grid_rf.best_estimator_
y_proba_rf = rf_best.predict_proba(X_test)[:, 1]  # second probability column only
y_pred_rf = rf_best.predict(X_test)

print("\nRandom Forest Test Results:")
print(classification_report(y_test, y_pred_rf))
print("Test ROC AUC:", roc_auc_score(y_test, y_proba_rf))


Shape: (303, 9)
Train shape: (242, 8) Test shape: (61, 8)
Best RF Params: {'rf__max_depth': 5, 'rf__min_samples_split': 10, 'rf__n_estimators': 50}
Best RF AUC (CV): 0.8786893460806503

Random Forest Test Results:
              precision    recall  f1-score   support

           0       0.93      0.76      0.83        33
           1       0.76      0.93      0.84        28

    accuracy                           0.84        61
   macro avg       0.85      0.84      0.84        61
weighted avg       0.85      0.84      0.84        61

Test ROC AUC: 0.9204545454545454


In [None]:
# Cell 4: RandomizedSearchCV for SVM with Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scale first, then fit the SVC. probability=True is what makes
# predict_proba available on the fitted classifier further down.
svm_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(class_weight="balanced", probability=True, random_state=42)),
    ]
)

# 4 x 4 x 3 = 48 possible combinations; the search samples 30 of them.
svm_params = dict(
    svc__C=[0.1, 1, 10, 100],
    svc__gamma=[0.001, 0.01, 0.1, 1],
    svc__kernel=["rbf", "poly", "sigmoid"],
)

# Randomized search scored by 5-fold CV ROC AUC; the seed fixes which
# combinations are drawn.
rand_svm = RandomizedSearchCV(
    estimator=svm_pipeline,
    param_distributions=svm_params,
    n_iter=30,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rand_svm.fit(X_train, y_train)

print("Best SVM Params:", rand_svm.best_params_)
print("Best SVM AUC (CV):", rand_svm.best_score_)

# Evaluate on test set
svm_best = rand_svm.best_estimator_
y_proba_svm = svm_best.predict_proba(X_test)[:, 1]  # second probability column only
y_pred_svm = svm_best.predict(X_test)

print("\nSVM Test Results:")
print(classification_report(y_test, y_pred_svm))
print("Test ROC AUC:", roc_auc_score(y_test, y_proba_svm))


Best SVM Params: {'svc__kernel': 'sigmoid', 'svc__gamma': 0.01, 'svc__C': 10}
Best SVM AUC (CV): 0.8949945384727993

SVM Test Results:
              precision    recall  f1-score   support

           0       0.93      0.82      0.87        33
           1       0.81      0.93      0.87        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.88      0.87      0.87        61

Test ROC AUC: 0.9247835497835498


In [None]:
# Cell 5: GridSearchCV for Logistic Regression
# LogisticRegression is not imported by any other cell of this notebook, so it
# is imported here; Pipeline and StandardScaler are re-imported (as the sibling
# model cells do) so this cell does not rely on an earlier cell having run.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 4 regularisation strengths x 2 penalty types = 8 candidate settings.
lr_params = {
    "logreg__C": [0.01, 0.1, 1, 10],
    "logreg__penalty": ["l1", "l2"]
}

# The liblinear solver is used because the grid searches over both the
# "l1" and "l2" penalties, and liblinear supports both.
lr_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000, solver="liblinear", class_weight="balanced"))
])

# Exhaustive search scored by 5-fold CV ROC AUC on the training split only.
grid_lr = GridSearchCV(lr_pipe, lr_params, cv=5, scoring="roc_auc", n_jobs=-1)
grid_lr.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_lr.best_params_)
print("Best Logistic Regression AUC (CV):", grid_lr.best_score_)

# Evaluate on test set
lr_best = grid_lr.best_estimator_
y_pred_lr = lr_best.predict(X_test)
y_proba_lr = lr_best.predict_proba(X_test)[:, 1]  # second probability column only

print("\nLogistic Regression Test Results:")
print(classification_report(y_test, y_pred_lr))
print("Test ROC AUC:", roc_auc_score(y_test, y_proba_lr))


Best Logistic Regression Params: {'logreg__C': 10, 'logreg__penalty': 'l1'}
Best Logistic Regression AUC (CV): 0.8915306915306915

Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83        33
           1       0.79      0.82      0.81        28

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61

Test ROC AUC: 0.9199134199134199


In [None]:
# Cell 5: GridSearchCV for Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Same two-step layout as the other model pipelines; the step names
# ("scaler", "dt") are the prefixes used by the grid keys below.
dt_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),  # scaler included for consistency
        ("dt", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
    ]
)

# 4 x 3 x 2 = 24 candidate settings.
dt_params = dict(
    dt__max_depth=[None, 3, 5, 10],
    dt__min_samples_split=[2, 5, 10],
    dt__criterion=["gini", "entropy"],
)

# Exhaustive search scored by 5-fold CV ROC AUC on the training split only.
grid_dt = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
)
grid_dt.fit(X_train, y_train)

print("Best DT Params:", grid_dt.best_params_)
print("Best DT AUC (CV):", grid_dt.best_score_)

# Evaluate on test set
dt_best = grid_dt.best_estimator_
y_proba_dt = dt_best.predict_proba(X_test)[:, 1]  # second probability column only
y_pred_dt = dt_best.predict(X_test)

print("\nDecision Tree Test Results:")
print(classification_report(y_test, y_pred_dt))
print("Test ROC AUC:", roc_auc_score(y_test, y_proba_dt))


Best DT Params: {'dt__criterion': 'gini', 'dt__max_depth': 3, 'dt__min_samples_split': 2}
Best DT AUC (CV): 0.8418645766471855

Decision Tree Test Results:
              precision    recall  f1-score   support

           0       0.80      0.85      0.82        33
           1       0.81      0.75      0.78        28

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.80      0.80      0.80        61

Test ROC AUC: 0.8652597402597402


In [None]:
# Cell 6: Compare Baseline vs Optimized Models

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


def _score_predictions(y_true, y_pred, y_score):
    """Return accuracy, F1 and ROC AUC for one model's predictions.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted class labels (used for accuracy and F1).
    y_score : scores/probabilities handed to ``roc_auc_score``.

    Returns
    -------
    dict with the keys "accuracy", "f1" and "roc_auc".
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_score),
    }


# Collect baseline results (from supervised part 2.4 using baseline_preds dict).
# baseline_preds is NOT created anywhere in this notebook. Each entry is read as
# [0] -> predicted labels and [1] -> scores for ROC AUC.
_baseline_labels = {
    "rf": "Random Forest (base)",
    "svm": "SVM (base)",
    "lr": "LogReg (base)",
    "dt": "Decision Tree (base)",
}
_baseline_source = globals().get("baseline_preds")
if _baseline_source is None:
    # On a fresh kernel the baseline predictions do not exist; report that
    # clearly and still compare the tuned models instead of raising NameError.
    print("Warning: baseline_preds is not defined; baseline rows are omitted from the comparison.")
    baseline_results = {}
else:
    baseline_results = {
        label: _score_predictions(y_test, _baseline_source[key][0], _baseline_source[key][1])
        for key, label in _baseline_labels.items()
    }

# Collect optimized results (from hyperparameter tuning).
_tuned_models = {
    "Random Forest (opt)": rf_best,
    "SVM (opt)": svm_best,
    "LogReg (opt)": lr_best,
    "Decision Tree (opt)": dt_best,
}
optimized_results = {
    label: _score_predictions(
        y_test,
        model.predict(X_test),                # computed once, shared by accuracy and F1
        model.predict_proba(X_test)[:, 1],    # second probability column only
    )
    for label, model in _tuned_models.items()
}

# Merge into one table: one row per model, one column per metric.
all_results = {**baseline_results, **optimized_results}
_metrics_raw = pd.DataFrame(all_results).T

# Balanced score = mean of Accuracy, F1, ROC AUC, computed on the unrounded
# metrics so rounding cannot influence the ranking; rounding is for display only.
_balanced_raw = _metrics_raw[["accuracy", "f1", "roc_auc"]].mean(axis=1)
results_df = _metrics_raw.assign(balanced_score=_balanced_raw).round(3)

print("\n=== Model Comparison ===")
print(results_df)

# Identify best model by balanced score.
# NOTE(review): every metric here is measured on the test split, so picking the
# winner from this table makes the reported test scores optimistic for the
# selected model; consider ranking by the cross-validated scores instead.
best_model = _balanced_raw.idxmax()
print("\nBest Model (Balanced Score):", best_model)



=== Model Comparison ===
                      accuracy     f1  roc_auc  balanced_score
Random Forest (base)     0.803  0.800    0.901        0.834667
SVM (base)               0.820  0.820    0.929        0.856333
LogReg (base)            0.820  0.807    0.919        0.848667
Decision Tree (base)     0.770  0.774    0.840        0.794667
Random Forest (opt)      0.836  0.839    0.920        0.865000
SVM (opt)                0.869  0.867    0.925        0.887000
LogReg (opt)             0.820  0.807    0.920        0.849000
Decision Tree (opt)      0.803  0.778    0.865        0.815333

Best Model (Balanced Score): SVM (opt)


The base SVM achieved the highest ROC AUC (0.929), indicating slightly stronger class separation. However, the optimized SVM achieved higher accuracy (0.869) and F1-score (0.867), making it more effective at practical classification. Therefore, we selected the optimized SVM as the final model for deployment.