# Hyperparameter Tuning for supervised model


In [10]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the full raw dataset (not the selected one)
df = pd.read_csv("../data/heart_disease.csv")

print("Shape:", df.shape)
df.head()

Shape: (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [12]:
X = df.drop("target", axis=1)
y = df["target"]

print("Features shape:", X.shape)
print("Target distribution:", np.bincount(y))

Features shape: (303, 13)
Target distribution: [164 139]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)

Training set: (242, 13)
Testing set: (61, 13)


In [14]:
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=12, random_state=42)),
    ("rf", RandomForestClassifier(random_state=42))
])

In [15]:
param_grid = {
    "rf__n_estimators": [100, 200, 300],
    "rf__max_depth": [None, 5, 10],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4]
}

In [16]:
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print("Grid search complete!")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Grid search complete!
Best parameters: {'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5, 'rf__n_estimators': 300}
Best cross-validation score: 0.8139455782312925


In [17]:
best_pipeline = grid.best_estimator_

y_pred = best_pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {acc:.3f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Test Accuracy: 0.852

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        33
           1       0.83      0.86      0.84        28

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61

Confusion Matrix:
[[28  5]
 [ 4 24]]


In [19]:
joblib.dump(best_pipeline, "../models/heart_disease_pipeline.pkl")
print("Best pipeline saved as heart_disease_pipeline.pkl")

Best pipeline saved as heart_disease_pipeline.pkl


In [20]:
example = {
    "age": 63, "sex": 1, "cp": 3, "trestbps": 145,
    "chol": 233, "fbs": 1, "restecg": 0, "thalach": 150,
    "exang": 0, "oldpeak": 2.3, "slope": 0, "ca": 0, "thal": 1
}

example_df = pd.DataFrame([example])

pred = best_pipeline.predict(example_df)[0]
proba = best_pipeline.predict_proba(example_df)[0][1]

print("Prediction:", pred)
print("Heart disease probability:", f"{proba:.2%}")

Prediction: 0
Heart disease probability: 27.37%


In [23]:
print(grid.best_estimator_.steps)

[('scaler', StandardScaler()), ('pca', PCA(n_components=12, random_state=42)), ('rf', RandomForestClassifier(min_samples_split=5, n_estimators=300, random_state=42))]
