In [1]:
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

# Name of the OpenML dataset that the loader below downloads.
# From OpenML: https://www.openml.org/search?type=data&status=active&id=43672
# NOTE(review): the output of this cell lists two active versions for this name
# (ids 43672 and 43682); presumably the loader falls back to one of them by
# default -- confirm which version is actually used.
dataset_name = "Heart-Disease-Dataset-(Comprehensive)"


def get_data_and_scoring_function(dataset_name):
    """Download an OpenML dataset, split it, and build ROC-AUC scoring helpers.

    The data is fetched by name as a feature frame and a target, then split
    into train and test parts with a stratified split (``random_state=0``).

    :param dataset_name: name of the dataset passed to ``fetch_openml``

    :return: tuple ``(X, y, X_train, y_train, get_test_data, scoring_function,
        train_scoring_function)``. ``scoring_function(estimator)`` and
        ``train_scoring_function(estimator)`` return the ROC AUC of the
        estimator on the test and train part respectively;
        ``get_test_data()`` returns ``(X_test, y_test)``.
    """
    X, y = sklearn.datasets.fetch_openml(
        dataset_name, as_frame=True, return_X_y=True
    )
    split = sklearn.model_selection.train_test_split(
        X, y, random_state=0, stratify=y
    )
    X_train, X_test, y_train, y_test = split

    def _roc_auc(estimator, features, labels):
        # Column 1 of predict_proba is used as the score
        # (presumably the positive class -- verify against the label encoding).
        scores = estimator.predict_proba(features)[:, 1]
        return sklearn.metrics.roc_auc_score(labels, scores)

    def scoring_function(estimator):
        return _roc_auc(estimator, X_test, y_test)

    def train_scoring_function(estimator):
        return _roc_auc(estimator, X_train, y_train)

    def get_test_data():
        return X_test, y_test

    return (
        X,
        y,
        X_train,
        y_train,
        get_test_data,
        scoring_function,
        train_scoring_function,
    )


# Download the dataset once and unpack the full data, the training part and the
# helper callables returned by the loader.
(
    X,
    y,
    X_train,
    y_train,
    get_test_data,
    scoring_function,
    train_scoring_function,
) = get_data_and_scoring_function(dataset_name)

# The test part is only exposed through its accessor.
X_test, y_test = get_test_data()

print(f"Done Processing and downloading {dataset_name}")

Done Processing and downloading Heart-Disease-Dataset-(Comprehensive)


- version 1, status: active
  url: https://www.openml.org/search?type=data&id=43672
- version 2, status: active
  url: https://www.openml.org/search?type=data&id=43682



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Column groups: numeric measurements vs. coded categorical attributes.
continuous_features = ["age", "resting_bp_s", "cholesterol", "max_heart_rate", "oldpeak"]
categorical_features = [
    "sex",
    "chest_pain_type",
    "fasting_blood_sugar",
    "resting_ecg",
    "exercise_angina",
    "ST_slope",
]

# Continuous columns are standardized, categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), continuous_features),
        ("cat", OneHotEncoder(), categorical_features),
    ],
)

# Preprocessing followed by a random forest with a fixed seed.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(random_state=42)),
    ],
)

# 80/20 hold-out split of the full data.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test from the first
# cell (a stratified split with random_state=0). The scoring helpers created in
# the first cell are closures over that earlier split, so they do not follow
# this rebinding -- confirm that mixing the two splits is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [3]:
# cost from neg_log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import log_loss, make_scorer
import numpy as np
import numpy as np
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Float, Integer
from ConfigSpace.conditions import EqualsCondition
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import log_loss, make_scorer
import numpy as np


class RandomForestPipeline:
    """Search space and objective for tuning a ``RandomForestClassifier``.

    ``configspace`` describes the hyperparameters to search over and ``train``
    scores a single configuration by 5-fold cross-validation on the
    notebook-level ``X_train`` / ``y_train``.
    """

    def __init__(self, seed: int = 0):
        # Seed handed to the ConfigurationSpace built by ``configspace``.
        self.seed = seed

    @property
    def configspace(self) -> ConfigurationSpace:
        """
        Build the configuration space to optimize over, i.e. the
        hyperparameters of the random forest plus their activation conditions.
        A new space seeded with ``self.seed`` is created on every access.
        """
        space = ConfigurationSpace(seed=self.seed)

        # Hyperparameters, registered in a fixed order.
        # NOTE(review): min_samples_split, max_features and max_samples start
        # at 0.0; scikit-learn presumably requires these fractions to be
        # strictly positive -- confirm an exact 0.0 can never be proposed.
        space.add_hyperparameters(
            [
                Integer("n_estimators", (10, 10000), default=100),
                Categorical(
                    "criterion", ["gini", "entropy", "log_loss"], default="gini"
                ),
                Integer("max_depth", (1, 1000), default=None),
                Float("min_samples_split", (0.0, 1)),
                Integer("min_samples_leaf", (1, 10), default=1),
                Float("min_weight_fraction_leaf", (0.0, 0.1), default=0.0),
                Float("max_features", (0.0, 1)),
                Integer("max_leaf_nodes", (10, 1000), default=None),
                Float("min_impurity_decrease", (0.0, 0.02), default=0.0),
                Categorical("bootstrap", [True, False], default=True),
                Categorical("oob_score", [True, False], default=False),
                Categorical("warm_start", [True, False], default=False),
                Categorical(
                    "class_weight", ["balanced", "balanced_subsample"], default=None
                ),
                Float("ccp_alpha", (0.0, 0.001), default=0.0),
                Float("max_samples", (0.0, 1.0), default=None),
            ]
        )

        # Activation rules as (child, parent, value the parent must equal):
        #   - the OOB score only makes sense when bootstrapping is enabled
        #   - "balanced" / "balanced_subsample" class weights are not
        #     recommended together with warm_start=True
        #   - max_samples is only active when bootstrapping is enabled
        activation_rules = (
            ("oob_score", "bootstrap", True),
            ("class_weight", "warm_start", False),
            ("max_samples", "bootstrap", True),
        )
        conditions = [
            EqualsCondition(
                child=space[child_name], parent=space[parent_name], value=required
            )
            for child_name, parent_name, required in activation_rules
        ]
        for condition in conditions:
            space.add_condition(condition)

        return space

    def train(self, config: Configuration, seed: int = 0) -> float:
        """
        Build a RandomForestClassifier from ``config`` and score it with 5-fold
        cross-validation on ``X_train`` / ``y_train``.

        The folds are scored with "neg_log_loss" and the sign of their mean is
        flipped, so the returned value is the mean log loss (lower is better).
        """
        hyperparameters = dict(config)

        np.random.seed(seed=seed)

        # The entries of the configuration are forwarded as keyword arguments.
        forest = RandomForestClassifier(
            n_jobs=-1,
            random_state=seed,
            **hyperparameters,
        )

        fold_scores = cross_val_score(
            forest, X_train, y_train, cv=5, scoring="neg_log_loss"
        )

        # Negate the mean "neg_log_loss" to obtain a cost to minimize.
        return -np.mean(fold_scores)

In [4]:
# Sanity check of the search space: draw ten random configurations and print them.
rf = RandomForestPipeline()
cs = rf.configspace
# Despite its singular name, `config` is printed below as a list of
# Configuration objects (see the cell output).
config = cs.sample_configuration(10)

print(config)

[Configuration(values={
  'bootstrap': False,
  'ccp_alpha': 0.0007917250380826646,
  'criterion': 'log_loss',
  'max_depth': 265,
  'max_features': 0.359507900573786,
  'max_leaf_nodes': 575,
  'min_impurity_decrease': 0.0031793916729103942,
  'min_samples_leaf': 10,
  'min_samples_split': 0.317983179393976,
  'min_weight_fraction_leaf': 0.03185689524513237,
  'n_estimators': 6782,
  'warm_start': True,
}), Configuration(values={
  'bootstrap': False,
  'ccp_alpha': 0.0005288949197529045,
  'class_weight': 'balanced_subsample',
  'criterion': 'log_loss',
  'max_depth': 775,
  'max_features': 0.43703195379934145,
  'max_leaf_nodes': 444,
  'min_impurity_decrease': 0.0022075028232861026,
  'min_samples_leaf': 5,
  'min_samples_split': 0.41426299451466997,
  'min_weight_fraction_leaf': 0.06674103799636817,
  'n_estimators': 2707,
  'warm_start': False,
}), Configuration(values={
  'bootstrap': False,
  'ccp_alpha': 0.0005680445610939324,
  'class_weight': 'balanced_subsample',
  'criteri

In [5]:
from smac import HyperparameterOptimizationFacade, Scenario

rf = RandomForestPipeline()

seeds = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900]

# Best configuration found for every seed. Without this mapping only the result
# of the last seed would survive the loop, because `incumbent` is rebound on
# each iteration.
incumbents_by_seed = {}

for seed in seeds:  # One independent optimization run per seed
    scenario = Scenario(
        rf.configspace,
        name="RandomForest_HPO_neg_log_loss",
        n_trials=1000,
        # The objective is labelled "neg_log_loss", but the value minimized is
        # what RandomForestPipeline.train returns, i.e. the sign-flipped mean
        # of the "neg_log_loss" cross-validation scores.
        objectives=["neg_log_loss"],
        deterministic=True,
        walltime_limit=60 * 60,  # one hour per seed
        seed=seed,
        n_workers=15,
    )

    smac = HyperparameterOptimizationFacade(
        scenario=scenario,
        target_function=rf.train,  # The function to optimize
        overwrite=True,
    )
    incumbent = smac.optimize()
    incumbents_by_seed[seed] = incumbent
    print(f"Seed: {seed}, Incumbent: {incumbent}")

# After the loop `smac` and `incumbent` refer to the run of the last seed; the
# results of all runs are available in `incumbents_by_seed`.

Perhaps you already have a cluster running?
Hosting the HTTP server on port 35925 instead


[INFO][abstract_initial_design.py:147] Using 150 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][abstract_intensifier.py:515] Added config f50aea as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 02e9c1 and rejected config f50aea as incumbent because it is

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33513 instead


[INFO][abstract_initial_design.py:147] Using 150 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][abstract_intensifier.py:515] Added config 964d82 as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 52886e and rejected config 964d82 as incumbent because it is

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38385 instead


[INFO][abstract_initial_design.py:147] Using 150 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][abstract_intensifier.py:515] Added config bf36ae as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 9ad134 and rejected config bf36ae as incumbent because it is

Perhaps you already have a cluster running?
Hosting the HTTP server on port 32793 instead


[INFO][abstract_initial_design.py:147] Using 150 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][abstract_intensifier.py:515] Added config de8ba4 as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 9d297e and rejected config de8ba4 as incumbent because it is

2024-02-29 04:46:54,521 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/home/skhani/anaconda3/envs/SMAC2/lib/python3.10/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nosplit_nbytes_bin = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/skhani/anaconda3/envs/SMAC2/lib/python3.10/site-packages/distributed/worker.py", line 1252, in heartbeat
    response = await retry_operation(
  File "/home/skhani/anaconda3/envs/SMAC2/lib/python3.10/site-packages/distributed/utils_comm.py", line 455, in retry_operation
    return await retry(
  File "/home/skhani/anaconda3/envs/SMAC2/lib/python3.10/site-packages/distributed/utils_comm.py", line 434, in retry
    return await coro()
  File "/home/skhani/anaconda3/envs/SMAC2/lib/python3.10/site-pa

[INFO][smbo.py:319] Finished 50 trials.
[INFO][smbo.py:319] Finished 100 trials.
[INFO][abstract_intensifier.py:590] Added config b54269 and rejected config 640ce5 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][smbo.py:319] Finished 150 trials.
[INFO][abstract_intensifier.py:590] Added config 21cd9a and rejected config b54269 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][abstract_intensifier.py:590] Added config 77196c and rejected config 21cd9a as incumbent because it is not better than the incumbents on 1 instances:
[INFO][abstract_intensifier.py:590] Added config 896e33 and rejected config 77196c as incumbent because it is not better than the incumbents on 1 instances:
[INFO][abstract_intensifier.py:590] Added config dcbb7d and rejected config 896e33 as incumbent because it is not better than the incumbents on 1 instances:
[INFO][abstract_intensifier.py:590] Added config be01b3 and rejected config dcbb7d as incumbe

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39115 instead


[INFO][abstract_initial_design.py:147] Using 150 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:305] Using only one seed for deterministic scenario.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][smbo.py:319] Finished 0 trials.
[INFO][abstract_intensifier.py:515] Added config b83ccc as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 697687 and rejected config b83ccc as incumbent because it is

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np


def test_score_rf(incumbent, seed):
    """
    Fit a Random Forest configured with the given hyperparameters on the
    notebook-level training split and evaluate it on the test split.

    :param incumbent: mapping of hyperparameter names to values, unpacked as keyword arguments of RandomForestClassifier
    :param seed: int, passed as ``random_state`` of the classifier

    :return: tuple, containing the accuracy of the model on the test set and the classification report
    """
    # Build the model from the configuration and train it in one go.
    model = RandomForestClassifier(**incumbent, random_state=seed).fit(
        X_train, y_train
    )

    # Score the hard class predictions on the held-out data.
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred), classification_report(y_test, y_pred)

In [None]:
# Validate the incumbent configuration with a different seed
incumbent_cost = smac.validate(incumbent, seed=1235)
print(f"Incumbent cost: {incumbent_cost}")
# The cost is the mean cross-validated log loss produced by
# RandomForestPipeline.train, not an error rate, so `1 - cost` is not an
# accuracy. exp(-log loss) is (roughly) the geometric mean of the probability
# the model assigns to the true class, which is on a 0..1 scale.
print(
    "Incumbent geometric-mean probability of the true class: "
    f"{np.exp(-incumbent_cost)}"
)

# Evaluate test score and classification report.
# The report is stored under its own name: binding it to `classification_report`
# would shadow the imported sklearn function and break test_score_rf (and this
# cell) on the next run.
test_score, test_report = test_score_rf(incumbent, seed=1235)
print(f"Incumbent test accuracy: {test_score}")
print("Classification Report:\n", test_report)

In [None]:
# Inspect the run history of the facade left over from the optimization cell.
runhistory = smac.runhistory

print(f"What configurations have been executed:\n {runhistory._config_ids} \n\n")
print(
    "Which config-seed combinations have been executed, did they succeed and what cost did they incur:"
)

# One line per recorded trial.
# NOTE(review): the scenario is created with deterministic=True and the log
# reports "Using only one seed for deterministic scenario", so each
# configuration is presumably evaluated with a single seed rather than
# averaged over several -- confirm.
for key, value in runhistory._data.items():
    print(key, value)