Skip to content

Commit

Permalink
add SMAC optimizer
Browse files Browse the repository at this point in the history
  • Loading branch information
EdenWuyifan committed May 6, 2024
1 parent 98ddbaa commit 1904bf3
Show file tree
Hide file tree
Showing 4 changed files with 434 additions and 5 deletions.
23 changes: 19 additions & 4 deletions alpha_automl/automl_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted
from alpha_automl.automl_manager import AutoMLManager
from alpha_automl.scorer import make_scorer, make_splitter, make_str_metric, get_sign_sorting
from alpha_automl.scorer import make_scorer, make_splitter, make_str_metric, get_sign_sorting, score_pipeline
from alpha_automl.utils import make_d3m_pipelines, hide_logs, get_start_method, check_input_for_multiprocessing, \
setup_output_folder, SemiSupervisedSplitter, SemiSupervisedLabelEncoder, write_pipeline_code_as_pyfile
from alpha_automl.visualization import plot_comparison_pipelines
from alpha_automl.pipeline_serializer import PipelineSerializer
from alpha_automl.hyperparameter_tuning.smac import SmacOptimizer

logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, format='%(levelname)s|%(asctime)s|%(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
Expand All @@ -24,7 +25,7 @@ class BaseAutoML():

def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bound_run=5, task=None,
score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None,
checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO):
checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO, optimizing=False):
"""
Create/instantiate an BaseAutoML object.
Expand Down Expand Up @@ -70,6 +71,8 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo
self.label_encoder = None
self.task_type = task

self.optimizing = optimizing

def fit(self, X, y):
"""
Search for pipelines and fit the best pipeline.
Expand Down Expand Up @@ -103,10 +106,21 @@ def fit(self, X, y):
sign = get_sign_sorting(self.scorer._score_func, self.score_sorting)
sorted_pipelines = sorted(pipelines, key=lambda x: x.get_score() * sign, reverse=True)

# [SMAC] added here!!
if self.optimizing:
optimizer = SmacOptimizer(X=X, y=y, splitter=self.splitter, scorer=self.scorer, n_trials=200)

leaderboard_data = []
for index, pipeline in enumerate(sorted_pipelines, start=1):
pipeline_id = PIPELINE_PREFIX + str(index)
self.pipelines[pipeline_id] = pipeline
# [SMAC] added here!!
if self.optimizing and index <= 10:
opt_pipeline = optimizer.optimize_pipeline(pipeline.get_pipeline())
opt_score, _, _ = score_pipeline(opt_pipeline, X, y, self.scorer, self.splitter)
logger.critical(f'[SMAC] {pipeline_id} successfully optimized: {pipeline.get_score()} => {opt_score}')
pipeline.set_pipeline(opt_pipeline)
pipeline.set_score(opt_score)
leaderboard_data.append([index, pipeline.get_summary(), pipeline.get_score()])

self.leaderboard = pd.DataFrame(leaderboard_data, columns=['ranking', 'pipeline', self.metric])
Expand Down Expand Up @@ -299,7 +313,7 @@ class ClassifierBaseAutoML(BaseAutoML):

def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5, task=None,
score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None,
checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO):
checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO, optimizing=False):
"""
Create/instantiate an AutoMLClassifier object.
Expand All @@ -322,7 +336,8 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo
"""

super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
split_strategy_kwargs, output_folder, checkpoints_folder, num_cpus, start_mode, verbose)
split_strategy_kwargs, output_folder, checkpoints_folder, num_cpus, start_mode,
verbose, optimizing)

self.label_encoder = LabelEncoder()

Expand Down
168 changes: 168 additions & 0 deletions alpha_automl/hyperparameter_tuning/smac.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import json
import logging
from os.path import dirname, join

import numpy as np
from ConfigSpace import (
Categorical,
Configuration,
ConfigurationSpace,
Constant,
Float,
Integer,
)
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from smac import HyperparameterOptimizationFacade, Scenario

from alpha_automl.scorer import make_scorer, make_splitter
from alpha_automl.utils import create_object
from alpha_automl.primitive_loader import PRIMITIVE_TYPES

logger = logging.getLogger(__name__)
SMAC_PARAMETERS_PATH = join(dirname(__file__), 'smac_parameters.json')


def load_smac_parameters():
    """Read the SMAC hyperparameter search-space definitions from smac_parameters.json.

    Returns the parsed dict mapping primitive names to their parameter specs.
    """
    with open(SMAC_PARAMETERS_PATH) as json_file:
        parameter_dict = json.load(json_file)
    logger.info('[SMAC] smac_parameters loaded')

    return parameter_dict


SMAC_DICT = load_smac_parameters()


def gen_pipeline(config, pipeline):
    """Rebuild *pipeline* as a new sklearn Pipeline whose steps are re-instantiated
    with the hyperparameter values taken from the SMAC *config*.

    :param config: a SMAC/ConfigSpace Configuration holding one value per hyperparameter
    :param pipeline: the sklearn Pipeline whose steps serve as the template
    :return: a new Pipeline with the same step names and freshly created step objects
    """
    # Start from an empty sklearn Pipeline and append the re-created steps to it.
    new_pipeline = make_pipeline()
    for step_name, step_obj in pipeline.steps:
        step_type = PRIMITIVE_TYPES[step_name]

        if step_type == 'COLUMN_TRANSFORMER':
            # Re-create every inner transformer of the column transformer with its
            # configured hyperparameters, keeping the original name and column index.
            transformers = []
            for trans_name, _, trans_index in step_obj.__dict__['transformers']:
                # trans_name looks like '<primitive name>-<suffix>'; the primitive
                # name before the '-' is the key into the SMAC parameter dict.
                trans_prim_name = trans_name.split('-')[0]
                trans_obj = create_object(trans_prim_name, get_primitive_params(config, trans_prim_name))
                transformers.append((trans_name, trans_obj, trans_index))
            # NOTE(review): this mutates the *original* step object's transformers in
            # place before copying its __dict__ into a new object — confirm the caller
            # does not rely on the template pipeline staying untouched.
            step_obj.__dict__['transformers'] = transformers
            new_pipeline.steps.append([step_name, create_object(step_name, step_obj.__dict__)])
        else:
            # Plain step: instantiate the primitive with its configured parameters.
            new_pipeline.steps.append([step_name, create_object(step_name, get_primitive_params(config, step_name))])

    return new_pipeline


def get_primitive_params(config, step_name):
    """Collect the configured hyperparameter values for one primitive.

    Looks up which parameters *step_name* declares in the SMAC parameter dict and
    reads each one's current value from *config*.
    """
    class_params = {param: config[param] for param in SMAC_DICT[step_name].keys()}
    logger.critical(f'[SMAC] {step_name}: {class_params}')
    return class_params


def gen_configspace(pipeline):
    """Build the ConfigurationSpace covering every tunable primitive in *pipeline*.

    Primitives without an entry in the SMAC parameter dict are skipped (the lookup
    failure is logged and the loop continues with the next step).
    """
    configspace = ConfigurationSpace(seed=0)
    for prim_name, prim_obj in pipeline.steps:
        prim_type = PRIMITIVE_TYPES[prim_name]
        try:
            configspace.add_hyperparameters(cast_primitive(SMAC_DICT[prim_name]))
            if prim_type == 'COLUMN_TRANSFORMER':
                # A column transformer wraps inner primitives; register their
                # hyperparameters as well.
                for trans_name, _, _ in prim_obj.__dict__['transformers']:
                    inner_prim_name = trans_name.split('-')[0]
                    configspace.add_hyperparameters(cast_primitive(SMAC_DICT[inner_prim_name]))
        except Exception as e:
            # Best-effort: a primitive with no (or broken) parameter spec must not
            # abort the whole space construction.
            logger.critical(f'[SMAC] {str(e)}')
    return configspace


def cast_primitive(params):
    """Convert one primitive's parameter-spec dict into ConfigSpace hyperparameters.

    Specs that fail to cast (cast_hyperparameter returns None) are dropped.
    """
    casted = (cast_hyperparameter(name, conf) for name, conf in params.items())
    return [hyperparameter for hyperparameter in casted if hyperparameter is not None]


def cast_hyperparameter(param_name, param_conf):
    """Translate one JSON parameter spec into a ConfigSpace hyperparameter.

    :param param_name: name of the hyperparameter
    :param param_conf: dict with keys 'type', 'value' and 'default' (from smac_parameters.json)
    :return: the ConfigSpace hyperparameter, or None if the spec is malformed or its
        type is unknown
    """
    try:
        param_type = param_conf['type']
        param_value = param_conf['value']
        param_default = param_conf['default']
    except (KeyError, TypeError) as e:
        # Malformed spec (missing key or not a dict): log and skip this parameter
        # instead of aborting the configuration-space construction.
        logger.critical(f'[SMAC] {str(e)}')
        return None

    config_space = None
    if param_type == 'Categorical':
        config_space = Categorical(param_name, param_value, default=param_default)
    elif param_type == 'Integer':
        # 'value' holds the inclusive (min, max) bounds.
        min_value = int(param_value[0])
        max_value = int(param_value[1])
        config_space = Integer(
            param_name, (min_value, max_value), default=param_default
        )
    elif param_type == 'Float':
        min_value = float(param_value[0])
        max_value = float(param_value[1])
        config_space = Float(param_name, (min_value, max_value), default=param_default)
    elif param_type == 'Constant':
        # Constants have a single fixed value; no default is needed.
        config_space = Constant(param_name, param_value)
    else:
        logger.error(f'Unknown param_type {param_type}')

    return config_space


class SmacOptimizer:
    """Tune the hyperparameters of an AlphaAutoML pipeline with SMAC.

    Wraps SMAC's HyperparameterOptimizationFacade: builds a configuration space
    from the pipeline's primitives, evaluates candidate configurations with
    cross-validation, and returns the pipeline rebuilt with the best-found
    hyperparameters.
    """

    def __init__(
        self,
        X=None,
        y=None,
        n_trials=50,
        splitter=None,
        scorer=None,
    ):
        """
        :param X: training features used to evaluate candidate configurations
        :param y: training targets
        :param n_trials: number of SMAC trials to run
        :param splitter: cross-validation splitter; defaults to a holdout splitter
        :param scorer: sklearn scorer; defaults to accuracy
        """
        self.pipeline = None
        self.X = X
        self.y = y
        self.n_trials = n_trials
        # Build the defaults lazily, per instance: the original signature evaluated
        # make_splitter/make_scorer once at import time and shared the resulting
        # objects across every SmacOptimizer (mutable-default antipattern).
        self.splitter = splitter if splitter is not None else make_splitter('holdout')
        self.scorer = scorer if scorer is not None else make_scorer('accuracy_score')

    def train(self, config: Configuration, seed: int = 0) -> float:
        """SMAC objective: cross-validated loss of the pipeline under *config*.

        :raises: propagates any estimator error (error_score='raise').
        """
        pipeline = gen_pipeline(config, self.pipeline)
        scores = cross_val_score(
            pipeline,
            self.X,
            self.y,
            cv=self.splitter,
            scoring=self.scorer,
            error_score='raise',
        )
        # SMAC minimizes, so convert the higher-is-better score into a loss.
        return 1 - np.mean(scores)

    def optimize_pipeline(self, pipeline):
        """Return *pipeline* rebuilt with SMAC's best-found hyperparameters.

        Returns None (after logging) when *pipeline* is None.
        """
        self.pipeline = pipeline
        if self.pipeline is None:
            logger.critical('[SMAC] get_pipeline return None value!')
            return None
        best_config = self._optimize_pipeline(self.pipeline)
        optimized_pipeline = gen_pipeline(best_config, self.pipeline)
        logger.debug(f'[SMAC] {pipeline} successfully optimized!')
        return optimized_pipeline

    def _optimize_pipeline(self, pipeline):
        """Run the SMAC optimization loop and return the incumbent configuration."""
        scenario = Scenario(
            gen_configspace(pipeline), deterministic=True, n_trials=self.n_trials
        )

        smac = HyperparameterOptimizationFacade(scenario, self.train)
        return smac.optimize()

0 comments on commit 1904bf3

Please sign in to comment.