bugfix: smac

VIDA-NYU · May 14, 2024 · d8505a2 · d8505a2
1 parent dc9b7ad
commit d8505a2
Show file tree

Hide file tree

Showing 5 changed files with 80 additions and 43 deletions.
diff --git a/alpha_automl/automl_api.py b/alpha_automl/automl_api.py
@@ -106,18 +106,17 @@ def fit(self, X, y):
         sign = get_sign_sorting(self.scorer._score_func, self.score_sorting)
         sorted_pipelines = sorted(pipelines, key=lambda x: x.get_score() * sign, reverse=True)
 
-        # [SMAC] added here!!
-        if self.optimizing:
-            optimizer = SmacOptimizer(X=X, y=y, splitter=self.splitter, scorer=self.scorer, n_trials=200)
 
         leaderboard_data = []
         for index, pipeline in enumerate(sorted_pipelines, start=1):
             pipeline_id = PIPELINE_PREFIX + str(index)
             self.pipelines[pipeline_id] = pipeline
             # [SMAC] added here!!
-            if self.optimizing and index <= 10:
+            if self.optimizing and index <= 5:
+                optimizer = SmacOptimizer(X=X, y=y, splitter=self.splitter, scorer=self.scorer, n_trials=50)
                 opt_pipeline = optimizer.optimize_pipeline(pipeline.get_pipeline())
-                opt_score, _, _ = score_pipeline(opt_pipeline, X, y, self.scorer, self.splitter)
+                alphaautoml_pipeline = score_pipeline(opt_pipeline, X, y, self.scorer, self.splitter, self.task_type)
+                opt_score = alphaautoml_pipeline.get_score()
                 logger.critical(f'[SMAC] {pipeline_id} successfully optimized: {pipeline.get_score()} => {opt_score}')
                 pipeline.set_pipeline(opt_pipeline)
                 pipeline.set_score(opt_score)

diff --git a/alpha_automl/automl_manager.py b/alpha_automl/automl_manager.py
@@ -67,7 +67,7 @@ def _search_pipelines(self, automl_hyperparams):
 
         found_pipelines = 0
 
-        pipeline_threshold = 20
+        pipeline_threshold = 5
         X, y, _ = sample_dataset(self.X, self.y, SAMPLE_SIZE, self.task)
         while pipelines and found_pipelines < pipeline_threshold:
             pipeline = pipelines.pop()

diff --git a/alpha_automl/hyperparameter_tuning/smac.py b/alpha_automl/hyperparameter_tuning/smac.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import copy
 from os.path import dirname, join
 
 import numpy as np
@@ -10,6 +11,7 @@
     Constant,
     Float,
     Integer,
+
 )
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import make_pipeline
@@ -53,7 +55,8 @@ def gen_pipeline(config, pipeline):
             primitive_object = create_object(step_name, {'estimator': estimator})
             new_pipeline.steps.append([step_name, primitive_object])
         elif step_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or step_type == 'REGRESSION_MULTI_ENSEMBLER':
-            estimators = extract_estimators_smac(step_obj, PRIMITIVE_TYPES)
+            estimators = extract_estimators_smac(step_obj, config)
+            logger.critical(f"[YFW] =========== {config} --- {estimators} ==========")
             primitive_object = create_object(step_name, {'estimators': estimators})
             new_pipeline.steps.append([step_name, primitive_object])
         else:
@@ -64,7 +67,7 @@ def gen_pipeline(config, pipeline):
 
 def extract_estimators_smac(step_obj, config):
     new_estimators = []
-    estimators = step_obj.estimators
+    estimators = copy.deepcopy(step_obj.estimators)
     while estimators:
         estimator_name, estimator_obj = estimators.pop()
         estimator_name_lookup, estimator_name_counter = estimator_name.split('-')
@@ -85,31 +88,41 @@ def get_primitive_params(config, step_name):
 def gen_configspace(pipeline):
     # (from build_configspace) Build Configuration Space which defines all parameters and their ranges
     configspace = ConfigurationSpace(seed=0)
+    all_params = {}
     for primitive, prim_obj in pipeline.steps:
         step_type = PRIMITIVE_TYPES[primitive]
         try:
             params = SMAC_DICT[primitive]
-            configspace.add_hyperparameters(cast_primitive(params))
+            add_params(params, all_params)
             if step_type == 'COLUMN_TRANSFORMER':
                 for trans_name, _, _ in prim_obj.__dict__['transformers']:
                     trans_prim_name = trans_name.split('-')[0]
                     params = SMAC_DICT[trans_prim_name]
-                    configspace.add_hyperparameters(cast_primitive(params))
-            # elif step_type == 'CLASSIFICATION_SINGLE_ENSEMBLER' or step_type == 'REGRESSION_SINGLE_ENSEMBLER':
-            #     estimator_obj = prim_obj.estimator
-            #     for smac_name, smac_params in SMAC_DICT.items():
-            #         if estimator_obj.__class__.__name__ in smac_name:
-            #             configspace.add_hyperparameters(cast_primitive(smac_params))
+                    add_params(params, all_params)
+            elif step_type == 'CLASSIFICATION_SINGLE_ENSEMBLER' or step_type == 'REGRESSION_SINGLE_ENSEMBLER':
+                estimator_obj = prim_obj.estimator
+                for smac_name, params in SMAC_DICT.items():
+                    if estimator_obj.__class__.__name__ == smac_name.split(".")[-1]:
+                        add_params(params, all_params)
             elif step_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or step_type == 'REGRESSION_MULTI_ENSEMBLER':
                 for estimator_name, _ in prim_obj.estimators:
                     estimator_name_lookup, _ = estimator_name.split('-')
                     params = SMAC_DICT[estimator_name_lookup]
-                    configspace.add_hyperparameters(cast_primitive(params))
+                    add_params(params, all_params)
         except Exception as e:
             logger.critical(f'[SMAC] {str(e)}')
+    configspace.add_hyperparameters(cast_primitive(all_params))
     return configspace
 
 
+def add_params(params, all_params):
+    for param_name, param_conf in params.items():
+        if param_name in all_params:
+            pass
+        else:
+            all_params[param_name] = param_conf
+
+
 def cast_primitive(params):
     new_hyperparameters = []
     for name, conf in params.items():
@@ -144,6 +157,8 @@ def cast_hyperparameter(param_name, param_conf):
         config_space = Float(param_name, (min_value, max_value), default=param_default)
     elif param_type == 'Constant':
         config_space = Constant(param_name, param_value)
+    elif param_type == 'Boolean':
+        config_space = Categorical(param_name, param_value, default=param_default)
     else:
         logger.error(f'Unknown param_type {param_type}')
 
@@ -168,31 +183,40 @@ def __init__(
         return
 
     def train(self, config: Configuration, seed: int = 0) -> float:
-        pipeline = gen_pipeline(config, self.pipeline)
+        self.pipeline = gen_pipeline(config, self.pipeline)
+        logger.critical(f"~!~!~!~!~!~!~!~!~!~!~!~!~!~{self.pipeline}~!~!~!~!~!~!~!~!~!~!~!~!~!~")
         scores = cross_val_score(
-            pipeline,
+            self.pipeline,
             self.X,
             self.y,
             cv=self.splitter,
             scoring=self.scorer,
             error_score='raise',
         )
+        logger.critical(f"[WWWWWWWWWWWWWWWW] {self.pipeline} ~~~~~ {scores}")
+
         return 1 - np.mean(scores)
 
     def optimize_pipeline(self, pipeline):
         self.pipeline = pipeline
+        logger.critical(f"????????????????????????????{pipeline}????????????????????????????")
         if self.pipeline is None:
             logger.critical('[SMAC] get_pipeline return None value!')
             return
         optimized_conf = self._optimize_pipeline(self.pipeline)
-        optimized_pipeline = gen_pipeline(optimized_conf, self.pipeline)
-        logger.debug(f'[SMAC] {pipeline} successfully optimized!')
-        return optimized_pipeline
+        logger.critical(f"[YFW] ----------------- {optimized_conf} --- {pipeline}")
+        if optimized_conf:
+            optimized_pipeline = gen_pipeline(optimized_conf, self.pipeline)
+            logger.debug(f'[SMAC] {pipeline} successfully optimized!')
+            return optimized_pipeline
+        else:
+            return self.pipeline
+
 
     def _optimize_pipeline(self, pipeline):
         scenario = Scenario(
             gen_configspace(pipeline), deterministic=True, n_trials=self.n_trials
         )
 
-        smac = HyperparameterOptimizationFacade(scenario, self.train)
+        smac = HyperparameterOptimizationFacade(scenario, self.train, overwrite=True)
         return smac.optimize()
diff --git a/alpha_automl/hyperparameter_tuning/smac_parameters.json b/alpha_automl/hyperparameter_tuning/smac_parameters.json
@@ -27,16 +27,7 @@
             "default": "word"
         }
     },
-    "sklearn.feature_extraction.text.CountVectorizer": {
-        "min_df": {
-            "type": "Float",
-            "value": [
-                0,
-                0.3
-            ],
-            "default": 0.1
-        }
-    },
+    "sklearn.feature_extraction.text.CountVectorizer": {},
     "sklearn.discriminant_analysis.LinearDiscriminantAnalysis": {},
     "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis": {},
     "sklearn.ensemble.BaggingClassifier": {
@@ -135,14 +126,6 @@
                 1024
             ],
             "default": 0.1
-        },
-        "penalty": {
-            "type": "Categorical",
-            "value": [
-                "l2",
-                "l1"
-            ],
-            "default": "l2"
         }
     },
     "sklearn.linear_model.PassiveAggressiveClassifier": {},
@@ -241,14 +224,25 @@
     "sklearn.tree.DecisionTreeClassifier": {},
     "xgboost.XGBClassifier": {},
     "lightgbm.LGBMClassifier": {},
-    "sklearn.ensemble.AdaBoostClassifier": {},
+    "sklearn.ensemble.AdaBoostClassifier": {
+        "algorithm": {
+            "type": "Constant",
+            "value": "SAMME",
+            "default": "SAMME"
+        }
+    },
     "sklearn.ensemble.StackingClassifier": {},
     "sklearn.ensemble.VotingClassifier": {},
     "sklearn.ensemble.AdaBoostRegressor": {},
     "sklearn.ensemble.BaggingRegressor": {},
     "sklearn.ensemble.StackingRegressor": {},
     "sklearn.ensemble.VotingRegressor": {},
     "catboost.CatBoostClassifier": {
+        "logging_level": {
+            "type": "Constant",
+            "value": "Silent",
+            "default": "Silent"
+        },
         "learning_rate": {
             "type": "Float",
             "value": [
@@ -261,7 +255,7 @@
             "type": "Integer",
             "value": [
                 1,
-                16
+                6
             ],
             "default": 6
         },
@@ -290,5 +284,25 @@
             ],
             "default": "None"
         }
+    },
+    "sklearn.impute.SimpleImputer": {
+        "strategy": {
+            "type": "Constant",
+            "value": "most_frequent",
+            "default": "most_frequent"
+        },
+        "keep_empty_features": {
+            "type": "Boolean",
+            "value": [true],
+            "default": true
+        }
+    },
+    "sklearn.preprocessing.OneHotEncoder": {
+        "handle_unknown": {
+            "type": "Constant",
+            "value": "ignore",
+            "default": "ignore"
+        }
     }
+
 }
diff --git a/alpha_automl/pipeline_search/agent_lab.py b/alpha_automl/pipeline_search/agent_lab.py
@@ -22,7 +22,7 @@ def pipeline_search_rllib(game, time_bound, checkpoint_load_folder, checkpoint_s
     num_cpus = int(ray.available_resources()["CPU"])
 
     # load checkpoint or create a new one
-    algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=1)
+    algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=7)
     logger.debug("Create Algo object done")
 
     # train model