tmp

VIDA-NYU · May 14, 2024 · ef6ea6a
1 parent 845eac7
commit ef6ea6a
Show file tree

Hide file tree

Showing 3 changed files with 469 additions and 12 deletions.
diff --git a/alpha_automl/automl_api.py b/alpha_automl/automl_api.py
@@ -8,7 +8,7 @@
 from alpha_automl.automl_manager import AutoMLManager
 from alpha_automl.scorer import make_scorer, make_splitter, make_str_metric, get_sign_sorting, score_pipeline
 from alpha_automl.utils import make_d3m_pipelines, hide_logs, get_start_method, check_input_for_multiprocessing, \
-    setup_output_folder, SemiSupervisedSplitter, SemiSupervisedLabelEncoder, write_pipeline_code_as_pyfile
+    setup_output_folder, SemiSupervisedSplitter, SemiSupervisedLabelEncoder, write_pipeline_code_as_pyfile, sample_dataset
 from alpha_automl.visualization import plot_comparison_pipelines
 from alpha_automl.pipeline_serializer import PipelineSerializer
 from alpha_automl.hyperparameter_tuning.smac import SmacOptimizer
@@ -108,18 +108,23 @@ def fit(self, X, y):
 
 
         leaderboard_data = []
+        if self.optimizing:
+            X_sample, y_sample, _ = sample_dataset(X, y, 2000, self.task_type)
+
         for index, pipeline in enumerate(sorted_pipelines, start=1):
             pipeline_id = PIPELINE_PREFIX + str(index)
             self.pipelines[pipeline_id] = pipeline
             # [SMAC] added here!!
             if self.optimizing and index <= 5:
-                optimizer = SmacOptimizer(X=X, y=y, splitter=self.splitter, scorer=self.scorer, n_trials=50)
+                optimizer = SmacOptimizer(X=X_sample, y=y_sample, splitter=self.splitter, scorer=self.scorer, n_trials=100)
                 opt_pipeline = optimizer.optimize_pipeline(pipeline.get_pipeline())
-                alphaautoml_pipeline = score_pipeline(opt_pipeline, X, y, self.scorer, self.splitter, self.task_type)
+                alphaautoml_pipeline = score_pipeline(opt_pipeline, X_sample, y_sample, self.scorer, self.splitter, self.task_type)
+
                 opt_score = alphaautoml_pipeline.get_score()
-                logger.critical(f'[SMAC] {pipeline_id} successfully optimized: {pipeline.get_score()} => {opt_score}')
-                pipeline.set_pipeline(opt_pipeline)
-                pipeline.set_score(opt_score)
+                if opt_score > pipeline.get_score():
+                    logger.critical(f'[SMAC] {pipeline_id} successfully optimized: {pipeline.get_score()} => {opt_score}')
+                    pipeline.set_pipeline(opt_pipeline)
+                    pipeline.set_score(opt_score)
             leaderboard_data.append([index, pipeline.get_summary(), pipeline.get_score()])
 
         self.leaderboard = pd.DataFrame(leaderboard_data, columns=['ranking', 'pipeline', self.metric])

diff --git a/alpha_automl/hyperparameter_tuning/smac.py b/alpha_automl/hyperparameter_tuning/smac.py
@@ -52,6 +52,10 @@ def gen_pipeline(config, pipeline):
             new_pipeline.steps.append([step_name, create_object(step_name, step_obj.__dict__)])
         elif step_type == 'CLASSIFICATION_SINGLE_ENSEMBLER' or step_type == 'REGRESSION_SINGLE_ENSEMBLER':
             estimator = step_obj.estimator
+            estimator_name = estimator.__class__.__name__
+            for smac_name in SMAC_DICT.keys():
+                if estimator_name == smac_name.split(".")[-1]:
+                    estimator = create_object(smac_name, get_primitive_params(config, smac_name))
             primitive_object = create_object(step_name, {'estimator': estimator})
             new_pipeline.steps.append([step_name, primitive_object])
         elif step_type == 'CLASSIFICATION_MULTI_ENSEMBLER' or step_type == 'REGRESSION_MULTI_ENSEMBLER':
@@ -184,7 +188,6 @@ def __init__(
 
     def train(self, config: Configuration, seed: int = 0) -> float:
         self.pipeline = gen_pipeline(config, self.pipeline)
-        logger.critical(f"~!~!~!~!~!~!~!~!~!~!~!~!~!~{self.pipeline}~!~!~!~!~!~!~!~!~!~!~!~!~!~")
         scores = cross_val_score(
             self.pipeline,
             self.X,