This notebook generates shared files required for the figures in the paper.

In [1]:
import os
import pandas as pd

# surrogate_performance.csv
Contains predicted performance of the generated solutions on the task that was left out during optimization and selection.

Columns:
 - task: the task
 - learner: the learner the expression is found for (e.g. knn, svm)
 - expression: the expression for which the score is predicted
 - score: the normalized score as predicted by the surrogate model for the task
 - optimizer: the optimizer used for finding the expression, one of
     - `Symbolic Default`: obtained with the $\mu$ + $\lambda$ symbolic regression including symbolic terminals.
     - `Constant Default`: obtained with the $\mu$ + $\lambda$ symbolic regression without symbolic terminals.
     - `Random Search X`: obtained with random search, but otherwise the same as `Symbolic Default`.
     - `Package Default`: the scikit-learn or mlr package default.
     - `Optimistic Random Search X`: The best test score on the task among X randomly drawn expressions.
     
Note on the difference between `Random Search` and `Optimistic Random Search`: `Random Search` is an estimate where random search is employed as the optimizer for symbolic expressions. The expression is optimized and selected based on tasks that are *not* the target task. By contrast, `Optimistic Random Search` directly optimizes the configuration on the test task. So `Random Search` finds a *default*, whereas `Optimistic Random Search` simulates optimization on the task.

## Generated Default Surrogate Scores

First we load the generated defaults ...

In [5]:
# Root folder with one sub-directory per entry of `directory_map`.
defaults_directory = "../data/generated_defaults"
directory_map = dict(
    # dirname = (optimizer, constants_only)
    symbolic=("mupluslambda", False),
    constants=("mupluslambda", True),
)

# One record (dict) per data row found in the "mean_rank" files.
generated_defaults = []
for dirname, (optimizer, constants) in directory_map.items():
    # Sorted so the record order does not depend on the filesystem's listing order.
    for defaults_file in sorted(os.listdir(os.path.join(defaults_directory, dirname))):
        if "mean_rank" not in defaults_file:
            continue

        with open(os.path.join(defaults_directory, dirname, defaults_file), "r") as fh:
            lines = fh.readlines()

        # The first line is skipped (presumably a header row).
        for line in lines[1:]:
            # Remove only the line terminator: slicing off the last character
            # unconditionally would eat the closing quote of a final line that
            # has no trailing newline.
            line = line.rstrip("\n")
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file

            # Split at the first two commas only; the expression itself contains commas.
            learner, task, expression = line.split(',', 2)
            generated_defaults.append(dict(
                task=task,
                learner=learner,
                optimizer=optimizer,
                constants=constants,
                expression=expression[1:-1],  # expression was exported with quotes
            ))

We could recompile the expressions and query the surrogates to obtain the scores. However, this is complicated to do for all algorithms in the same script due to some `DEAP` limitations. For that reason, we simply look up the recorded test performance from the run files.

In [18]:
main_directory = "../run"

# Every directory exactly two levels below `main_directory` is treated as a run
# directory. Both levels are restricted to directories so that stray files do
# not end up in the list, and the result is sorted so that the processing order
# does not depend on the filesystem's listing order.
run_directories = sorted(
    os.path.join(main_directory, subdir, rundir)
    for subdir in os.listdir(main_directory)
    if os.path.isdir(os.path.join(main_directory, subdir))
    for rundir in os.listdir(os.path.join(main_directory, subdir))
    if os.path.isdir(os.path.join(main_directory, subdir, rundir))
)

In [19]:
# NOTE(review): `runs` is not used by any visible cell; kept so the name stays defined.
runs = []

# Attach a `surrogate_score` to each generated default by searching the
# `final_pareto.csv` of runs whose configuration matches the default.
for run_directory in run_directories:
    with open(os.path.join(run_directory, "metadata.csv"), "r") as fh:
        lines = fh.readlines()
    # `key;value` rows after a skipped first line. Only the line terminator is
    # stripped, so a final row without a trailing newline keeps its full value
    # (e.g. 'True' is not cut down to 'Tru'). Blank lines are ignored.
    metadata = dict(
        line.rstrip('\n').split(';') for line in lines[1:] if line.strip()
    )

    optimizer = metadata['algorithm']
    constants = (metadata['constants_only'] == 'True')
    # Drops the first four characters, assuming the problem name starts with 'mlr_'.
    learner = metadata['problem'][len('mlr_'):]

    # Rows of this run's `final_pareto.csv`; read lazily and at most once per
    # run directory instead of once per matching default.
    pareto_rows = None

    for default in generated_defaults:
        if 'surrogate_score' in default:
            continue

        # run conditions don't matter for the score of the expression on the test set,
        # but we can avoid loading a bunch of `final_pareto` files which likely don't have
        # the expression we are looking for this way.
        different_optimizer = default['optimizer'] != optimizer
        different_constant_constraint = default['constants'] != constants
        different_learner = default['learner'] != learner
        if different_optimizer or different_constant_constraint or different_learner:
            continue

        if pareto_rows is None:
            with open(os.path.join(run_directory, "final_pareto.csv"), "r") as fh:
                pareto_rows = [row.rstrip('\n') for row in fh]

        # No early exit: if several rows match, the last one in the file wins.
        for row in pareto_rows:
            if default["expression"] in row:
                # Third and fourth `;`-separated fields are read as task and score.
                _, _, task, score, *_ = row.split(';')
                if default["task"] == task:
                    # NOTE(review): the score is stored as the raw string from the file.
                    default["surrogate_score"] = score

In [29]:
# Turn the list of per-default records into a table with one column per record key.
surrogate_performance = pd.DataFrame(generated_defaults)
# Peek at five randomly chosen rows.
surrogate_performance.sample(n=5)

Unnamed: 0,task,learner,optimizer,constants,expression,surrogate_score
816,168910,knn,mupluslambda,True,"make_tuple(add(0.05338802724338258, 35), 55, t...",0.9986
667,146820,glmnet,mupluslambda,True,"make_tuple(0.6306869733121467, 0.0018349312502...",0.6961
143,14969,knn,mupluslambda,False,"make_tuple(truediv(n, mul(76, if_gt(add(mcp, 0...",0.9995
1070,14954,svm,mupluslambda,True,"make_tuple(add(322, 0.07875742077160208), 0.00...",0.9193
358,168908,rpart,mupluslambda,False,"make_tuple(mul(if_gt(po, n, m, max(0.243160930...",0.987


## Implementation Default Surrogate Scores

{('10090', 'glmnet'),
 ('10090', 'knn'),
 ('10090', 'rf'),
 ('10090', 'rpart'),
 ('10090', 'svm'),
 ('10090', 'xgboost'),
 ('10093', 'glmnet'),
 ('10093', 'knn'),
 ('10093', 'rf'),
 ('10093', 'rpart'),
 ('10093', 'svm'),
 ('10093', 'xgboost'),
 ('10101', 'glmnet'),
 ('10101', 'knn'),
 ('10101', 'rf'),
 ('10101', 'rpart'),
 ('10101', 'svm'),
 ('10101', 'xgboost'),
 ('10106', 'rf'),
 ('10106', 'svm'),
 ('10106', 'xgboost'),
 ('11', 'glmnet'),
 ('11', 'knn'),
 ('11', 'rf'),
 ('11', 'rpart'),
 ('11', 'svm'),
 ('11', 'xgboost'),
 ('12', 'glmnet'),
 ('12', 'knn'),
 ('12', 'rf'),
 ('12', 'rpart'),
 ('12', 'svm'),
 ('12', 'xgboost'),
 ('125920', 'glmnet'),
 ('125920', 'knn'),
 ('125920', 'rpart'),
 ('125920', 'svm'),
 ('125920', 'xgboost'),
 ('125921', 'glmnet'),
 ('125921', 'knn'),
 ('125921', 'rf'),
 ('125921', 'rpart'),
 ('125921', 'svm'),
 ('125921', 'xgboost'),
 ('125922', 'glmnet'),
 ('125922', 'knn'),
 ('125922', 'rf'),
 ('125922', 'rpart'),
 ('125922', 'svm'),
 ('125922', 'xgboost'),
 

In [None]:
# Labels for the implementation defaults; each label is used as both the
# `optimizer` and the `expression` value of its records.
implementation_default_names = ["sklearn_default", "mlr_default"]
implementation_defaults = []

# One record per label for every distinct (task, learner) pair in the table.
task_learner_pairs = set(zip(surrogate_performance.task, surrogate_performance.learner))
for task, learner in task_learner_pairs:
    for name in implementation_default_names:
        record = {
            "task": task,
            "learner": learner,
            "optimizer": name,
            "constants": False,
            "expression": name,
        }
        implementation_defaults.append(record)


In [None]:
# NOTE(review): this cell iterates `generated_defaults`, although the cell above
# builds `implementation_defaults`. It looks like an unfinished copy of the
# earlier lookup cell; confirm which list is meant and how implementation
# defaults are identified in the run files before relying on it.
for run_directory in run_directories:
    with open(os.path.join(run_directory, "metadata.csv"), "r") as fh:
        lines = fh.readlines()
    # `key;value` rows after a skipped first line. Only the line terminator is
    # stripped, so a final row without a trailing newline keeps its full value
    # (e.g. 'True' is not cut down to 'Tru'). Blank lines are ignored.
    metadata = dict(
        line.rstrip('\n').split(';') for line in lines[1:] if line.strip()
    )

    optimizer = metadata['algorithm']
    constants = (metadata['constants_only'] == 'True')
    # Drops the first four characters, assuming the problem name starts with 'mlr_'.
    learner = metadata['problem'][len('mlr_'):]

    # Rows of this run's `final_pareto.csv`; read lazily and at most once per
    # run directory instead of once per matching default.
    pareto_rows = None

    for default in generated_defaults:
        if 'surrogate_score' in default:
            continue

        # run conditions don't matter for the score of the expression on the test set,
        # but we can avoid loading a bunch of `final_pareto` files which likely don't have
        # the expression we are looking for this way.
        different_optimizer = default['optimizer'] != optimizer
        different_constant_constraint = default['constants'] != constants
        different_learner = default['learner'] != learner
        if different_optimizer or different_constant_constraint or different_learner:
            continue

        if pareto_rows is None:
            with open(os.path.join(run_directory, "final_pareto.csv"), "r") as fh:
                pareto_rows = [row.rstrip('\n') for row in fh]

        # No early exit: if several rows match, the last one in the file wins.
        for row in pareto_rows:
            if default["expression"] in row:
                # Third and fourth `;`-separated fields are read as task and score.
                _, _, task, score, *_ = row.split(';')
                if default["task"] == task:
                    # NOTE(review): the score is stored as the raw string from the file.
                    default["surrogate_score"] = score

In [None]:
# Display the table that was built from `generated_defaults`.
surrogate_performance