This notebook generates shared files required for the figures in the paper.

In [1]:
import os
import pandas as pd

# surrogate_performance.csv
Contains predicted performance of the generated solutions on the task that was left out during optimization and selection.

Columns:
 - task: the task
 - learner: the learner the expression is found for (e.g. knn, svm)
 - expression: the expression for which the score is predicted
 - score: the normalized score as predicted by the surrogate model for the task
 - optimizer: the optimizer used for finding the expression, one of
     - `Symbolic Default`: obtained with the $\mu$ + $\lambda$ symbolic regression including symbolic terminals.
     - `Constant Default`: obtained with the $\mu$ + $\lambda$ symbolic regression without symbolic terminals.
     - `Random Search X`: obtained with random search, but otherwise the same as `Symbolic Default`.
     - `Package Default`: the scikit-learn or mlr package default.
     - `Optimistic Random Search X`: The best test score on the task among X randomly drawn expressions.
     
A note on the difference between `Random Search` and `Optimistic Random Search`: `Random Search` is an estimate of the performance obtained when random search is employed as the optimizer for symbolic expressions. The expression is optimized and selected based on tasks that are *not* the target task. By contrast, `Optimistic Random Search` directly optimizes the configuration on the test task. So `Random Search` finds a *default*, whereas `Optimistic Random Search` simulates optimization on the task.

## Generated Default Surrogate Scores

First we load the generated defaults ...

In [42]:
defaults_directory = "../data/generated_defaults"
directory_map = dict(
    # dirname = (optimizer, constants_only)
    symbolic=("mupluslambda", False),
    constants=("mupluslambda", True),
)

# One record (dict) per generated default found in the `mean_rank` export files.
generated_defaults = []
for dirname, (optimizer, constants) in directory_map.items():
    defaults_subdirectory = os.path.join(defaults_directory, dirname)
    # `os.listdir` returns entries in arbitrary order; sort so that the order of
    # the collected records is reproducible between runs and machines.
    for defaults_file in sorted(os.listdir(defaults_subdirectory)):
        if "mean_rank" not in defaults_file:
            continue

        with open(os.path.join(defaults_subdirectory, defaults_file), "r") as fh:
            lines = fh.readlines()

        # The first line of each file is skipped (treated as a header).
        for line in lines[1:]:
            # Strip only the line terminator: slicing off the last character would
            # eat part of the expression when the final line has no trailing newline.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file

            # The expression itself contains commas, so split at most twice.
            learner, task, expression = line.split(',', 2)
            generated_defaults.append(dict(
                task=task,
                learner=learner,
                optimizer=optimizer,
                constants=constants,
                expression=expression[1:-1],  # expression was exported with quotes
            ))

We could recompile the expressions and query the surrogates to obtain the scores. However, this is complicated to do for all algorithms in the same script due to some `DEAP` limitations. For that reason, we simply look up the recorded test performance from the run files.

In [43]:
main_directory = "../run"
# Collect every run directory, laid out as <main_directory>/<subdir>/<rundir>.
# Non-directory entries are skipped on both levels so that stray files do not end
# up being treated as runs. Listings are sorted because `os.listdir` returns
# entries in arbitrary order, which would make the iteration order of the runs
# differ between machines.
run_directories = [
    os.path.join(main_directory, subdir, rundir)
    for subdir in sorted(os.listdir(main_directory))
    if os.path.isdir(os.path.join(main_directory, subdir))
    for rundir in sorted(os.listdir(os.path.join(main_directory, subdir)))
    if os.path.isdir(os.path.join(main_directory, subdir, rundir))
]

In [44]:
runs = []  # not populated by this cell; kept so the name stays defined

# Attach a `surrogate_score` to every generated default by looking the expression
# up in the `final_pareto.csv` of a run with matching conditions.
for run_directory in run_directories:
    with open(os.path.join(run_directory, "metadata.csv"), "r") as fh:
        lines = fh.readlines()
    # After the first (skipped) line, each metadata line is a `key;value` pair.
    metadata = dict(
        line.rstrip('\n').split(';')
        for line in lines[1:]
        if line.strip()  # tolerate blank lines
    )
    if metadata['aggregate'] != 'mean':
        continue

    optimizer = metadata['algorithm']
    constants = (metadata['constants_only'] == 'True')
    learner = metadata['problem'][len('mlr_'):]

    # run conditions don't matter for the score of the expression on the test set,
    # but we can avoid loading a bunch of `final_pareto` files which likely don't have
    # the expression we are looking for this way.
    pending_defaults = [
        default for default in generated_defaults
        if 'surrogate_score' not in default
        and default['optimizer'] == optimizer
        and default['constants'] == constants
        and default['learner'] == learner
    ]
    if not pending_defaults:
        continue

    # Read the file once per run instead of once per pending default.
    with open(os.path.join(run_directory, "final_pareto.csv"), "r") as fh:
        # Strip only the line terminator; dropping the last character blindly
        # would truncate the last field of an unterminated final line.
        pareto_lines = [line.rstrip('\n') for line in fh]

    for default in pending_defaults:
        for line in pareto_lines:
            if default["expression"] in line:
                # third field is the task, fourth field is the score
                _, _, task, score, *_ = line.split(';')
                if default["task"] == task:
                    # the score is stored as read from the file (a string)
                    default["surrogate_score"] = score

In [45]:
# Gather the generated defaults that did not receive a `surrogate_score`.
missing_records = list(
    filter(lambda record: "surrogate_score" not in record, generated_defaults)
)
print(f"Missing {len(missing_records)} surrogate performance estimates.")

Missing 0 surrogate performance estimates.


In [46]:
# One row per generated default; the dict keys of the records become the columns.
surrogate_performance = pd.DataFrame(generated_defaults)
surrogate_performance.sample(n=5)

Unnamed: 0,task,learner,optimizer,constants,expression,surrogate_score
896,145681,rpart,mupluslambda,True,"make_tuple(truediv(0.001027825260946386, 2), 9...",0.8858
1159,3021,xgboost,mupluslambda,True,"make_tuple(332, 0.19258991025238412, 7, 0.0021...",0.9988
809,3917,knn,mupluslambda,True,"make_tuple(add(35, 0.010976039052383521), 53, ...",0.9906
172,9956,knn,mupluslambda,False,"make_tuple(max(mul(if_gt(9, po, 8, mul(xvar, x...",0.978
458,168911,svm,mupluslambda,False,"make_tuple(mul(if_gt(0.5109620985849945, mkd, ...",0.9446


## Implementation Default Surrogate Scores

In [47]:
implementation_default_names = ["sklearn_default", "mlr_default"]
implementation_defaults = []

# We only need implementation performance for (task, learner) pairs which have a
# generated default. `drop_duplicates` keeps the pairs in order of first
# occurrence, so the records are created in a reproducible order; iterating a
# `set` of string tuples instead may yield a different order in every session.
task_learner_pairs = (
    surrogate_performance[["task", "learner"]]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)

for task, learner in task_learner_pairs:
    for name in implementation_default_names:
        # defaults only recorded for some problems
        if name == "sklearn_default" and learner not in ["svm", "glmnet", "xgboost"]:
            continue
        if name == "mlr_default" and learner in ["xgboost"]:
            continue
        # Same record layout as the generated defaults; the default's name doubles
        # as its `expression`, which is what the score lookup searches for.
        implementation_defaults.append(dict(
            task=task,
            learner=learner,
            optimizer=name,
            constants=False,
            expression=name,
        ))

In [48]:
# Attach a `surrogate_score` to every implementation default by looking its name
# up in the tail of the `evaluations.csv` of a run for the same learner.
for run_directory in run_directories:
    with open(os.path.join(run_directory, "metadata.csv"), "r") as fh:
        lines = fh.readlines()
    # After the first (skipped) line, each metadata line is a `key;value` pair.
    metadata = dict(
        line.rstrip('\n').split(';')
        for line in lines[1:]
        if line.strip()  # tolerate blank lines
    )
    learner = metadata['problem'][len('mlr_'):]
    if metadata['aggregate'] != 'mean':
        continue

    # Since all runs evaluate defaults regardless of optimization,
    # we don't need as strict filtering as above.
    pending_defaults = [
        default for default in implementation_defaults
        if 'surrogate_score' not in default and default['learner'] == learner
    ]
    if not pending_defaults:
        continue

    # Read the file once per run instead of once per pending default.
    with open(os.path.join(run_directory, "evaluations.csv"), "r") as fh:
        # implementation defaults are reported last, so only the final 100 lines
        # are searched. Strip only the line terminator rather than blindly
        # dropping the last character of each line.
        tail_lines = [line.rstrip('\n') for line in fh.readlines()[-100:]]

    for default in pending_defaults:
        for line in tail_lines:
            if default["expression"] in line:
                # third field is the task, sixth field is the score
                _, _, task, _, _, score, *_ = line.split(';')
                if default["task"] == task:
                    # the score is stored as read from the file (a string)
                    default["surrogate_score"] = score

In [49]:
# Gather the implementation defaults that did not receive a `surrogate_score`.
missing_records = []
for record in implementation_defaults:
    if "surrogate_score" not in record:
        missing_records.append(record)
print(f"Missing {len(missing_records)} surrogate performance estimates.")

Missing 0 surrogate performance estimates.


In [50]:
# One row per implementation default; the dict keys of the records become the columns.
default_performance = pd.DataFrame(implementation_defaults)
default_performance.sample(n=5)

Unnamed: 0,task,learner,optimizer,constants,expression,surrogate_score
810,14970,svm,sklearn_default,False,sklearn_default,0.9798
379,189924,xgboost,sklearn_default,False,sklearn_default,0.9919
177,2074,xgboost,sklearn_default,False,sklearn_default,0.9744
753,167211,rf,mlr_default,False,mlr_default,0.9653
100,168338,glmnet,mlr_default,False,mlr_default,0.4382


In [53]:
# Stack the implementation-default rows on top of the generated-default rows and
# write them to a semicolon-separated file without the index column.
combined_performance = pd.concat([default_performance, surrogate_performance])
combined_performance.to_csv("surrogate_performance.csv", sep=';', index=False)