This notebook determines a symbolic default for each (problem, task) combination from random search results by concatenating the Pareto fronts and picking the expression with the median in-sample performance estimate.

####  Expected file structure.
The working directory of the notebook server should contain:
 - `figures/`
 - `runs/results2/`, in which all the `{problem}_{search}` folders are contained

In [1]:
from typing import Tuple, List
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
import joblib

# Global settings
# Use seaborn's 'ticks' style for plotting.
sns.set_style('ticks')
# On-disk joblib cache rooted at "data/r"; `read_run_logs` is wrapped with `memory.cache()`.
# NOTE(review): cached reads are presumably reused for identical arguments, so logs changed on disk
# may require clearing this cache - confirm against the joblib documentation.
memory = joblib.Memory("data/r", verbose = 0)

In [2]:
class Runlog:
    """
    Read logs for a given problem across several searches.

    :problem: problem name, used to locate the `{problem}_{search}` log folders
    :searches: list of search names to read for this problem
    :logdir: directory that contains the `{problem}_{search}` folders

    Nothing is read on construction; the logs are (re-)read on every access
    of `data` / `trace_data`.
    """
    def __init__(self, problem: str, searches: List, logdir: str):
        self.problem = problem
        self.searches = searches
        self.logdir = logdir

    @staticmethod
    def _stack(frames):
        """
        Concatenate per-search frames; readers return None for a missing log folder,
        those entries are skipped. Returns an empty DataFrame if nothing was read.
        """
        frames = [frame for frame in frames if frame is not None]
        if not frames:
            return pd.DataFrame([])
        # DataFrame.append was removed in pandas 2.0; concat keeps the original row labels.
        return pd.concat(frames)

    @property
    def data(self):
        """
        "evaluations" logs of all searches, stacked into one DataFrame.
        """
        return self._stack([
            read_run_logs(problem=self.problem, search=search, target="evaluations", dir=self.logdir)
            for search in self.searches
        ])

    @property
    def trace_data(self):
        """
        "progress" logs (optimization traces) of all searches, stacked into one DataFrame.
        """
        return self._stack([
            read_trace_logs(problem=self.problem, search=search, dir=self.logdir)
            for search in self.searches
        ])

    def pick_final_expression(self, method = "best", **kwargs):
        """
        Pick one final expression per (run, search) from the final pareto front.

        Only expressions containing a ',' are considered.

        :method: one of
            "shortest"        rows with the minimal `length`
            "relative"        `pick_relative(group, **kwargs)`
            "cheat"           rows with the maximal "out" score
            "scalarize"       `pick_scalarize(group, **kwargs)`
            "shortest_top_n"  `pick_shortest_top_n(group, **kwargs)`
            anything else ("best")  rows with the maximal "in" score
        :kwargs: forwarded to the pick strategy where one is used
        :return: DataFrame with one column per `inout` value; ties may yield several rows per group
        """
        df = self.data
        df = df[df['expression'].str.contains(',')]
        df = get_final_paretofront(df)
        df = df.pivot_table(index=["run", "task", "gen", "length", "problem", "search", "expression"], columns="inout", values="score")
        df = df.reset_index()
        by_run = df.groupby(['run', 'search'])
        if method == "shortest":
            out = df[by_run['length'].transform("min") == df['length']]
        elif method == "relative":
            out = df.loc[[pick_relative(group, **kwargs) for _, group in by_run]]
        elif method == "cheat":
            out = df[by_run['out'].transform("max") == df['out']]
        elif method == "scalarize":
            out = df.loc[[pick_scalarize(group, **kwargs) for _, group in by_run]]
        elif method == "shortest_top_n":
            out = df.loc[[pick_shortest_top_n(group, **kwargs) for _, group in by_run]]
        else:
            # Unknown method names deliberately fall back to "best" (kept for backward compatibility).
            out = df[by_run['in'].transform("max") == df['in']]
        return out

    def get_benchmark_performances(self):
        """
        Load benchmark performances, i.e. the rows whose expression contains no ','.

        :return: de-duplicated DataFrame with columns task, problem, expression, in, out
        """
        # Read the logs once; every access of `self.data` re-reads them.
        df = self.data
        df = df[~df['expression'].str.contains(',')]
        df = df.pivot_table(index=["run", "task","problem", "search", "expression"], columns="inout", values="score")
        df = df.reset_index()
        df = df[["task", "problem", "expression", "in", "out"]]
        return df.drop_duplicates()

### Pick Strategies

def pick_relative(x, eps=0.01, max_steps=1):
    """
    Pick by relative improvement.

    Rows are visited in the order given (NOTE(review): presumably sorted by increasing
    `length` - confirm against the caller). Starting from length 0 and score 0, a row
    becomes the current pick when its "in" score beats the current pick by at least `eps`.
    The scan stops at the first row that is more than `max_steps` longer than the current pick.

    :x: DataFrame with columns "length" and "in"
    :eps: minimal improvement of the "in" score required to accept a longer candidate
    :max_steps: maximal allowed increase in length relative to the current pick
    :return: index label of the picked row; the first row if no row qualifies;
             None only for an empty frame
    """
    if len(x) == 0:
        return None
    if len(x) == 1:
        return x.index.values[0]

    use_ix, length, score = None, 0, 0
    for ix, cand_length, cand_score in zip(x.index, x['length'], x['in']):
        if cand_length - length > max_steps:
            break  # candidates are too big
        if cand_score - score < eps:
            continue  # not enough increase
        use_ix, length, score = ix, cand_length, cand_score

    if use_ix is None:
        # Nothing qualified (e.g. the first row is already too long or scores below eps).
        # Returning None would make the caller's `df.loc[[...]]` fail, so fall back to the first row.
        return x.index.values[0]
    return use_ix

def pick_scalarize(x, b: float):
    """
    Pick by scalarizing length and fitness.

    Every row gets the score `in - b * (length - 1)`; the index label of the
    row with the highest score is returned. The input frame is not modified.

    :x: DataFrame with columns "in" and "length"
    :b: penalty per unit of length beyond 1
    """
    length_penalty = b * (x["length"] - 1)
    return (x["in"] - length_penalty).idxmax()
    
def pick_shortest_top_n(x, n: float):
    """
    Pick the shortest expression among the `n` best in-sample candidates.

    The rows are ranked by descending "in" score, the best `n` are kept and the
    index label of the one with the smallest `length` is returned. Among equally
    short candidates the better-scoring one wins.

    :x: DataFrame with columns "in" and "length"
    :n: number of top candidates to consider (truncated to an integer, must be >= 1)
    :raises ValueError: if `n` is smaller than 1
    """
    top_count = int(n)
    if top_count < 1:
        raise ValueError("n must be at least 1")
    # mergesort is stable, so ties in the score keep their original row order.
    top = x.sort_values("in", ascending=False, kind="mergesort").head(top_count)
    return top["length"].idxmin()


@memory.cache()
def read_run_logs(problem:str, search: str, target: str, dir: str):
    """
    Read all log-files for a given problem x search combination.

    Every sub-folder of `{dir}/{problem}_{search}/` that contains a `{target}.csv`
    (';'-separated) is read and the results are stacked.

    :problem: problem name; "rf" is reported as "random forest" in the `problem` column
    :search: search name; stored in the `search` column, with two special cases:
             "random_search" becomes `random_search_<(gen + 1) * 100>` per row and marks
             every row as `endresult`; "True" is renamed to "constants_only"
    :target: base name of the csv file to read from each run folder
    :dir: directory containing the `{problem}_{search}` folders
    :return: DataFrame of all runs (possibly empty), or None if the log folder does not exist
    """
    log_dir = f"{dir}/{problem}_{search}/"
    if not os.path.isdir(log_dir):
        print(log_dir,'is not path')
        return None

    frames = []
    for run_dir in (os.path.join(log_dir, f) for f in os.listdir(log_dir)):
        csv_path = f"{run_dir}/{target}.csv"
        if not os.path.isfile(csv_path):
            continue
        tmpdf = pd.read_csv(csv_path, sep=";")
        tmpdf['search'] = search
        # Pivot random search, rename constants only
        if search == "random_search":
            # Vectorised form of the per-row label `random_search_<(gen + 1) * 100>`.
            tmpdf['search'] = search + '_' + ((tmpdf['gen'] + 1) * 100).astype(str)
            tmpdf['endresult'] = True
        elif search == "True":
            tmpdf['search'] = "constants_only"
        frames.append(tmpdf)

    # DataFrame.append was removed in pandas 2.0; concat once instead of growing a frame.
    df = pd.concat(frames) if frames else pd.DataFrame([])

    # Rename rf and add problem columns
    if problem == "rf":
        df['problem'] = "random forest"
    else:
        # The column check avoids a KeyError when no csv file was found at all.
        if problem == "glmnet" and "expression" in df.columns:
            df = df[df["expression"] != "sklearn_default"] # ElasticNet sklearn/glmnet implementations don't match
        df['problem'] = problem
    return df
        


def get_final_paretofront(df: pd.DataFrame):
    """
    Keep only the rows flagged in the boolean `endresult` column.
    """
    is_final = df["endresult"]
    return df[is_final]
    
def read_trace_logs(problem:str, search: str, dir: str):
    """
    Read "progress" log-files for a given problem x search combination (Optimization Traces).

    Every sub-folder of `{dir}/{problem}_{search}/` that contains a `progress.csv`
    (';'-separated) is read and the results are stacked.

    :problem: problem name; "rf" is reported as "random forest" in the `problem` column
    :search: search name; "True" is renamed to "constants_only" in the `search` column,
             for "random_search" the `generation` column is rescaled to `(generation + 1) * 100`
    :dir: directory containing the `{problem}_{search}` folders
    :return: DataFrame of all runs (possibly empty), or None if the log folder does not exist
    """
    log_dir = f"{dir}/{problem}_{search}/"
    if not os.path.isdir(log_dir):
        return None

    frames = []
    for run_dir in (os.path.join(log_dir, f) for f in os.listdir(log_dir)):
        csv_path = f"{run_dir}/progress.csv"
        if not os.path.isfile(csv_path):
            continue
        tmpdf = pd.read_csv(csv_path, sep=";")
        tmpdf['search'] = search
        if search == "True":
            tmpdf['search'] = "constants_only"
        elif search == "random_search":
            tmpdf['generation'] = (tmpdf['generation'] + 1)*100
        frames.append(tmpdf)

    # DataFrame.append was removed in pandas 2.0; concat once instead of growing a frame.
    df = pd.concat(frames) if frames else pd.DataFrame([])

    # Rename and add problem
    if problem == "rf":
        df['problem'] = "random forest"
    else:
        df['problem'] = problem
    return df


### Log Collection:
**log** is a dict of all logs in the **logdir** folder (here `runs/results2`), keyed by problem name.

Most of the time we are only interested in a single individual from the pareto-front per log.
We can obtain this via `pick_final_expression(<strategy>)` from each log.

In [28]:
# Collect logs for all problems
# `log` maps each problem name to a Runlog over the listed search strategies.
# Constructing a Runlog only stores its arguments; the files are read when `.data` / `.trace_data` is accessed.
problems = ['svm', 'glmnet', 'rf', 'rpart', 'knn', 'xgboost']
search_strategies = ["random_search"]
log = {}
for problem in problems:
    log[problem] = Runlog(problem, search_strategies, logdir="runs/results2")

### At Random of Best:

In [68]:
# For every problem: take the best in-sample expression(s) of each run of random_search_300,
# then pick one of them at random per task and write the picks to `{problem}_defaults.csv`.
for problem in problems:
    f = log[problem].data
    if f.empty:
        # No logs were found for this problem; skip it instead of failing on missing columns.
        print(f"no data for {problem}")
        continue
    f = f[f.search == "random_search_300"]
    f = f[f.inout == "in"]
    f = f[f["expression"].str.contains(',')]
    # Keep the best-scoring row(s) per run; the string "max" avoids the deprecated builtin callable.
    f = f[f.groupby("run").score.transform("max") == f.score]

    # sample is for a shuffle
    r = f.sample(frac=1).groupby('task').head(1)

    r[["task", "expression"]].to_csv(f"{problem}_defaults.csv", sep=';', index=False)

### Best of Best 

In [98]:
# For every problem: pick, per task, the expression with the best in-sample score across all
# runs of random_search_300, save the picks, and collect their in- and out-of-sample scores.
picked_per_problem = []

for problem in problems:
    if problem == "xgboost":
        print("no data yet")
        continue
    # Filter on only random_search_300 expressions first
    s = log[problem].data
    if s.empty:
        # No logs were found for this problem; skip it instead of failing on missing columns.
        print(f"no data for {problem}")
        continue
    s = s[s.search == "random_search_300"]
    s = s[s["expression"].str.contains(',')]

    # We want to select the best based on in-sample evaluations
    f = s[s.inout == "in"].copy()
    # Best by task (i.e. across runs)
    f = f[f.groupby("task").score.transform("max") == f.score]

    # Multiple runs may have solutions with equivalent performance (expressions may differ)
    # Shuffle the data, then select one at random:
    r = f.sample(frac=1).groupby('task').head(1)

    assert len(r) == f.task.nunique(), "You need as many expressions as tasks"

    # Save the list of expressions to evaluate on real data
    r[["task", "expression"]].to_csv(f"{problem}_defaults.csv", sep=';', index=False)

    # Collect both the in-sample and out-of-sample scores for the picked expression
    # We do this to compare the results to other searches.
    # We don't verify it's from the same run - but that shouldn't matter
    # The performances should be the same so long as its calculated with the same task (held out)
    # `r` holds exactly one (task, expression) pair per task, so the inner merge selects the
    # matching rows of `s` without duplicating them and keeps the column dtypes intact.
    newdf = s.merge(r[["task", "expression"]], on=["task", "expression"], how="inner")
    # Again because multiple runs on the same task may lead to the same expression,
    # we filter out duplicates. This time no need to shuffle as they are duplicates.
    best_per_task = newdf.groupby(["task", "expression", "inout"]).head(1)
    picked_per_problem.append(best_per_task)

# DataFrame.append was removed in pandas 2.0; concatenate the per-problem picks once.
random_search_3000 = pd.concat(picked_per_problem) if picked_per_problem else pd.DataFrame()
random_search_3000.replace("random_search_300", "random_search_3000", inplace=True)
# Make sure the output folder exists before writing.
os.makedirs("results", exist_ok=True)
random_search_3000.to_csv(f"results/random_search_30k.csv", index=False, sep=';')


no data


### Medians

In [40]:
# Median tie-breaking is awkward to express in one go, so each task is handled explicitly:
# print, per task, one svm expression whose in-sample score is the median of that task.
f = log["svm"].data
f = f[f.search == "random_search_300"]
f = f[f.inout == "in"]
f = f[f["expression"].str.contains(',')]
for task in f.task.unique():
    t = f[f.task == task]
    median_score = t.score.median()
    m = t[t.score == median_score]

    if m.empty:
        # Even number of candidates and no row hits the median exactly:
        # fall back to the two rows in the middle of the score ranking.
        half = len(t) // 2
        m = t.sort_values(by="score").iloc[half - 1:half + 1]

    # choose one of the median candidates at random
    chosen = m.sample(1)
    print(task, chosen["expression"].iloc[0])
        

31 make_tuple(min(n, 18), truediv(1, po))
14965 make_tuple(if_gt(if_gt(p, rc, m, m), expit(n), max(50, p), min(mkd, xvar)), truediv(add(xvar, rc), if_gt(m, m, mkd, po)))
3481 make_tuple(if_gt(add(p, p), add(po, xvar), min(po, m), max(248, rc)), truediv(pow(p, 0.11854840792412273), max(po, po)))
3945 make_tuple(if_gt(rc, 0.4075700691169183, n, po), if_gt(p, mkd, mkd, 0.0160577656570737))
167119 make_tuple(mul(n, 0.0021945151062239816), truediv(mkd, xvar))
168909 make_tuple(if_gt(mkd, mcp, po, p), truediv(xvar, po))
3561 make_tuple(max(truediv(p, xvar), expit(mkd)), truediv(expit(182), max(xvar, po)))
9985 make_tuple(max(p, mkd), truediv(xvar, po))
168868 make_tuple(add(p, xvar), truediv(xvar, po))
189927 make_tuple(sub(m, rc), truediv(mkd, xvar))
9970 make_tuple(if_gt(5, xvar, p, mkd), truediv(xvar, po))
167125 make_tuple(min(n, p), truediv(0.7008622223175714, p))
3512 make_tuple(add(183, p), truediv(0.2479426895779738, po))
9956 make_tuple(add(mkd, p), truediv(xvar, po))
9952 make_tupl