In [43]:
import sys
import pandas as pd
sys.path.append("./src/")
from problem import Problem

We need to determine the best configuration for each task:

In [39]:
def get_best_configurations(experiment_data):
    """Return, for every ``task_id``, the experiment row with the highest ``target``.

    Parameters
    ----------
    experiment_data : pandas.DataFrame
        One row per experiment; must contain ``task_id`` and ``target`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per task, indexed by ``task_id``, ordered by descending ``target``.
    """
    # Rank all experiments from best to worst so the first row seen per task is its best.
    ranked = experiment_data.sort_values(by="target", ascending=False)
    top_per_task = ranked.groupby("task_id", as_index=False).head(1)
    return top_per_task.set_index('task_id')

Next, we scale the metadata so that the neighbour distance isn't dominated by features with a large range, e.g. `n` or `p`:

In [41]:
def min_max_scaler(column):
    """Rescale ``column`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``column``
        ``(column - min) / (max - min)``. A column whose values are all equal
        is returned as all zeros instead of NaN.
    """
    # Use the container's own reductions: for a pandas Series they skip missing
    # values, whereas the builtin min()/max() give order-dependent results
    # when NaN is present.
    lowest = column.min()
    value_range = column.max() - lowest
    shifted = column - lowest
    if value_range == 0:
        # Constant column: dividing would give 0 / 0 = NaN, and a NaN feature
        # makes every distance computed from it NaN. Every entry equals the
        # minimum, so ``shifted`` is already zero; just promote it to float.
        return shifted * 1.0
    return shifted / value_range

Finally, we need to be able to look up the closest dataset based on the metadata and return its best recorded configuration:

In [42]:
def L1_distance(row_one, row_two):
    """Return the L1 (Manhattan) distance between two rows.

    The rows are subtracted element-wise and the absolute differences are
    added up one by one, starting from 0.
    """
    difference = row_one - row_two
    total = 0
    for component in abs(difference):
        total += component
    return total

def find_nearest_neighbour(task, metadata, distance='L1'):
    """Return the index label of the row in ``metadata`` that is closest to ``task``.

    Parameters
    ----------
    task : hashable
        Index label of the query row in ``metadata``.
    metadata : pandas.DataFrame
        Numeric feature table with one row per task.
    distance : {'L1', 'L2'}, default 'L1'
        Metric used to compare rows: 'L1' is the sum of absolute differences,
        'L2' the Euclidean distance.

    Returns
    -------
    Index label of the nearest row other than ``task``.

    Raises
    ------
    ValueError
        If ``distance`` is not a supported metric, or if ``metadata`` has no
        row other than ``task``.
    KeyError
        If ``task`` is not in ``metadata.index``.
    """
    if distance not in ('L1', 'L2'):
        # Fail loudly rather than silently computing a different metric.
        raise ValueError(f"Unsupported distance {distance!r}; expected 'L1' or 'L2'.")

    task_row = metadata.loc[task]
    candidates = metadata[metadata.index != task]
    if len(candidates) == 0:
        raise ValueError(f"No other tasks in metadata to compare task {task!r} against.")

    # DataFrame - Series aligns the Series on the columns, so this subtracts
    # the query row from every candidate row at once.
    differences = candidates - task_row
    # skipna=False keeps a row's distance NaN when any of its features is NaN;
    # idxmin() then ignores that row.
    if distance == 'L1':
        distances = differences.abs().sum(axis=1, skipna=False)
    else:
        distances = (differences ** 2).sum(axis=1, skipna=False) ** 0.5
    return distances.idxmin()
    
def find_best_experiment_by_nn(task, metadata, experimentdata):
    """Return the row of ``experimentdata`` belonging to the nearest neighbour of ``task``.

    The neighbour is found with ``find_nearest_neighbour`` using the L1
    distance over ``metadata``; its index label is then looked up in
    ``experimentdata``.
    """
    return experimentdata.loc[find_nearest_neighbour(task, metadata, distance='L1')]

In [65]:
# Columns of the best-configuration table that are left out of the hyperparameter list.
non_hyperparameter_columns = ('dataset', 'learner', 'perf.mmce', 'target', 'traintime', 'predicttime')

for algorithm in ("svm", "rf", "glmnet", "rpart", "xgboost", "knn"):
    print(f"Finding Nearest Neighbour for {algorithm}")
    problem = Problem(f"mlr_{algorithm}")

    # Scale each metadata column independently, then pick the best run per task.
    normalized_metadata = problem.metadata.apply(min_max_scaler, axis=0)
    best_configurations = get_best_configurations(problem.data)

    # Only tasks that have a best configuration may serve as neighbours.
    has_best_configuration = normalized_metadata.index.isin(best_configurations.index)
    filtered_metadata = normalized_metadata.loc[has_best_configuration]

    hyperparameters = [
        c for c in best_configurations.columns if c not in non_hyperparameter_columns
    ]

    # For every task, recommend the hyperparameters of its nearest neighbour's best run.
    recommended_configurations = {}
    for task in best_configurations.index:
        best_experiment = find_best_experiment_by_nn(task, filtered_metadata, best_configurations)
        configuration = best_experiment[hyperparameters]
        recommended_configurations[task] = configuration

    output_path = f"{algorithm}_nearest_neighbors.csv"
    pd.DataFrame.from_dict(recommended_configurations, orient='index').to_csv(output_path)
    


Finding Nearest Neighbour for svm
Finding Nearest Neighbour for rf
Finding Nearest Neighbour for glmnet
Finding Nearest Neighbour for rpart
Finding Nearest Neighbour for xgboost
Finding Nearest Neighbour for knn


In [61]:
# Tasks that have metadata but no entry in best_configurations.
set(normalized_metadata.index).difference(best_configurations.index)

{15,
 29,
 2079,
 3021,
 3493,
 3903,
 3907,
 3918,
 3945,
 9978,
 14971,
 146818,
 146819,
 168912,
 190412}

In [63]:
# Tasks that have a best configuration but no metadata row.
set(best_configurations.index).difference(normalized_metadata.index)

set()