In [43]:
import sys
import pandas as pd
sys.path.append("./src/")
from problem import Problem

We need to determine the best configuration for each task:

In [39]:
def get_best_configurations(experiment_data):
    """Select, for every task, the experiment with the highest ``target``.

    Parameters
    ----------
    experiment_data : pandas.DataFrame
        One row per experiment. Must contain a ``task_id`` column and a
        ``target`` column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``task_id``, indexed by ``task_id``, ordered by
        descending ``target``. The input frame is not modified.
    """
    # A stable sort keeps tied targets in their original order, so the row
    # picked for a task is deterministic (the first one encountered).
    ranked = experiment_data.sort_values(by="target", ascending=False, kind="mergesort")
    # After sorting, the first row of each group is that task's best experiment.
    best_per_task = ranked.groupby("task_id").head(1)
    return best_per_task.set_index("task_id")

We also min-max scale the metadata so that the neighbour distance isn't dominated by features with a large range, e.g. `n` or `p`:

In [41]:
def min_max_scaler(column):
    """Linearly rescale ``column`` so its smallest value is 0 and its largest is 1.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale. Missing values are ignored when determining
        the minimum and maximum and stay missing in the result.

    Returns
    -------
    pandas.Series
        The rescaled values. A column whose values are all identical is mapped
        to 0.0 rather than to NaN (0 / 0).
    """
    # Series.min()/max() skip NaN; the built-in min()/max() give results that
    # depend on where a NaN happens to sit in the column.
    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # Constant column: every value sits at the minimum, avoid dividing by zero.
        return (column - lowest).astype(float)
    return (column - lowest) / value_range

Finally, we need to be able to look up the closest dataset based on the metadata and return its best recorded configuration:

In [42]:
def L1_distance(row_one, row_two):
    """Return the L1 distance between two rows.

    The distance is the sum of the absolute element-wise differences
    between ``row_one`` and ``row_two``.
    """
    absolute_gaps = abs(row_one - row_two)
    total = 0
    for gap in absolute_gaps:
        total = total + gap
    return total

def find_nearest_neighbour(task, metadata, distance='L1'):
    """Return the index label of the row in ``metadata`` closest to ``task``.

    Parameters
    ----------
    task : hashable
        Index label of the query row in ``metadata``.
    metadata : pandas.DataFrame
        One row per task; the row labelled ``task`` is excluded from the
        candidates.
    distance : str or callable, default 'L1'
        ``'L1'`` uses ``L1_distance``. A callable is invoked as
        ``distance(candidate_row, task_row)`` and must return a number.

    Returns
    -------
    hashable
        Index label of the candidate row with the smallest distance.

    Raises
    ------
    ValueError
        If ``distance`` is neither ``'L1'`` nor a callable, or if ``metadata``
        holds no row other than ``task``.
    """
    # Resolve the metric up front so an unsupported name fails loudly instead
    # of silently falling back to L1.
    if callable(distance):
        distance_fn = distance
    elif distance == 'L1':
        distance_fn = L1_distance
    else:
        raise ValueError(
            "Unsupported distance {!r}; expected 'L1' or a callable.".format(distance)
        )

    task_row = metadata.loc[task]
    other_tasks = metadata[metadata.index != task]
    if len(other_tasks) == 0:
        raise ValueError(
            "No other tasks to compare against task {!r}.".format(task)
        )
    distances = other_tasks.apply(lambda row: distance_fn(row, task_row), axis=1)
    return distances.idxmin()
    
def find_best_experiment_by_nn(task, metadata, experimentdata):
    """Return the experiment row of the task nearest to ``task`` in metadata space.

    Parameters
    ----------
    task : hashable
        Index label of the query task in ``metadata``.
    metadata : pandas.DataFrame
        One row per task, indexed by task label.
    experimentdata : pandas.DataFrame
        Indexed by task label; the row of the nearest task is returned.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        ``experimentdata.loc[closest_task]`` for the nearest task that has an
        entry in ``experimentdata``.

    Raises
    ------
    KeyError
        If no task other than ``task`` has an entry in ``experimentdata``.
    """
    # Only tasks with a recorded experiment can donate a configuration;
    # otherwise the final lookup would fail whenever the overall nearest task
    # happens to have no experiments.
    has_experiment = metadata.index.isin(experimentdata.index)
    is_query_task = metadata.index == task
    if not (has_experiment & ~is_query_task).any():
        raise KeyError(
            "No task other than {!r} has recorded experiments.".format(task)
        )
    # The query task's own row must stay in the frame so its metadata can be
    # looked up; find_nearest_neighbour excludes it from the candidates itself.
    candidates = metadata[has_experiment | is_query_task]
    closest_task = find_nearest_neighbour(task, candidates, distance='L1')
    return experimentdata.loc[closest_task]
    

In [44]:
# NOTE(review): within this notebook `best_configurations` is only assigned in
# a later cell (from get_best_configurations), so this display cell relies on
# out-of-order execution and raises NameError on a fresh top-to-bottom run.
best_configurations

Unnamed: 0_level_0,dataset,learner,perf.mmce,target,traintime,predicttime,kernel,cost,gamma,tolerance,shrinking,num.impute.selected.cpo,fitted,degree
task_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
24.0,mushroom,classif.svm.radial,0.000000,-0.000169,1543.771,8.679,radial,2259.39000,1.238730e+00,0.005877,FALSE,impute.hist,FALSE,
10093.0,banknote.authentication,classif.svm,0.000000,-0.001885,2.862,0.288,radial,1.49997,3.442950e+00,0.001630,TRUE,impute.hist,FALSE,
3493.0,monks.problems.2,classif.svm.radial,0.000000,-0.002833,9.437,0.332,radial,76.00760,1.234450e-01,0.000490,FALSE,impute.mean,FALSE,
49.0,tic.tac.toe,classif.svm.radial,0.000000,-0.003622,5.467,0.351,radial,2233.58000,1.203810e-02,0.001001,FALSE,impute.hist,FALSE,
146212.0,shuttle,classif.svm.radial,0.001207,-0.004308,289.184,2.247,radial,75.00980,4.653870e-01,0.063778,FALSE,impute.median,FALSE,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10090.0,amazon.commerce.reviews,classif.svm.radial,0.189333,-1.209355,3560.513,110.128,radial,387.48300,7.598970e-07,0.002526,TRUE,impute.median,FALSE,
3560.0,analcatdata_dmft,classif.svm,0.785396,-1.755373,7.389,0.370,radial,1580.76000,4.210380e-03,0.002229,TRUE,impute.mean,FALSE,
9956.0,one.hundred.plants.texture,classif.svm,0.144454,-1.807794,28.003,2.028,radial,186.77300,9.059910e-04,0.000997,TRUE,impute.median,FALSE,
189924.0,video.game.sales,classif.svm.radial,0.723160,-2.110253,1024.388,19.507,radial,65.81070,1.340770e-01,0.665741,TRUE,impute.mean,FALSE,


In [48]:
# Load the problem, scale every metadata column and pick each task's best run.
problem = Problem("mlr_svm")
normalized_metadata = problem.metadata.apply(min_max_scaler, axis=0)
best_configurations = get_best_configurations(problem.data)

# Every column that is not bookkeeping or a performance measure is treated as
# a hyperparameter.
NON_HYPERPARAMETER_COLUMNS = (
    'dataset', 'learner', 'perf.mmce', 'target', 'traintime', 'predicttime',
)
hyperparameters = [
    column for column in best_configurations.columns
    if column not in NON_HYPERPARAMETER_COLUMNS
]

# For each task, recommend the hyperparameters of its nearest neighbour's
# best experiment.
recommended_configurations = {
    task: find_best_experiment_by_nn(
        task, normalized_metadata, best_configurations
    )[hyperparameters]
    for task in normalized_metadata.index
}
    


In [52]:
# One row per task, one column per hyperparameter, written next to the notebook.
recommendations_table = pd.DataFrame.from_dict(recommended_configurations, orient='index')
recommendations_table.to_csv("mlr_svm_nn.csv")