In [1]:
import pandas as pd
import altair as alt

### Comparing AutoML systems

Benchmark results are taken from the [autogluon-benchmarking repository](https://github.com/Innixma/autogluon-benchmarking/blob/fd1f9234a36e7397ae2d8898096366683ba73a41/data/results/input/raw/results_automlbenchmark_1h.csv) (file `results_automlbenchmark_1h.csv`).

In [2]:
# Load the raw 1-hour benchmark results; the preview below shows one row per fold.
orig = pd.read_csv("resource/automlbenchmark_1h.csv")
orig.head(n=5)

Unnamed: 0,acc,auc,duration,fold,framework,id,info,logloss,mode,models,models_ensemble,params,predict_duration,result,seed,tag,task,utc,version
0,0.876766,0.931996,3611.0,0,autogluon,openml.org/t/7592,,,local,22.0,22.0,,66.5838,0.931996,3962431498,stable,adult,2020-02-01T05:58:04,0.0.5
1,0.879222,0.928769,3609.6,1,autogluon,openml.org/t/7592,,,local,22.0,22.0,,125.513,0.928769,3962431499,stable,adult,2020-02-01T07:02:58,0.0.5
2,0.87285,0.932247,3610.1,2,autogluon,openml.org/t/7592,,,local,22.0,22.0,,102.623,0.932247,3962431500,stable,adult,2020-02-01T08:07:26,0.0.5
3,0.869984,0.928272,3610.0,3,autogluon,openml.org/t/7592,,,local,22.0,22.0,,92.4997,0.928272,3962431501,stable,adult,2020-02-01T09:11:48,0.0.5
4,0.873874,0.928196,3610.1,4,autogluon,openml.org/t/7592,,,local,22.0,22.0,,79.5752,0.928196,3962431502,stable,adult,2020-02-01T10:15:56,0.0.5


In [3]:
# Mean accuracy per (framework, task), rounded to three decimals.
# Rounding happens while 'acc' is still the only column, i.e. before the
# group keys are moved back into regular columns.
aggregated = (
    orig
    .groupby(['framework', 'id'])
    .agg({'acc': 'mean'})
    .round(3)
    .reset_index()
)
aggregated.head()

Unnamed: 0,framework,id,acc
0,AutoWEKA_benchmark,openml.org/t/10101,0.755
1,AutoWEKA_benchmark,openml.org/t/12,0.977
2,AutoWEKA_benchmark,openml.org/t/146195,0.704
3,AutoWEKA_benchmark,openml.org/t/146212,0.999
4,AutoWEKA_benchmark,openml.org/t/146606,0.607


In [4]:
# Give the columns and the framework identifiers presentation-friendly names.
renamed = aggregated.rename(columns={'id': 'Dataset', 'framework': 'AutoML', 'acc': 'Performance'})

# regex=False: the prefix contains dots, which must be matched literally
# rather than as the regex "any character" wildcard.
renamed['Dataset'] = renamed['Dataset'].str.replace('openml.org/t/', 'task_', regex=False)

# Raw framework identifier -> display name.
automl_display_names = {
    'AutoWEKA_benchmark': 'AutoWEKA',
    'autosklearn_benchmark': 'Auto-Sklearn',
    'TPOT_benchmark': 'TPOT',
    'H2OAutoML_benchmark': 'H2O',
    'autogluon': 'AutoGluon',
}
# Assign the result back to the column. Calling `.replace(..., inplace=True)`
# on `renamed['AutoML']` mutates a column selection (chained assignment),
# which pandas warns about and which leaves `renamed` unchanged when
# Copy-on-Write is active.
renamed['AutoML'] = renamed['AutoML'].replace(automl_display_names)
renamed.head()

Unnamed: 0,AutoML,Dataset,Performance
0,AutoWEKA,task_10101,0.755
1,AutoWEKA,task_12,0.977
2,AutoWEKA,task_146195,0.704
3,AutoWEKA,task_146212,0.999
4,AutoWEKA,task_146606,0.607


In [5]:
# Wide table: one row per dataset, one column per AutoML system.
# Dataset/system combinations without a result are filled with 0.
pivoted = (
    renamed
    .pivot(index='Dataset', columns='AutoML', values='Performance')
    .fillna(0)
)
# Drop the 'AutoML' label that pivot leaves on the column index.
pivoted.columns.name = None
pivoted.head()

Unnamed: 0_level_0,Auto-Sklearn,AutoGluon,AutoWEKA,H2O,TPOT
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
task_10101,0.761,0.782,0.755,0.747,0.779
task_12,0.976,0.981,0.977,0.98,0.97
task_146195,0.838,0.869,0.704,0.865,0.86
task_146212,1.0,1.0,0.999,1.0,1.0
task_146606,0.728,0.733,0.607,0.721,0.723


In [6]:
# The task type (binary or multiclass) and the AlphaD3M scores were added
# to this CSV file by hand.
performances = pd.read_csv("resource/automlbenchmark_1h_average_alphad3m.csv")
performances.head(n=5)

Unnamed: 0,Dataset,Type,Auto-Sklearn,AutoGluon,AutoWEKA,H2O,TPOT,AlphaD3M
0,task_10101,binary,0.761,0.782,0.755,0.747,0.779,0.79
1,task_12,multiclass,0.976,0.981,0.977,0.98,0.97,0.976
2,task_146195,multiclass,0.838,0.869,0.704,0.865,0.86,0.777
3,task_146212,multiclass,1.0,1.0,0.999,1.0,1.0,1.0
4,task_146606,binary,0.728,0.733,0.607,0.721,0.723,0.717


In [7]:
# Reshape to long format (one row per dataset/system pair), which is the
# layout the Altair encodings below expect.
performances = performances.melt(
    id_vars=['Dataset', 'Type'],
    var_name='AutoML',
    value_name='Performance',
)
performances.head()

Unnamed: 0,Dataset,Type,AutoML,Performance
0,task_10101,binary,Auto-Sklearn,0.761
1,task_12,multiclass,Auto-Sklearn,0.976
2,task_146195,multiclass,Auto-Sklearn,0.838
3,task_146212,multiclass,Auto-Sklearn,1.0
4,task_146606,binary,Auto-Sklearn,0.728


In [8]:
def plot_performances(source):
    """Build a dot plot of accuracy per dataset, one point per AutoML system.

    Parameters
    ----------
    source : pandas.DataFrame
        Long-format data providing the fields referenced by the encodings
        below: ``Dataset``, ``Type``, ``AutoML`` and ``Performance``.

    Returns
    -------
    alt.Chart
        Chart with accuracy on the x axis, datasets on the y axis (ordered by
        descending x value), points coloured by AutoML system, and one facet
        row per value of ``Type``.

    Notes
    -----
    No ``.configure_*()`` call is made here: the notebook concatenates the
    returned charts and applies the view configuration to the combined chart,
    since Altair only accepts configuration on the top-level chart.
    """
    # Fixed colour per system so that separately built charts share one legend.
    system_colors = {
        'AutoGluon': '#f7b97c',
        'AutoWEKA': '#f58517',
        'Auto-Sklearn': '#e7ba52',
        'H2O': '#e45857',
        'TPOT': '#d67196',
        'AlphaD3M': '#396cb0',
    }
    color_scale = alt.Scale(
        domain=list(system_colors.keys()),
        range=list(system_colors.values()),
    )

    base = alt.Chart(source, title="").mark_point(filled=True, size=32)

    return base.encode(
        alt.X(
            'Performance:Q',
            title="Accuracy",
            # Accuracies cluster near the top of the range; do not force 0 into view.
            scale=alt.Scale(zero=False),
            axis=alt.Axis(grid=False),
        ),
        alt.Y(
            'Dataset:N',
            title="",
            sort='-x',
            axis=alt.Axis(grid=True),
        ),
        color=alt.Color('AutoML:N', scale=color_scale, legend=alt.Legend(title="AutoML")),
        # The previous row sort referenced a field named 'yield', which is not
        # one of the data fields, so it has been removed.
        row=alt.Row('Type:N', title=""),
    ).properties(
        height=alt.Step(12),
        width=250,
    )

In [9]:
# Panel 1: binary-classification tasks only.
df_binary = performances.loc[performances['Type'] == 'binary']
chart1 = plot_performances(source=df_binary)

In [10]:
# Panel 2: multiclass-classification tasks only.
df_multiclass = performances.loc[performances['Type'] == 'multiclass']
chart2 = plot_performances(source=df_multiclass)

In [11]:
# Place both panels side by side and hide the border around each view.
alt.hconcat(
    chart1,
    chart2,
).configure_view(
    stroke='transparent',
)