## Analysis of OpenML Experiments

In [1]:
import pandas as pd
import altair as alt

In [2]:
# Load the per-dataset scores of every AutoML system, rounded to two decimals
performances = pd.read_csv('resource/openml_results.csv').round(2)
performances.head()

Unnamed: 0,Dataset,Type,Auto-Sklearn,AutoGluon,AutoWEKA,H2O,TPOT,AlphaD3M
0,task_10101,binary,0.76,0.78,0.76,0.75,0.78,0.79
1,task_12,multiclass,0.98,0.98,0.98,0.98,0.97,0.98
2,task_146195,multiclass,0.84,0.87,0.7,0.86,0.86,0.78
3,task_146212,multiclass,1.0,1.0,1.0,1.0,1.0,1.0
4,task_146606,binary,0.73,0.73,0.61,0.72,0.72,0.72


### Calculating Average Rank

In [3]:
def calculate_rank(performances):
    """Rank the AutoML systems on every dataset (rank 1 = highest score).

    Parameters
    ----------
    performances : pandas.DataFrame
        One row per dataset, with a 'Dataset' column, a 'Type' column and one
        score column per AutoML system. A missing score (NaN/None) means the
        system did not produce a pipeline for that dataset.

    Returns
    -------
    pandas.DataFrame
        Rows are the AutoML systems, columns are the dataset names. Ties share
        the smallest rank (method='min'); systems without a score receive the
        worst possible rank, i.e. the number of systems.

    Raises
    ------
    KeyError
        If the 'Dataset' or 'Type' column is missing.
    """
    # Keep only the score columns and make sure they are numeric, so the
    # ranking compares numbers rather than generic Python objects.
    scores = performances.drop(columns=['Dataset', 'Type']).astype(float)
    scores.index = performances['Dataset'].tolist()  # Key the rows by dataset name

    worst_rank = float(scores.shape[1])  # Number of AutoML systems

    # Rank across the systems of each dataset in a single vectorized call;
    # NaN scores stay NaN after ranking and are then given the worst rank.
    ranks_by_dataset = scores.rank(axis=1, ascending=False, method='min')
    ranks_by_dataset = ranks_by_dataset.fillna(worst_rank)

    # Callers expect systems as rows and datasets as columns.
    return ranks_by_dataset.T

In [4]:
# Rank every AutoML system on each dataset (rows: systems, columns: datasets)
ranks = calculate_rank(performances)
ranks

Unnamed: 0,task_10101,task_12,task_146195,task_146212,task_146606,task_146818,task_146821,task_146822,task_146825,task_14965,...,task_31,task_34539,task_3917,task_3945,task_53,task_7592,task_7593,task_9952,task_9977,task_9981
Auto-Sklearn,4.0,1.0,4.0,1.0,1.0,6.0,4.0,3.0,3.0,5.0,...,6.0,1.0,5.0,1.0,5.0,4.0,2.0,4.0,1.0,2.0
AutoGluon,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0
AutoWEKA,4.0,1.0,6.0,1.0,6.0,3.0,6.0,3.0,5.0,5.0,...,5.0,1.0,2.0,1.0,6.0,4.0,6.0,4.0,2.0,6.0
H2O,6.0,1.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,6.0,6.0,3.0,4.0,4.0,3.0,5.0,1.0
TPOT,2.0,6.0,2.0,1.0,3.0,3.0,1.0,3.0,6.0,1.0,...,2.0,1.0,2.0,1.0,2.0,2.0,5.0,1.0,2.0,2.0
AlphaD3M,1.0,1.0,5.0,1.0,3.0,3.0,5.0,3.0,4.0,1.0,...,2.0,1.0,2.0,1.0,3.0,2.0,3.0,4.0,6.0,2.0


In [5]:
# Average over the per-dataset columns only: excluding any existing
# 'average_rank' column keeps this cell idempotent, so re-running it does not
# fold the previous average into the new mean.
dataset_ranks = ranks.drop(columns='average_rank', errors='ignore')
ranks['average_rank'] = dataset_ranks.mean(axis=1).round(3)  # Add a column with average rank
ranks

Unnamed: 0,task_10101,task_12,task_146195,task_146212,task_146606,task_146818,task_146821,task_146822,task_146825,task_14965,...,task_34539,task_3917,task_3945,task_53,task_7592,task_7593,task_9952,task_9977,task_9981,average_rank
Auto-Sklearn,4.0,1.0,4.0,1.0,1.0,6.0,4.0,3.0,3.0,5.0,...,1.0,5.0,1.0,5.0,4.0,2.0,4.0,1.0,2.0,3.051
AutoGluon,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.231
AutoWEKA,4.0,1.0,6.0,1.0,6.0,3.0,6.0,3.0,5.0,5.0,...,1.0,2.0,1.0,6.0,4.0,6.0,4.0,2.0,6.0,4.282
H2O,6.0,1.0,2.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,...,1.0,6.0,6.0,3.0,4.0,4.0,3.0,5.0,1.0,2.744
TPOT,2.0,6.0,2.0,1.0,3.0,3.0,1.0,3.0,6.0,1.0,...,1.0,2.0,1.0,2.0,2.0,5.0,1.0,2.0,2.0,3.077
AlphaD3M,1.0,1.0,5.0,1.0,3.0,3.0,5.0,3.0,4.0,1.0,...,1.0,2.0,1.0,3.0,2.0,3.0,4.0,6.0,2.0,2.795


### Plotting Results

In [6]:
normalize = True

if normalize:
    systems = ['Auto-Sklearn', 'AutoGluon', 'AutoWEKA', 'H2O', 'TPOT', 'AlphaD3M']
    # Scale each dataset (row) by its best score, so the top system gets 1.0
    best_per_dataset = performances[systems].max(axis=1)
    performances[systems] = performances[systems].div(best_per_dataset, axis=0)

performances.head()

Unnamed: 0,Dataset,Type,Auto-Sklearn,AutoGluon,AutoWEKA,H2O,TPOT,AlphaD3M
0,task_10101,binary,0.962025,0.987342,0.962025,0.949367,0.987342,1.0
1,task_12,multiclass,1.0,1.0,1.0,1.0,0.989796,1.0
2,task_146195,multiclass,0.965517,1.0,0.804598,0.988506,0.988506,0.896552
3,task_146212,multiclass,1.0,1.0,1.0,1.0,1.0,1.0
4,task_146606,binary,1.0,1.0,0.835616,0.986301,0.986301,0.986301


In [7]:
# Reshape to long format (one row per dataset/system pair), as expected by Altair
performances = performances.melt(
    id_vars=['Dataset', 'Type'],
    var_name='AutoML',
    value_name='Performance',
)
performances.head()

Unnamed: 0,Dataset,Type,AutoML,Performance
0,task_10101,binary,Auto-Sklearn,0.962025
1,task_12,multiclass,Auto-Sklearn,1.0
2,task_146195,multiclass,Auto-Sklearn,0.965517
3,task_146212,multiclass,Auto-Sklearn,1.0
4,task_146606,binary,Auto-Sklearn,1.0


In [8]:
def plot_performances(source):
    """Build a dot plot of the performance of every AutoML system per dataset.

    Parameters
    ----------
    source : pandas.DataFrame
        Long-format data with the columns 'Dataset', 'Type', 'AutoML' and
        'Performance' (one row per dataset/system pair).

    Returns
    -------
    alt.Chart
        Point chart with performance on the x axis, one line per dataset on
        the y axis (sorted by descending x), one color per AutoML system and
        one facet row per value of 'Type'.
    """
    # Fixed system -> color mapping so that colors are stable across charts.
    domain = ['AutoGluon', 'AutoWEKA', 'Auto-Sklearn', 'H2O', 'TPOT', 'AlphaD3M']
    color_range = ['#f7b97c', '#f58517', '#e7ba52', '#e45857', '#d67196', "#396cb0"]

    return alt.Chart(source, title="").mark_point(filled=True, size=32).encode(
        alt.X(
            'Performance:Q',
            title="Accuracy",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(grid=False)
        ),
        alt.Y(
            'Dataset:N',
            title="",
            sort='-x',
            axis=alt.Axis(grid=True)
        ),
        color=alt.Color('AutoML:N', scale=alt.Scale(domain=domain, range=color_range), legend=alt.Legend(title="AutoML")),
        # The facet rows previously sorted on a 'yield' field that does not
        # exist in this data, so the default ordering is used instead.
        row=alt.Row(
            'Type:N',
            title="",
        )
    ).properties(
        height=alt.Step(12),
        width=250
    )

In [9]:
# Chart restricted to the binary-classification tasks
df_binary = performances.loc[performances['Type'] == 'binary']
chart1 = plot_performances(df_binary)

In [10]:
# Chart restricted to the multiclass-classification tasks
df_multiclass = performances.loc[performances['Type'] == 'multiclass']
chart2 = plot_performances(df_multiclass)

In [11]:
# Place the binary and multiclass charts side by side, without the view border
(chart1 | chart2).configure_view(stroke='transparent')