## Analysis of OpenML Experiments

In [1]:
import pandas as pd
import altair as alt

In [2]:
# Load the raw benchmark results and reshape them into one row per
# (dataset, task type) with one score column per AutoML framework.
performances = pd.read_csv('results/results.csv')
performances = performances[['id', 'framework', 'type', 'result']]
# NOTE: pivot_table aggregates with the mean by default, so repeated
# (id, type, framework) rows are averaged into a single score.
performances = pd.pivot_table(performances, index=['id', 'type'], columns='framework', values='result')
performances = performances.reset_index()
performances.columns.name = None
performances = performances.rename(columns={'id': 'Dataset', 'type': 'Type', 'H2OAutoML': 'H2O', 'autosklearn': 'Auto-Sklearn'})
# Fix the column order used by every table and chart below.
performances = performances[['Dataset', 'Type', 'AutoGluon', 'AutoWEKA', 'Auto-Sklearn', 'H2O', 'TPOT', 'AlphaD3M', 'Alpha-AutoML']]
# Shorten the OpenML task ids. The dot is escaped so that only the
# literal "openml.org/t/" prefix is rewritten (an unescaped "." would match any character).
performances = performances.replace(r'openml\.org/t/', 'task_', regex=True)
performances.head()

Unnamed: 0,Dataset,Type,AutoGluon,AutoWEKA,Auto-Sklearn,H2O,TPOT,AlphaD3M,Alpha-AutoML
0,task_10101,binary,0.76,0.76,0.76,0.76,0.76,0.786667,0.72
1,task_12,multiclass,0.975,0.985,0.98,0.975,,0.965,0.98
2,task_146195,multiclass,0.875518,0.710628,0.855684,0.876702,0.851687,0.805802,0.833333
3,task_146212,multiclass,0.999655,0.998276,1.0,0.999828,0.999828,0.999828,0.999828
4,task_146606,binary,0.736359,0.603774,0.732993,0.719327,,0.726262,0.730546


In [3]:
# Utils

def calculate_rank(performances):
    """Rank the AutoML systems on every dataset (rank 1 = highest score).

    Parameters
    ----------
    performances : pandas.DataFrame
        One row per dataset with a 'Dataset' column, a 'Type' column and one
        numeric score column per AutoML system (higher is better).
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Systems as rows and datasets as columns. Tied systems share the
        lowest rank of their group (``method='min'``). A system with no
        score on a dataset gets the worst possible rank, i.e. the number of
        systems. An input without rows yields a frame without columns.

    Raises
    ------
    KeyError
        If the 'Dataset' or 'Type' column is missing.
    """
    # Keep only the score columns, labelled by dataset name.
    scores = performances.set_index('Dataset').drop(columns=['Type'])
    worst_rank = float(scores.shape[1])  # Number of AutoML systems

    # Rank across the systems of each dataset in one vectorised call instead
    # of building and concatenating one single-column frame per dataset.
    ranks_by_dataset = scores.rank(axis=1, ascending=False, method='min')
    # Systems that didn't produce pipelines (NaN score) get the worst rank.
    ranks_by_dataset = ranks_by_dataset.fillna(worst_rank)

    all_ranks = ranks_by_dataset.T
    all_ranks.columns.name = None  # Drop the 'Dataset' label left by set_index

    return all_ranks

def generate_latex(all_performances, file_name):
    """Write a results table to ``<file_name>.tex`` as a LaTeX tabular.

    Parameters
    ----------
    all_performances : pandas.DataFrame
        Table to export. It is not modified; the 'Type' column, when
        present, is left out of the exported table.
    file_name : str
        Output path without the ``.tex`` extension.

    Notes
    -----
    Floats are written with two decimals, the index is omitted and missing
    values are rendered as '-'. A confirmation message is printed once the
    file has been written.
    """
    # errors='ignore' skips the drop only when 'Type' is absent; a bare
    # ``except: pass`` would also have hidden every unrelated failure.
    # drop() returns a new frame, so the caller's table stays untouched.
    performances = all_performances.drop(columns=['Type'], errors='ignore')
    performances.to_latex(f'{file_name}.tex', float_format='%.2f', index=False, na_rep='-')
    print(f'Latex generated at {file_name}.tex file.')

def calculate_gain(all_performances, systems=None, target='Alpha-AutoML'):
    """Add a 'Gain' column: the target's score minus the baseline average.

    Parameters
    ----------
    all_performances : pandas.DataFrame
        One row per dataset with one numeric score column per AutoML system.
        The frame is not modified.
    systems : list of str, optional
        Columns averaged (row-wise, ignoring NaN) to form the baseline.
        Defaults to AutoGluon, AutoWEKA, H2O, TPOT, Auto-Sklearn and
        Alpha-AutoML.
    target : str, optional
        Column whose gain over the baseline is computed.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with the extra 'Gain' column, with every numeric
        column (scores included) rounded to two decimals.

    Raises
    ------
    KeyError
        If any baseline column or the target column is missing.
    """
    if systems is None:
        # NOTE(review): the default baseline contains the target itself and
        # omits AlphaD3M, so it is not strictly an average of the *other*
        # systems. Kept as-is to preserve existing results -- confirm intent.
        systems = ['AutoGluon', 'AutoWEKA', 'H2O', 'TPOT', 'Auto-Sklearn', 'Alpha-AutoML']

    performances = all_performances.copy(deep=True)
    baseline_avg = performances[list(systems)].mean(axis=1)
    performances['Gain'] = performances[target] - baseline_avg

    return performances.round(2)

### Calculating Gains

In [4]:
# Per-dataset gain of Alpha-AutoML over the baseline average computed by
# calculate_gain; the returned table is rounded to two decimals.
gains = calculate_gain(performances)
gains.head()

Unnamed: 0,Dataset,Type,AutoGluon,AutoWEKA,Auto-Sklearn,H2O,TPOT,AlphaD3M,Alpha-AutoML,Gain
0,task_10101,binary,0.76,0.76,0.76,0.76,0.76,0.79,0.72,-0.03
1,task_12,multiclass,0.98,0.98,0.98,0.98,,0.96,0.98,0.0
2,task_146195,multiclass,0.88,0.71,0.86,0.88,0.85,0.81,0.83,-0.0
3,task_146212,multiclass,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,task_146606,binary,0.74,0.6,0.73,0.72,,0.73,0.73,0.03


In [5]:
# Mean gain over all datasets. The per-dataset gains were already rounded to
# two decimals by calculate_gain, so this averages the rounded values.
round(gains['Gain'].mean(), 3)

0.003

In [6]:
# Export the gains table to gains.tex (generate_latex leaves out the 'Type' column).
generate_latex(gains, 'gains')

Latex generated at gains.tex file.


### Calculating Average Rank

In [7]:
# Rank of every system on every dataset (systems as rows, datasets as
# columns); systems without a result on a dataset receive the worst rank.
ranks = calculate_rank(performances)
ranks

Unnamed: 0,task_10101,task_12,task_146195,task_146212,task_146606,task_146818,task_146821,task_146822,task_146825,task_14965,...,task_31,task_34539,task_3917,task_3945,task_53,task_7592,task_7593,task_9952,task_9977,task_9981
AutoGluon,2.0,4.0,2.0,6.0,1.0,1.0,6.0,2.0,1.0,4.0,...,4.0,1.0,2.0,4.0,3.0,1.0,1.0,7.0,1.0,3.0
AutoWEKA,2.0,1.0,7.0,7.0,6.0,5.0,1.0,7.0,7.0,7.0,...,6.0,7.0,7.0,7.0,6.0,6.0,5.0,3.0,7.0,7.0
Auto-Sklearn,2.0,2.0,3.0,1.0,2.0,7.0,1.0,2.0,2.0,2.0,...,1.0,3.0,3.0,1.0,4.0,2.0,2.0,5.0,3.0,1.0
H2O,2.0,4.0,1.0,2.0,5.0,2.0,1.0,2.0,3.0,1.0,...,7.0,2.0,7.0,5.0,1.0,7.0,4.0,5.0,1.0,3.0
TPOT,2.0,7.0,4.0,2.0,7.0,3.0,1.0,1.0,7.0,5.0,...,1.0,4.0,1.0,1.0,7.0,4.0,7.0,1.0,3.0,1.0
AlphaD3M,1.0,6.0,6.0,2.0,4.0,3.0,7.0,2.0,4.0,5.0,...,4.0,6.0,3.0,1.0,5.0,5.0,3.0,4.0,6.0,3.0
Alpha-AutoML,7.0,2.0,5.0,2.0,3.0,5.0,1.0,2.0,7.0,3.0,...,3.0,5.0,5.0,7.0,2.0,3.0,7.0,2.0,5.0,6.0


In [8]:
# Average each system's rank over the dataset columns only. Dropping any
# existing 'average_rank' first keeps this cell idempotent: re-running it no
# longer folds the previous average into the new mean.
dataset_ranks = ranks.drop(columns=['average_rank'], errors='ignore')
ranks = dataset_ranks.assign(average_rank=dataset_ranks.mean(axis=1).round(2))
ranks

Unnamed: 0,task_10101,task_12,task_146195,task_146212,task_146606,task_146818,task_146821,task_146822,task_146825,task_14965,...,task_34539,task_3917,task_3945,task_53,task_7592,task_7593,task_9952,task_9977,task_9981,average_rank
AutoGluon,2.0,4.0,2.0,6.0,1.0,1.0,6.0,2.0,1.0,4.0,...,1.0,2.0,4.0,3.0,1.0,1.0,7.0,1.0,3.0,2.69
AutoWEKA,2.0,1.0,7.0,7.0,6.0,5.0,1.0,7.0,7.0,7.0,...,7.0,7.0,7.0,6.0,6.0,5.0,3.0,7.0,7.0,5.9
Auto-Sklearn,2.0,2.0,3.0,1.0,2.0,7.0,1.0,2.0,2.0,2.0,...,3.0,3.0,1.0,4.0,2.0,2.0,5.0,3.0,1.0,2.26
H2O,2.0,4.0,1.0,2.0,5.0,2.0,1.0,2.0,3.0,1.0,...,2.0,7.0,5.0,1.0,7.0,4.0,5.0,1.0,3.0,3.49
TPOT,2.0,7.0,4.0,2.0,7.0,3.0,1.0,1.0,7.0,5.0,...,4.0,1.0,1.0,7.0,4.0,7.0,1.0,3.0,1.0,4.21
AlphaD3M,1.0,6.0,6.0,2.0,4.0,3.0,7.0,2.0,4.0,5.0,...,6.0,3.0,1.0,5.0,5.0,3.0,4.0,6.0,3.0,4.21
Alpha-AutoML,7.0,2.0,5.0,2.0,3.0,5.0,1.0,2.0,7.0,3.0,...,5.0,5.0,7.0,2.0,3.0,7.0,2.0,5.0,6.0,4.03


### Normalizing Scores

In [9]:
systems = ['AutoGluon', 'AutoWEKA', 'H2O', 'TPOT', 'AlphaD3M', 'Auto-Sklearn', 'Alpha-AutoML']
# Scale each dataset's scores by the best score obtained on that dataset, so
# the top system on every row ends up at 1.0.
best_per_dataset = performances[systems].max(axis=1)
performances[systems] = performances[systems].div(best_per_dataset, axis=0)
performances = performances.round(2)

performances

Unnamed: 0,Dataset,Type,AutoGluon,AutoWEKA,Auto-Sklearn,H2O,TPOT,AlphaD3M,Alpha-AutoML
0,task_10101,binary,0.97,0.97,0.97,0.97,0.97,1.0,0.92
1,task_12,multiclass,0.99,1.0,0.99,0.99,,0.98,0.99
2,task_146195,multiclass,1.0,0.81,0.98,1.0,0.97,0.92,0.95
3,task_146212,multiclass,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,task_146606,binary,1.0,0.82,1.0,0.98,,0.99,0.99
5,task_146818,binary,1.0,0.94,0.92,0.98,0.95,0.95,0.94
6,task_146821,multiclass,0.99,1.0,1.0,1.0,1.0,0.97,1.0
7,task_146822,multiclass,1.0,0.99,1.0,1.0,1.0,1.0,1.0
8,task_146825,multiclass,1.0,,0.99,0.99,,0.94,
9,task_14965,binary,1.0,0.96,1.0,1.0,1.0,1.0,1.0


In [10]:
# Export the normalized scores to normalized_performances.tex.
generate_latex(performances, 'normalized_performances')

Latex generated at normalized_performances.tex file.


In [11]:
# Reshape to the long format altair expects: one row per
# (dataset, AutoML system) pair with the score in a single 'Performance' column.
performances = performances.melt(
    id_vars=['Dataset', 'Type'],
    var_name='AutoML',
    value_name='Performance',
)
performances.head()

Unnamed: 0,Dataset,Type,AutoML,Performance
0,task_10101,binary,AutoGluon,0.97
1,task_12,multiclass,AutoGluon,0.99
2,task_146195,multiclass,AutoGluon,1.0
3,task_146212,multiclass,AutoGluon,1.0
4,task_146606,binary,AutoGluon,1.0


In [12]:
def plot_performances(source):
    """Build a dot plot of per-dataset performance, one colour per AutoML system.

    Parameters
    ----------
    source : pandas.DataFrame
        Long-format table with the columns 'Dataset', 'Type', 'AutoML' and
        'Performance'.

    Returns
    -------
    altair.Chart
        Chart with performance on the x axis, datasets on the y axis (sorted
        by descending x value) and one facet row per task type.
    """
    # Fixed system -> colour mapping so separate charts share one legend.
    domain = ['AutoGluon', 'AutoWEKA', 'Auto-Sklearn', 'H2O', 'TPOT', 'AlphaD3M', 'Alpha-AutoML']
    color_range = ['#f7b97c', '#f58517', '#e7ba52', '#e45857', '#d67196', '#ccf77c', '#396cb0']

    return alt.Chart(source, title="").mark_point(filled=True, size=32).encode(
        alt.X(
            'Performance:Q',
            title="Accuracy",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(grid=False)
        ),
        alt.Y(
            'Dataset:N',
            title="",
            sort='-x',
            axis=alt.Axis(grid=True)
        ),
        color=alt.Color('AutoML:N', scale=alt.Scale(domain=domain, range=color_range), legend=alt.Legend(title="AutoML")),
        row=alt.Row(
            'Type:N',
            title="",
            # Order the facet rows by total performance. The previous sort
            # field, 'yield', does not exist in this data.
            sort=alt.EncodingSortField(field='Performance', op='sum', order='descending'),
        )
    ).properties(
        height=alt.Step(12),
        width=250
    )
# .configure_view(stroke="transparent")

In [13]:
# Chart for the binary-classification tasks only.
df_binary = performances.loc[performances['Type'].eq('binary')]
chart1 = plot_performances(df_binary)

In [14]:
# Chart for the multiclass-classification tasks only.
df_multiclass = performances.loc[performances['Type'].eq('multiclass')]
chart2 = plot_performances(df_multiclass)

In [15]:
# Show the binary and multiclass charts side by side; the view stroke
# (the outline drawn around each chart view) is made transparent.
alt.hconcat(chart1, chart2).configure_view(stroke='transparent')