## Analysis of OpenML Experiments

In [None]:
import pandas as pd
import altair as alt

In [None]:
# AutoML systems compared in this notebook. The order of this list fixes the
# column order of the tables and the colour assigned to each system in the plots.
systems = ['AutoGluon', 'AutoWEKA', 'Auto-Sklearn', 'H2O', 'TPOT', 'AlphaD3M', 'Alpha-AutoML']

# Load the raw results and keep only the columns used by the analysis.
performances = pd.read_csv('results/results.csv')
performances = performances[['id', 'framework', 'type', 'result']]

# One row per (task id, task type) and one column per framework. pivot_table
# aggregates with the mean by default, so repeated results of a framework on
# the same task are averaged.
performances = pd.pivot_table(performances, index=['id', 'type'], columns='framework', values='result')
performances = performances.reset_index()
performances.columns.name = None  # Remove the leftover 'framework' axis label

# Switch to the display names used in `systems`.
performances.rename(columns={'id': 'Dataset', 'type': 'Type', 'H2OAutoML': 'H2O', 'autosklearn': 'Auto-Sklearn'}, inplace=True)
performances = performances[['Dataset', 'Type'] + systems]  # KeyError if a system has no results at all

# Shorten the task ids. The pattern is a regular expression, so the dot is
# escaped to match only a literal '.' instead of any character.
performances = performances.replace(r'openml\.org/t/', 'task_', regex=True)
performances.head()

In [None]:
# Utils

def calculate_rank(performances):
    """Rank the AutoML systems on every dataset and average the ranks.

    Every column of ``performances`` other than 'Dataset' and 'Type' is treated
    as the scores of one system. Within a dataset the highest score gets rank 1
    and tied systems share the lowest rank of their group (``method='min'``).
    A system without a score on a dataset receives the worst possible rank,
    i.e. the number of systems.

    Args:
        performances: Wide table with a 'Dataset' column, a 'Type' column and
            one score column per system. It is not modified.

    Returns:
        A DataFrame indexed by system name with one column per dataset (named
        after the 'Dataset' values, in row order) plus an 'average_rank'
        column rounded to two decimals.

    Raises:
        KeyError: If the 'Dataset' or 'Type' column is missing.
    """
    scores = performances.drop(columns=['Dataset', 'Type'])
    worst_rank = float(scores.shape[1])  # Number of AutoML systems

    # Rank across the systems of each row (= dataset) in a single call instead
    # of transposing to an object-dtype frame and looping over its columns.
    # Missing scores come out as NaN and are then given the worst rank.
    ranks = scores.rank(axis=1, ascending=False, method='min').fillna(worst_rank)
    ranks.index = performances['Dataset'].tolist()  # Datasets become the column names below

    all_ranks = ranks.T  # Systems as rows, datasets as columns
    # Round the Series before assigning it so that repeated dataset names
    # (duplicate column labels) cannot get in the way.
    all_ranks['average_rank'] = all_ranks.mean(axis=1).round(2)

    return all_ranks

def generate_latex(all_performances, file_name):
    """Export a table to ``<file_name>.tex`` without its 'Type' column.

    Floats are written with two decimals, missing values as '-', and the
    DataFrame index is left out.

    Args:
        all_performances: DataFrame to export. It is not modified.
        file_name: Output path without the '.tex' extension.
    """
    # errors='ignore' turns the drop into a no-op when there is no 'Type'
    # column, without hiding unrelated errors the way a bare `except` did.
    # drop() returns a new frame, so the caller's table stays untouched.
    performances = all_performances.drop(columns=['Type'], errors='ignore')
    performances.to_latex(f'{file_name}.tex', float_format='%.2f', index=False, na_rep='-')
    print(f'Latex generated at {file_name}.tex file.')

def calculate_gain(all_performances, target_automl='Alpha-AutoML', worst_score=0, automl_systems=None):
    """Add a 'Gain' column: target score minus the mean score of the other systems.

    Missing scores (of the target and of the other systems) are replaced by
    ``worst_score`` before the gain is computed.

    Args:
        all_performances: Wide table with one score column per system. It is
            not modified.
        target_automl: Name of the column holding the system of interest.
        worst_score: Score assigned to a system that has no result.
        automl_systems: Names of all system columns. Defaults to the
            notebook-level ``systems`` list.

    Returns:
        A copy of ``all_performances`` with the extra 'Gain' column and all
        numeric columns rounded to two decimals.
    """
    if automl_systems is None:
        automl_systems = systems  # Fall back to the notebook-level list
    other_systems = [s for s in automl_systems if s != target_automl]

    target_scores = all_performances[target_automl].fillna(worst_score)
    others_mean = all_performances[other_systems].fillna(worst_score).mean(axis=1)

    performances = all_performances.copy(deep=True)
    performances['Gain'] = target_scores - others_mean

    return performances.round(2)

def calculate_difference(all_performances, target_automl='Alpha-AutoML', worst_score=0, automl_systems=None):
    """Add a 'Difference' column: target score minus the best score of the other systems.

    A missing target score is replaced by ``worst_score``. The best competing
    score ignores missing values; when every other system is missing it falls
    back to ``worst_score`` as well, so the difference is never NaN just
    because all competitors failed.

    Args:
        all_performances: Wide table with one score column per system. It is
            not modified.
        target_automl: Name of the column holding the system of interest.
        worst_score: Score assigned to a system that has no result.
        automl_systems: Names of all system columns. Defaults to the
            notebook-level ``systems`` list.

    Returns:
        A copy of ``all_performances`` with the extra 'Difference' column and
        all numeric columns rounded to two decimals.
    """
    if automl_systems is None:
        automl_systems = systems  # Fall back to the notebook-level list
    other_systems = [s for s in automl_systems if s != target_automl]

    target_scores = all_performances[target_automl].fillna(worst_score)
    # max() skips NaN, so the result is NaN only when no other system produced
    # a score; treat that case the same way as a missing target score.
    best_of_others = all_performances[other_systems].max(axis=1).fillna(worst_score)

    performances = all_performances.copy(deep=True)
    performances['Difference'] = target_scores - best_of_others

    return performances.round(2)

### Calculating Average Rank

In [None]:
# Rank the systems on every dataset and list them from best to worst average rank.
ranks = calculate_rank(performances)
ranks.sort_values('average_rank', ascending=True)

### Calculating Difference with the Best Performance

In [None]:
# Gap between the target system and the best competing system per dataset,
# listed starting with the datasets where the target trails the most.
difference = calculate_difference(all_performances=performances)
difference.sort_values('Difference', ascending=True)

### Calculating Gains

In [None]:
# Gain of the target system over the mean of the other systems, per dataset.
gains = calculate_gain(all_performances=performances)
gains.head(5)

In [None]:
# Mean gain over all datasets, rounded to two decimals.
round(gains['Gain'].mean(), 2)

In [None]:
# Export the gains table to 'gains.tex'.
generate_latex(all_performances=gains, file_name='gains')

### Normalizing Scores

In [None]:
# Divide the scores of every dataset (row) by the largest score in that row.
# NOTE: this overwrites `performances`, so the cell should be run only once.
performances[systems] = performances[systems].div(performances[systems].max(axis=1), axis=0)
performances = performances.round(decimals=2)
performances.head()

In [None]:
# Export the normalized scores to 'normalized_performances.tex'.
generate_latex(all_performances=performances, file_name='normalized_performances')

In [None]:
# Change the dataframe to the format of altair lib
performances = pd.melt(performances, id_vars=['Dataset', 'Type'], var_name='AutoML', value_name='Performance')
performances.head()

In [None]:
def plot_performances(source):
    """Build a dot plot of the scores of every AutoML system per dataset.

    Datasets are placed on the y axis (sorted by descending x value), the
    score on the x axis, and each system gets a fixed colour. The chart is
    faceted into one row per value of 'Type'.

    Args:
        source: Long-format DataFrame with the columns 'Dataset', 'Type',
            'AutoML' and 'Performance'.

    Returns:
        The Altair chart. No view configuration is applied here; the caller
        does that on the final (concatenated) chart.
    """
    # Colours are paired with `systems` by position, so both lists must have
    # the same length and order.
    domain = systems
    color_range = ['#f7b97c', '#f58517', '#e7ba52', '#e45857', '#d67196', '#ccf77c', '#396cb0']

    return alt.Chart(source, title="").mark_point(filled=True, size=32).encode(
        alt.X(
            'Performance:Q',
            title="Accuracy",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(grid=False)
        ),
        alt.Y(
            'Dataset:N',
            title="",
            sort='-x',
            axis=alt.Axis(grid=True)
        ),
        color=alt.Color('AutoML:N', scale=alt.Scale(domain=domain, range=color_range), legend=alt.Legend(title="AutoML")),
        row=alt.Row(
            'Type:N',
            title="",
            # Sort the facet rows by a field that exists in `source`
            # ('yield' was not a column of this data).
            sort=alt.EncodingSortField(field='Performance', op='sum', order='descending'),
        )
    ).properties(
        height=alt.Step(12),
        width=250
    )

In [None]:
# Chart for the binary classification tasks only.
df_binary = performances.loc[performances['Type'] == 'binary']
chart1 = plot_performances(source=df_binary)

In [None]:
# Chart for the multiclass classification tasks only.
df_multiclass = performances.loc[performances['Type'] == 'multiclass']
chart2 = plot_performances(source=df_multiclass)

In [None]:
# Show both charts side by side; the view stroke is set to transparent on the combined chart.
alt.hconcat(chart1, chart2).configure_view(stroke='transparent')