# Compare metrics among different AutoML frameworks

Notebook to compare the metrics achieved by different AutoML frameworks or setups. It makes a statistical analysis of the results achieved on the different metrics and converts then into latex tables.

In [249]:
import os
import pandas as pd
import numpy as np
from stats import *

In [250]:
import warnings
warnings.filterwarnings('ignore')

In [251]:
source_path = os.path.join('..', 'data', 'results-csv')

In [252]:
datasets = list(sorted(['adult', 'Australian', 'cnae-9', 'credit-g', 'mfeat-factors', 'Amazon_employee_access', 'bank-marketing'])) # 'kr-vs-kp', 'car'

In [253]:
frameworks_map = {
    'edca-1-0-0' : 'EDCA',
    'flaml': 'FLAML',
    'tpot' : 'TPOT',
    # 'edca-1-0-0-all' : 'EDCA-All',
    # 'flaml-all' : 'FLAML-All',
    'flaml-edca-1-0-0' : 'FLAML+EDCA',
    'tpot-edca-1-0-0' : 'TPOT+EDCA',
}

In [254]:
round_digits = 2

In [255]:
number_comparisons = 4

In [256]:
symbol = '±'

In [257]:
metric = 'mcc'

In [258]:
stats_test = statistical_test_repeated

In [259]:
def get_values(datasets, frameworks, metric):
    values = []
    for dataset in datasets:
        aux = {}
        for framework in frameworks:
            if 'all' in framework:
                framework = framework.replace('-all', '')
                train_data = 'all_data_'
            else:
                train_data = ''
            framework_name = framework
            if 'edca' in framework:
                framework_name = 'evo'

            if 'flaml' in framework and 'edca' in framework:
                framework_name = 'flaml_with_edca'

            if 'tpot' in framework and 'edca' in framework:
                framework_name = 'tpot_with_edca'
            metric_name = metric
            if framework == 'tpot':
                train_data = ''
            if metric_name == 'num_pipelines_tested' and framework == 'flaml':
                metric_name = 'num_iterations'
            if metric_name == 'num_pipelines_tested' and framework == 'tpot':
                metric_name = 'total_evaluated'
            try:
                df = pd.read_csv(os.path.join(source_path, dataset, f'{framework}-results.csv'))
                mean = round(df[f'{framework_name}_{train_data}{metric_name}'].mean(), round_digits)
                std = round(df[f'{framework_name}_{train_data}{metric_name}'].std(), round_digits)
                if metric == 'num_pipelines_tested' or metric == 'num_iterations':
                    mean = int(mean)
                    std = int(std)
                if train_data:
                    aux[f'{framework}-all'] = [f'{mean}{symbol}{std}']
                else:
                    aux[framework] = [f'{mean}{symbol}{std}']
            except Exception as e:
                print(e)
                if train_data:
                    aux[f'{framework}-all'] = [np.nan]
                else:
                    aux[framework] = [np.nan]
        values.append(pd.DataFrame(aux))
    df = pd.concat(values)
    df.index = datasets
    return df

In [260]:
df = get_values(datasets, frameworks_map.keys(), metric)
df

No columns to parse from file
No columns to parse from file
No columns to parse from file
No columns to parse from file
No columns to parse from file
No columns to parse from file


Unnamed: 0,edca-1-0-0,flaml,tpot,flaml-edca-1-0-0,tpot-edca-1-0-0
Amazon_employee_access,0.44±0.01,0.42±0.01,0.46±0.03,0.41±0.02,0.43±0.02
Australian,0.72±0.01,0.72±0.02,0.72±0.02,0.69±0.04,0.64±0.1
adult,0.63±0.0,0.63±0.01,,0.63±0.0,
bank-marketing,0.5±0.01,0.51±0.01,,0.51±0.01,
cnae-9,0.93±0.01,0.93±0.01,0.94±0.01,0.91±0.01,0.92±0.03
credit-g,0.32±0.04,0.34±0.03,,0.24±0.07,
mfeat-factors,0.97±0.0,0.96±0.0,0.97±0.0,0.95±0.0,0.96±0.01


In [261]:
main = 'edca-1-0-0'
main_metric = f'evo_{metric}'

In [262]:
other_frameworks = [framework for framework in df.columns if framework != main]

In [263]:
significance_level = 0.05

In [264]:
significance_level = significance_level / number_comparisons

In [265]:
def get_stats_test(df, main, main_metric,  other_frameworks, significance_level, symbol):
    for framework in other_frameworks:
        framework_name = framework
        if 'all' in framework:
            data_var = 'all_data_'
            framework_name = framework.strip('-all')
        else:
            data_var = ''
        for dataset in df.index:
            framework_name = framework
            if 'all' in framework:
                data_var = 'all_data_'
                framework_name = framework.strip('-all')
            try:
                main_df = pd.read_csv(os.path.join(source_path, dataset, f'{main}-results.csv'))
            except Exception as e:
                continue
            if df.loc[dataset, framework] == '-':
                continue
            try:
                other_df = pd.read_csv(os.path.join(source_path, dataset, f'{framework_name}-results.csv'))
            except Exception as e:
                continue
            metric_name = metric
            if metric == 'num_pipelines_tested' and framework == 'flaml':
                metric_name = 'num_iterations'
            elif metric == 'num_pipelines_tested' and 'tpot' in framework:
                metric_name = 'total_evaluated'

            if 'tpot' in framework:
                framework_name = 'tpot'

            aux_data_var = data_var
            if framework == 'tpot':
                aux_data_var = ''
            if 'edca' in framework:
                framework_name = 'evo'

            if 'flaml' in framework and 'edca' in framework:
                framework_name = 'flaml_with_edca'
                aux_data_var = ''

            if 'tpot' in framework and 'edca' in framework:
                framework_name = 'tpot_with_edca'
                aux_data_var = ''
            other_metric = f'{framework_name}_{aux_data_var}{metric_name}'
            a = main_df[main_metric].round(round_digits)
            b = other_df[other_metric].round(round_digits)
            a = a.loc[a.notnull() & b.notnull()]
            b = b.loc[a.notnull() & b.notnull()]
            try:
                st = stats_test(a, b, significance_level=significance_level)
                if st:
                    if st.pvalue <= significance_level:
                        if symbol == None:
                            df.loc[dataset, framework] = f'\\textbf{{{df.loc[dataset, framework]}}}'
                        else:
                            df.loc[dataset, framework] = f'{df.loc[dataset, framework]}{symbol}'
            except:
                continue

In [266]:
# compare all with EDCA
get_stats_test(df, main, main_metric, other_frameworks, significance_level, None)

In [267]:
df

Unnamed: 0,edca-1-0-0,flaml,tpot,flaml-edca-1-0-0,tpot-edca-1-0-0
Amazon_employee_access,0.44±0.01,\textbf{0.42±0.01},0.46±0.03,\textbf{0.41±0.02},\textbf{0.43±0.02}
Australian,0.72±0.01,0.72±0.02,0.72±0.02,\textbf{0.69±0.04},\textbf{0.64±0.1}
adult,0.63±0.0,0.63±0.01,,\textbf{0.63±0.0},
bank-marketing,0.5±0.01,\textbf{0.51±0.01},,\textbf{0.51±0.01},
cnae-9,0.93±0.01,0.93±0.01,\textbf{0.94±0.01},\textbf{0.91±0.01},\textbf{0.92±0.03}
credit-g,0.32±0.04,0.34±0.03,,\textbf{0.24±0.07},
mfeat-factors,0.97±0.0,\textbf{0.96±0.0},0.97±0.0,\textbf{0.95±0.0},\textbf{0.96±0.01}


In [268]:
# compare FLAML with flaml-edca-1-0-0
if 'flaml-edca-1-0-0' in df.columns:
    main = 'flaml'
    main_metric = f'flaml_{metric}'
    other_frameworks = ['flaml-edca-1-0-0']
    get_stats_test(df, main, main_metric, other_frameworks, significance_level, '*')
    df

In [269]:
if 'flaml-all' in df.columns:
    main = 'flaml'
    main_metric = f'flaml_{metric}'
    other_frameworks = ['flaml-all']
    get_stats_test(df, main, main_metric, other_frameworks, significance_level, '*')

In [270]:
if 'tpot-edca-1-0-0' in df.columns:
    main = 'tpot'
    main_metric = f'tpot_{metric}'
    other_frameworks = ['tpot-edca-1-0-0']
    get_stats_test(df, main, main_metric, other_frameworks, significance_level, '*')
    df

In [271]:
df

Unnamed: 0,edca-1-0-0,flaml,tpot,flaml-edca-1-0-0,tpot-edca-1-0-0
Amazon_employee_access,0.44±0.01,\textbf{0.42±0.01},0.46±0.03,\textbf{0.41±0.02},\textbf{0.43±0.02}*
Australian,0.72±0.01,0.72±0.02,0.72±0.02,\textbf{0.69±0.04}*,\textbf{0.64±0.1}*
adult,0.63±0.0,0.63±0.01,,\textbf{0.63±0.0},
bank-marketing,0.5±0.01,\textbf{0.51±0.01},,\textbf{0.51±0.01},
cnae-9,0.93±0.01,0.93±0.01,\textbf{0.94±0.01},\textbf{0.91±0.01}*,\textbf{0.92±0.03}*
credit-g,0.32±0.04,0.34±0.03,,\textbf{0.24±0.07}*,
mfeat-factors,0.97±0.0,\textbf{0.96±0.0},0.97±0.0,\textbf{0.95±0.0}*,\textbf{0.96±0.01}*


In [272]:
df = df.set_index(pd.Series(df.index, name='Dataset'))
df = df.reset_index()

In [273]:
df.to_csv(os.path.join('..', 'data', f'data-mean-results-{metric}.csv'), index=False)

## add arrows

In [274]:
df

Unnamed: 0,Dataset,edca-1-0-0,flaml,tpot,flaml-edca-1-0-0,tpot-edca-1-0-0
0,Amazon_employee_access,0.44±0.01,\textbf{0.42±0.01},0.46±0.03,\textbf{0.41±0.02},\textbf{0.43±0.02}*
1,Australian,0.72±0.01,0.72±0.02,0.72±0.02,\textbf{0.69±0.04}*,\textbf{0.64±0.1}*
2,adult,0.63±0.0,0.63±0.01,,\textbf{0.63±0.0},
3,bank-marketing,0.5±0.01,\textbf{0.51±0.01},,\textbf{0.51±0.01},
4,cnae-9,0.93±0.01,0.93±0.01,\textbf{0.94±0.01},\textbf{0.91±0.01}*,\textbf{0.92±0.03}*
5,credit-g,0.32±0.04,0.34±0.03,,\textbf{0.24±0.07}*,
6,mfeat-factors,0.97±0.0,\textbf{0.96±0.0},0.97±0.0,\textbf{0.95±0.0}*,\textbf{0.96±0.01}*


In [275]:
def get_arrows(original, final):
    for index, (original_value, final_value) in enumerate(zip(original, final)):
        if not (pd.isna(original_value) and pd.isna(final_value)):
            if 'textbf' in final_value:
                final_val = float(final_value.split('\\textbf{')[1].split(symbol)[0])
            else:
                final_val = float(final_value.split(symbol)[0])
            if 'textbf' in original_value:
                original_val = float(original_value.split('\\textbf{')[1].split(symbol)[0])
            else:
                original_val = float(original_value.split(symbol)[0])

            if final_val > original_val:
                final[index] = f'{final_value}$\\uparrow$'
            elif final_val < original_val:
                final[index] = f'{final_value}$\\downarrow$'
            else:
                final[index] = f'{final_value}'
    return final


In [276]:
edca_framework = 'edca-1-0-0'

In [277]:
for col in df.columns:
    if 'flaml' in col and 'edca' in col:
        df[col] = get_arrows(original=df['flaml'], final=df[col])
    if 'tpot' in col and 'edca' in col:
        df[col] = get_arrows(original=df['tpot'], final=df[col])
    if 'edca' in col and 'all' in col:
        df[col] = get_arrows(original=df[edca_framework], final=df[col])
    if 'flaml' in col and 'all' in col:
        df[col] = get_arrows(original=df['flaml'], final=df[col])

In [278]:
df

Unnamed: 0,Dataset,edca-1-0-0,flaml,tpot,flaml-edca-1-0-0,tpot-edca-1-0-0
0,Amazon_employee_access,0.44±0.01,\textbf{0.42±0.01},0.46±0.03,\textbf{0.41±0.02}$\downarrow$,\textbf{0.43±0.02}*$\downarrow$
1,Australian,0.72±0.01,0.72±0.02,0.72±0.02,\textbf{0.69±0.04}*$\downarrow$,\textbf{0.64±0.1}*$\downarrow$
2,adult,0.63±0.0,0.63±0.01,,\textbf{0.63±0.0},
3,bank-marketing,0.5±0.01,\textbf{0.51±0.01},,\textbf{0.51±0.01},
4,cnae-9,0.93±0.01,0.93±0.01,\textbf{0.94±0.01},\textbf{0.91±0.01}*$\downarrow$,\textbf{0.92±0.03}*$\downarrow$
5,credit-g,0.32±0.04,0.34±0.03,,\textbf{0.24±0.07}*$\downarrow$,
6,mfeat-factors,0.97±0.0,\textbf{0.96±0.0},0.97±0.0,\textbf{0.95±0.0}*$\downarrow$,\textbf{0.96±0.01}*$\downarrow$


## convert to latex table

In [279]:
df.replace(np.nan, '-', inplace=True)

In [280]:
df

Unnamed: 0,Dataset,edca-1-0-0,flaml,tpot,flaml-edca-1-0-0,tpot-edca-1-0-0
0,Amazon_employee_access,0.44±0.01,\textbf{0.42±0.01},0.46±0.03,\textbf{0.41±0.02}$\downarrow$,\textbf{0.43±0.02}*$\downarrow$
1,Australian,0.72±0.01,0.72±0.02,0.72±0.02,\textbf{0.69±0.04}*$\downarrow$,\textbf{0.64±0.1}*$\downarrow$
2,adult,0.63±0.0,0.63±0.01,-,\textbf{0.63±0.0},-
3,bank-marketing,0.5±0.01,\textbf{0.51±0.01},-,\textbf{0.51±0.01},-
4,cnae-9,0.93±0.01,0.93±0.01,\textbf{0.94±0.01},\textbf{0.91±0.01}*$\downarrow$,\textbf{0.92±0.03}*$\downarrow$
5,credit-g,0.32±0.04,0.34±0.03,-,\textbf{0.24±0.07}*$\downarrow$,-
6,mfeat-factors,0.97±0.0,\textbf{0.96±0.0},0.97±0.0,\textbf{0.95±0.0}*$\downarrow$,\textbf{0.96±0.01}*$\downarrow$


## statistical test

In [281]:
columns = []
for col in df.columns:
    if col == 'Dataset':
        columns.append(f'\\textbf{{{col}}}')
    else:
        columns.append('\\textbf{' + frameworks_map[col] + '}')
df.columns = columns

In [282]:
df

Unnamed: 0,\textbf{Dataset},\textbf{EDCA},\textbf{FLAML},\textbf{TPOT},\textbf{FLAML+EDCA},\textbf{TPOT+EDCA}
0,Amazon_employee_access,0.44±0.01,\textbf{0.42±0.01},0.46±0.03,\textbf{0.41±0.02}$\downarrow$,\textbf{0.43±0.02}*$\downarrow$
1,Australian,0.72±0.01,0.72±0.02,0.72±0.02,\textbf{0.69±0.04}*$\downarrow$,\textbf{0.64±0.1}*$\downarrow$
2,adult,0.63±0.0,0.63±0.01,-,\textbf{0.63±0.0},-
3,bank-marketing,0.5±0.01,\textbf{0.51±0.01},-,\textbf{0.51±0.01},-
4,cnae-9,0.93±0.01,0.93±0.01,\textbf{0.94±0.01},\textbf{0.91±0.01}*$\downarrow$,\textbf{0.92±0.03}*$\downarrow$
5,credit-g,0.32±0.04,0.34±0.03,-,\textbf{0.24±0.07}*$\downarrow$,-
6,mfeat-factors,0.97±0.0,\textbf{0.96±0.0},0.97±0.0,\textbf{0.95±0.0}*$\downarrow$,\textbf{0.96±0.01}*$\downarrow$


In [283]:
column_format = 'r|' * df.shape[1]
column_format = column_format.removesuffix('|')

In [284]:
latex = df.to_latex(index=False,
                  formatters={"name": str.upper},
                  float_format="{:.1f}".format,
                  column_format=column_format)

In [285]:
print(latex)

\begin{tabular}{r|r|r|r|r|r}
\toprule
\textbf{Dataset} & \textbf{EDCA} & \textbf{FLAML} & \textbf{TPOT} & \textbf{FLAML+EDCA} & \textbf{TPOT+EDCA} \\
\midrule
Amazon_employee_access & 0.44±0.01 & \textbf{0.42±0.01} & 0.46±0.03 & \textbf{0.41±0.02}$\downarrow$ & \textbf{0.43±0.02}*$\downarrow$ \\
Australian & 0.72±0.01 & 0.72±0.02 & 0.72±0.02 & \textbf{0.69±0.04}*$\downarrow$ & \textbf{0.64±0.1}*$\downarrow$ \\
adult & 0.63±0.0 & 0.63±0.01 & - & \textbf{0.63±0.0} & - \\
bank-marketing & 0.5±0.01 & \textbf{0.51±0.01} & - & \textbf{0.51±0.01} & - \\
cnae-9 & 0.93±0.01 & 0.93±0.01 & \textbf{0.94±0.01} & \textbf{0.91±0.01}*$\downarrow$ & \textbf{0.92±0.03}*$\downarrow$ \\
credit-g & 0.32±0.04 & 0.34±0.03 & - & \textbf{0.24±0.07}*$\downarrow$ & - \\
mfeat-factors & 0.97±0.0 & \textbf{0.96±0.0} & 0.97±0.0 & \textbf{0.95±0.0}*$\downarrow$ & \textbf{0.96±0.01}*$\downarrow$ \\
\bottomrule
\end{tabular}



In [286]:
print('\\resizebox{\\textwidth}{!}{')
print(latex)
print('}')

\resizebox{\textwidth}{!}{
\begin{tabular}{r|r|r|r|r|r}
\toprule
\textbf{Dataset} & \textbf{EDCA} & \textbf{FLAML} & \textbf{TPOT} & \textbf{FLAML+EDCA} & \textbf{TPOT+EDCA} \\
\midrule
Amazon_employee_access & 0.44±0.01 & \textbf{0.42±0.01} & 0.46±0.03 & \textbf{0.41±0.02}$\downarrow$ & \textbf{0.43±0.02}*$\downarrow$ \\
Australian & 0.72±0.01 & 0.72±0.02 & 0.72±0.02 & \textbf{0.69±0.04}*$\downarrow$ & \textbf{0.64±0.1}*$\downarrow$ \\
adult & 0.63±0.0 & 0.63±0.01 & - & \textbf{0.63±0.0} & - \\
bank-marketing & 0.5±0.01 & \textbf{0.51±0.01} & - & \textbf{0.51±0.01} & - \\
cnae-9 & 0.93±0.01 & 0.93±0.01 & \textbf{0.94±0.01} & \textbf{0.91±0.01}*$\downarrow$ & \textbf{0.92±0.03}*$\downarrow$ \\
credit-g & 0.32±0.04 & 0.34±0.03 & - & \textbf{0.24±0.07}*$\downarrow$ & - \\
mfeat-factors & 0.97±0.0 & \textbf{0.96±0.0} & 0.97±0.0 & \textbf{0.95±0.0}*$\downarrow$ & \textbf{0.96±0.01}*$\downarrow$ \\
\bottomrule
\end{tabular}

}


In [287]:
df.to_csv('results-mean.csv', index=False)