# Analyse the best solutions achieved (overall best)

Notebook to get the best solutions overall by each AutoML framework (EDCA, FLAML, TPOT). It gets the % data used, metrics values and the individuals.

In [2]:
import pandas as pd
import os
import json
import pprint

In [3]:
datasets = list(sorted(['adult', 'Australian', 'cnae-9', 'credit-g', 'kr-vs-kp', 'mfeat-factors', 'bank-marketing', 'Amazon_employee_access']))

In [149]:
# EDCA best solution

In [5]:
edca_best_models = {}
for dataset in datasets:
    models = []
    mccs = []
    sample_percentage = []
    feature_percentage = []
    for run in range(30):
        path = os.path.join('..', 'thesis-results', 'datasets-divided', 'edca-1-0-0', dataset, f'run_{run}', 'results.json')
        with open(path, 'r') as f:
            results = json.load(f)
            models = models + results['evo_best']
            mccs = mccs + results['evo_mcc']
            sample_percentage = sample_percentage + results['evo_sample_%']
            feature_percentage = feature_percentage + results['evo_features_%']
    edca_best_models[dataset] = {
        'mcc' : mccs,
        'models' : models,
        'sample_percentage' : sample_percentage,
        'feature_percentage' : feature_percentage
    }

In [6]:
edca_values = {}

In [7]:
for dataset in datasets:
    print()
    print(dataset)
    df = pd.DataFrame(edca_best_models[dataset])
    best = df.loc[df.mcc == df.mcc.max()]
    print('MCC', best.mcc.values[0])
    best_json = best.models.values[0]
    if 'sample' in best_json:
        _ = best_json.pop('sample')
    if 'features' in best_json:
        _ = best_json.pop('features')
    print('sample_percentage', best.sample_percentage.values[0])
    print('feature_percentage', best.feature_percentage.values[0])
    aux = {}
    aux['sample_percentage'] = round(best.sample_percentage.values[0], 2)
    aux['feature_percentage'] = round(best.feature_percentage.values[0], 2)
    for key, value in best_json.items():
        if isinstance(value, dict):
            aux[key] = list(value.keys())[0]
        else:
            aux[key] = value
    edca_values[dataset] = aux


Amazon_employee_access
MCC 0.5046705324188538
sample_percentage 0.75
feature_percentage 0.5555555555555556

Australian
MCC 0.853097721950181
sample_percentage 0.3278985507246377
feature_percentage 1.0

adult
MCC 0.6571655945186987
sample_percentage 0.6276040333725751
feature_percentage 1.0

bank-marketing
MCC 0.5507043178463852
sample_percentage 0.64035500013824
feature_percentage 0.6875

cnae-9
MCC 0.9741932655154736
sample_percentage 0.75
feature_percentage 1.0

credit-g
MCC 0.466857415166512
sample_percentage 0.4975
feature_percentage 1.0

kr-vs-kp
MCC 1.0
sample_percentage 0.7497066875244427
feature_percentage 1.0

mfeat-factors
MCC 0.9833947974289856
sample_percentage 0.75
feature_percentage 0.7777777777777778


In [8]:
edca_df = pd.DataFrame(edca_values).T
edca_df.reset_index(inplace=True)
edca_df = edca_df.rename(columns={'index': 'dataset'})
edca_df.insert(1, 'Framework', 'EDCA')

In [9]:
edca_df

Unnamed: 0,dataset,Framework,sample_percentage,feature_percentage,scaler,model,categorical-imputer,encoder
0,Amazon_employee_access,EDCA,0.75,0.56,MinMaxScaler,RandomForestClassifier,,
1,Australian,EDCA,0.33,1.0,MinMaxScaler,XGBClassifier,,
2,adult,EDCA,0.63,1.0,StandardScaler,XGBClassifier,SimpleImputer,OneHotEncoder
3,bank-marketing,EDCA,0.64,0.69,MinMaxScaler,XGBClassifier,,OneHotEncoder
4,cnae-9,EDCA,0.75,1.0,StandardScaler,LogisticRegression,,
5,credit-g,EDCA,0.5,1.0,RobustScaler,LogisticRegression,,OneHotEncoder
6,kr-vs-kp,EDCA,0.75,1.0,,RandomForestClassifier,,OneHotEncoder
7,mfeat-factors,EDCA,0.75,0.78,StandardScaler,LogisticRegression,,


In [155]:
# FLAML bests

In [10]:
flaml_best_models = {}
for dataset in datasets:
    models = []
    mccs = []
    sample_percentage = []
    feature_percentage = []
    for run in range(30):
        path = os.path.join('..', 'thesis-results', 'datasets-divided', 'flaml', dataset, f'run_{run}', 'results.json')
        with open(path, 'r') as f:
            results = json.load(f)
            models = models + results['flaml_best_learner']
            mccs = mccs + results['flaml_mcc']
            sample_percentage = sample_percentage + results['flaml_sample_%']
            feature_percentage = feature_percentage + results['flaml_features_%']
    flaml_best_models[dataset] = {
        'mcc' : mccs,
        'models' : models,
        'sample_percentage' : sample_percentage,
        'feature_percentage' : feature_percentage
    }

In [11]:
flaml_values = {}
for dataset in datasets:
    print()
    print(dataset)
    df = pd.DataFrame(flaml_best_models[dataset])
    try:
        best = df.loc[df.mcc == df.mcc.max()]
        print('MCC', best.mcc.values[0])
        print(best.models.values[0])
        print('sample_percentage', best.sample_percentage.values[0])
        print('feature_percentage', best.feature_percentage.values[0])
        aux = {}
        aux['sample_percentage'] = round(best.sample_percentage.values[0], 2)
        aux['feature_percentage'] = round(best.feature_percentage.values[0], 2)
        aux['model'] = best.models.values[0]
        flaml_values[dataset] = aux
    except:
        flaml_values[dataset] = {}
    


Amazon_employee_access
MCC 0.4832211417745258
lgbm
sample_percentage 0.7499904634751097
feature_percentage 1.0

Australian
MCC 0.8535852770771386
xgb_limitdepth
sample_percentage 0.75
feature_percentage 1.0

adult
MCC 0.6585489292783278
lgbm
sample_percentage 0.7499808051595731
feature_percentage 1.0

bank-marketing
MCC 0.5534720079137596
lgbm
sample_percentage 0.7499792640106169
feature_percentage 1.0

cnae-9
MCC 0.968820085006422
xgb_limitdepth
sample_percentage 0.75
feature_percentage 1.0

credit-g
MCC 0.5773728780150095
lgbm
sample_percentage 0.75
feature_percentage 1.0

kr-vs-kp

mfeat-factors
MCC 0.98615220164225
xgboost
sample_percentage 0.75
feature_percentage 1.0


In [12]:
flaml_df = pd.DataFrame(flaml_values).T
flaml_df.reset_index(inplace=True)
flaml_df = flaml_df.rename(columns={'index': 'dataset'})
flaml_df.insert(1, 'Framework', 'FLAML')

In [13]:
flaml_df

Unnamed: 0,dataset,Framework,sample_percentage,feature_percentage,model
0,Amazon_employee_access,FLAML,0.75,1.0,lgbm
1,Australian,FLAML,0.75,1.0,xgb_limitdepth
2,adult,FLAML,0.75,1.0,lgbm
3,bank-marketing,FLAML,0.75,1.0,lgbm
4,cnae-9,FLAML,0.75,1.0,xgb_limitdepth
5,credit-g,FLAML,0.75,1.0,lgbm
6,kr-vs-kp,FLAML,,,
7,mfeat-factors,FLAML,0.75,1.0,xgboost


In [14]:
flaml_models_map = {
    'lgbm' : 'LGBMClassifier',
    'xgboost' : 'XGBClassifier',
    'xgb_limitdepth' : 'XGBClassifier',
    'rf' : 'RandomForestClassifier',
    'lrl1' : 'LogisticRegression',
    'lrl2' : 'LogisticRegression',
    'kneighbor' : 'KNeighborsClassifier',
    'extra_tree' : 'ExtraTreesClassifier',
}

In [15]:
flaml_df.model = flaml_df.model.map(flaml_models_map)

In [16]:
# TPOT bests
tpot_best_models = {}
for dataset in datasets:
    models = []
    mccs = []
    sample_percentage = []
    feature_percentage = []
    for run in range(30):
        path = os.path.join('..', 'thesis-results', 'datasets-divided', 'tpot', dataset, f'run_{run}', 'results.json')
        with open(path, 'r') as f:
            results = json.load(f)
            models = models + results['tpot_best_pipeline']
            mccs = mccs + results['tpot_mcc']
            sample_percentage = sample_percentage + results['tpot_sample_%']
            feature_percentage = feature_percentage + results['tpot_features_%']
    tpot_best_models[dataset] = {
        'mcc' : mccs,
        'models' : models,
        'sample_percentage' : sample_percentage,
        'feature_percentage' : feature_percentage
    }

In [17]:
tpot_values = {}
for dataset in datasets:
    print()
    print(dataset)
    try:
        df = pd.DataFrame(tpot_best_models[dataset])
        best = df.loc[df.mcc == df.mcc.max()]
        print('MCC', best.mcc.values[0])
        print(best.models.values[0])
        print('sample_percentage', best.sample_percentage.values[0])
        print('feature_percentage', best.feature_percentage.values[0])
        tpot_values[dataset] = best.models.values[0]
    except:
        tpot_values[dataset] = None


Amazon_employee_access
MCC 0.5377234798413941
[('stackingestimator', StackingEstimator(estimator=RandomForestClassifier(bootstrap=False,
                                                   criterion='entropy',
                                                   max_features=0.25,
                                                   min_samples_leaf=4,
                                                   min_samples_split=8,
                                                   random_state=522))), ('extratreesclassifier', ExtraTreesClassifier(max_features=0.35000000000000003, min_samples_leaf=13,
                     min_samples_split=18, random_state=522))]
sample_percentage 1.0
feature_percentage 1.0

Australian
MCC 0.8271742062290949
[('rfe', RFE(estimator=ExtraTreesClassifier(max_features=0.3, random_state=28),
    step=0.7000000000000001)), ('lgbmclassifier', LGBMClassifier(colsample_bytree=0.5, learning_rate=0.01, min_child_samples=22,
               n_estimators=377, num_leaves=719, ran

In [20]:
tpot_values = {
    'Amazon_employee_access' : {
        'sample_percentage' : 1.0,
        'feature_percentage' : 1.0,
        'model' : 'StackingEstimator (RandomForestClassifier, ExtraTreesClassifier)'
    },
    'adult' : {},
    'Australian' : {
        'sample_percentage' : 1.0,
        'feature_percentage' : f'0.5 (RFE)',
        'model' : 'LGBMClassifier'
    },
    'bank-marketing' : {},
    'cnae-9' : {
        'sample_percentage' : 1.0, 
        'feature_percentage' : 1.0,
        'model' : 'StackingEstimator (ExtraTreesClassifier, StandardScaler, LogisticRegression)'
    },
    'credit-g' : {},
    'kr-vs-kp' : {},
    'mfeat-factors' : {
        'sample_percentage' : 1.0, 
        'feature_percentage' : 1.0,
        'scaler' : 'StandardScaler',
        'model' : 'LogisticRegression'
    }

}

In [21]:
tpot_df = pd.DataFrame(tpot_values).T
tpot_df.reset_index(inplace=True)
tpot_df = tpot_df.rename(columns={'index': 'dataset'})
tpot_df.insert(1, 'Framework', 'TPOT')
tpot_df

Unnamed: 0,dataset,Framework,sample_percentage,feature_percentage,model,scaler
0,Amazon_employee_access,TPOT,1.0,1.0,"StackingEstimator (RandomForestClassifier, Ext...",
1,adult,TPOT,,,,
2,Australian,TPOT,1.0,0.5 (RFE),LGBMClassifier,
3,bank-marketing,TPOT,,,,
4,cnae-9,TPOT,1.0,1.0,"StackingEstimator (ExtraTreesClassifier, Stand...",
5,credit-g,TPOT,,,,
6,kr-vs-kp,TPOT,,,,
7,mfeat-factors,TPOT,1.0,1.0,LogisticRegression,StandardScaler


In [22]:
models = pd.concat([edca_df, flaml_df, tpot_df])
models.dataset = models.dataset.str.lower()
models = models.sort_values(by=['dataset', 'Framework'])
models

Unnamed: 0,dataset,Framework,sample_percentage,feature_percentage,scaler,model,categorical-imputer,encoder
2,adult,EDCA,0.63,1.0,StandardScaler,XGBClassifier,SimpleImputer,OneHotEncoder
2,adult,FLAML,0.75,1.0,,LGBMClassifier,,
1,adult,TPOT,,,,,,
0,amazon_employee_access,EDCA,0.75,0.56,MinMaxScaler,RandomForestClassifier,,
0,amazon_employee_access,FLAML,0.75,1.0,,LGBMClassifier,,
0,amazon_employee_access,TPOT,1.0,1.0,,"StackingEstimator (RandomForestClassifier, Ext...",,
1,australian,EDCA,0.33,1.0,MinMaxScaler,XGBClassifier,,
1,australian,FLAML,0.75,1.0,,XGBClassifier,,
2,australian,TPOT,1.0,0.5 (RFE),,LGBMClassifier,,
3,bank-marketing,EDCA,0.64,0.69,MinMaxScaler,XGBClassifier,,OneHotEncoder


In [23]:
import numpy as np

In [24]:
for col in models.columns:
    models.loc[models[col].isnull(), col] = '-'
    models[col] = models[col].astype(str)

In [25]:
column_format = 'r|' * models.shape[1]
column_format = column_format.removesuffix('|')

In [26]:
latex = models.to_latex(index=False,
                  formatters={"name": str.upper},
                  float_format="{:.2f}".format,
                  column_format=column_format)

In [27]:
print(latex)

\begin{tabular}{r|r|r|r|r|r|r|r}
\toprule
dataset & Framework & sample_percentage & feature_percentage & scaler & model & categorical-imputer & encoder \\
\midrule
adult & EDCA & 0.63 & 1.0 & StandardScaler & XGBClassifier & SimpleImputer & OneHotEncoder \\
adult & FLAML & 0.75 & 1.0 & - & LGBMClassifier & - & - \\
adult & TPOT & - & - & - & - & - & - \\
amazon_employee_access & EDCA & 0.75 & 0.56 & MinMaxScaler & RandomForestClassifier & - & - \\
amazon_employee_access & FLAML & 0.75 & 1.0 & - & LGBMClassifier & - & - \\
amazon_employee_access & TPOT & 1.0 & 1.0 & - & StackingEstimator (RandomForestClassifier, ExtraTreesClassifier) & - & - \\
australian & EDCA & 0.33 & 1.0 & MinMaxScaler & XGBClassifier & - & - \\
australian & FLAML & 0.75 & 1.0 & - & XGBClassifier & - & - \\
australian & TPOT & 1.0 & 0.5 (RFE) & - & LGBMClassifier & - & - \\
bank-marketing & EDCA & 0.64 & 0.69 & MinMaxScaler & XGBClassifier & - & OneHotEncoder \\
bank-marketing & FLAML & 0.75 & 1.0 & - & LGBMClassifi