# Comparing Strategies

In [1]:
import altair as alt
import pandas as pd
from performance_visualizer import load_search_performances, plot_search_performances

In [10]:
# Load the performances stored in ablation_full.json.
# The second argument is presumably the method label attached to these runs in
# tables and plots -- TODO confirm against performance_visualizer.
file_path = 'resource/ablation_full.json'
full_performances = load_search_performances(file_path, 'Automatic Grammar')

In [11]:
# Load the performances stored in ablation_notuning.json.
# 'No Tuning' is presumably the method label for these runs -- TODO confirm.
file_path = 'resource/ablation_notuning.json'
notuning_performances = load_search_performances(file_path, 'No Tuning')

In [12]:
# Load the performances stored in ablation_noprioritization.json.
# 'No Prioritization' is presumably the method label for these runs -- TODO confirm.
# NOTE(review): the variable is spelled "nopriorization" (missing "ti"); the
# spelling is kept because other cells may refer to it.
file_path = 'resource/ablation_noprioritization.json'
nopriorization_performances = load_search_performances(file_path, 'No Prioritization')

In [13]:
# Load the performances stored in ablation_noautogrammar.json.
# 'Manual Grammar' is presumably the method label for these runs -- TODO confirm.
file_path = 'resource/ablation_noautogrammar.json'
noautomatic_performances = load_search_performances(file_path, 'Manual Grammar')

In [6]:
# Dataset selected for the single-dataset charts; the plotting cells filter
# their frames with `== dataset`.
dataset = '1567_poker_hand_MIN_METADATA'

In [16]:
# Only the automatic-grammar and manual-grammar runs are combined here. To
# compare every ablation, add notuning_performances and
# nopriorization_performances to the list below.
all_performances = pd.concat(
    [full_performances, noautomatic_performances],
    ignore_index=True,
)
plot_search_performances(all_performances, dataset)

In [8]:
# Show the combined performance frame (rendered as a table by the notebook).
all_performances


Unnamed: 0,dataset,method,score,time
0,185_baseball_MIN_METADATA,Full,0.687978,2022-03-03 00:23:23.207824
1,185_baseball_MIN_METADATA,Full,0.676509,2022-03-03 00:43:16.175425
2,185_baseball_MIN_METADATA,Full,0.673911,2022-03-03 00:08:31.297611
3,185_baseball_MIN_METADATA,Full,0.673911,2022-03-03 00:58:57.215141
4,185_baseball_MIN_METADATA,Full,0.670778,2022-03-03 00:05:18.502620
...,...,...,...,...
625,LL0_186_braziltourism_MIN_METADATA,No Automatic Grammar,0.092257,2022-03-03 00:17:49.388830
626,LL0_186_braziltourism_MIN_METADATA,No Automatic Grammar,0.063329,2022-03-03 00:46:52.110210
627,LL0_186_braziltourism_MIN_METADATA,No Automatic Grammar,0.060314,2022-03-03 00:39:05.460236
628,LL0_186_braziltourism_MIN_METADATA,No Automatic Grammar,0.009380,2022-03-03 00:13:27.470172


In [24]:
# Swap the long dataset identifiers for short anonymous labels. replace()
# returns a new frame, so all_performances itself is left untouched.
# To leave a dataset out of the figure, filter it first, e.g.
# all_performances[all_performances.dataset != 'LL0_186_braziltourism_MIN_METADATA'].
df = all_performances.replace({
    '185_baseball_MIN_METADATA': 'dataset_1',
    '299_libras_move_MIN_METADATA': 'dataset_2',
    '1567_poker_hand_MIN_METADATA': 'dataset_3',
    'LL1_GS_process_classification_tabular_MIN_METADATA': 'dataset_4',
    'LL0_186_braziltourism_MIN_METADATA': 'dataset_5',
})

# Layer 1: one filled point per method at the mean score. The x axis is hidden
# because the colour legend already identifies the methods.
bars = alt.Chart().mark_point(filled=True, size=40).encode(
    x=alt.X(
        'method',
        scale=alt.Scale(zero=True),
        axis=alt.Axis(grid=False, title=None, labels=False, ticks=False),
    ),
    y=alt.Y('score', aggregate='mean', axis=alt.Axis(grid=False)),
    color=alt.Color(
        'method',
        legend=alt.Legend(
            title='',
            orient='none',
            legendX=50,
            legendY=-20,
            direction='horizontal',
            titleAnchor='middle',
        ),
    ),
)

# Layer 2: standard-deviation error bars for the same method/score pairs.
limits = alt.Chart().mark_errorbar(extent='stdev').encode(
    x=alt.X('method:N'),
    y=alt.Y('score:Q', scale=alt.Scale(zero=False), title='Scores'),
    color=alt.Color('method'),
)

# Both layers share `df` and are repeated once per dataset (one column each).
# The chart title is placed at the bottom so that 'Datasets' reads as a caption
# under the facet labels.
(
    alt.layer(bars, limits, data=df)
    .facet(
        column=alt.Column(
            'dataset:N',
            header=alt.Header(title=None, labelOrient='bottom'),
        ),
    )
    .configure_view(strokeWidth=0.0, continuousWidth=10, continuousHeight=180)
    .configure_title(fontSize=11, anchor='middle', color='black', orient='bottom')
    .properties(title='Datasets')
)

In [9]:
import os
from os.path import join, exists
def collect_new_scores(mode):
    """Collect the score reported for each dataset of an evaluation run.

    Every sub-directory of ``../../evaluations/results/<mode>/`` is treated as
    a dataset. When it contains ``output/temp/statistics_datasets.csv`` (a
    tab-separated file without a header row), the fifth column of the first
    row is taken as the score.

    Datasets with no statistics file, or whose score is missing, are skipped.
    A missing score is either the literal string ``'None'`` or NaN: recent
    pandas versions already parse ``'None'`` as NaN when reading the file, so
    comparing against ``None`` alone would let NaN scores through.

    Args:
        mode: Name of the results sub-folder, e.g. ``'img-full'``.

    Returns:
        A list of ``(dataset, score)`` tuples sorted by dataset name, with
        each score rounded to three decimals.

    Raises:
        FileNotFoundError: If the results folder for ``mode`` does not exist.
    """
    folder_path = '../../evaluations/results/%s/' % mode
    new_scores = []

    # Iterating in sorted order keeps the result sorted by dataset name.
    for dataset in sorted(os.listdir(folder_path)):
        if not os.path.isdir(join(folder_path, dataset)):
            continue
        csv_path = join(folder_path, dataset, 'output/temp/statistics_datasets.csv')
        if not exists(csv_path):
            continue

        data = pd.read_csv(csv_path, header=None, sep='\t')
        score = data.iloc[0, 4]
        # Skip runs without a score, however pandas chose to represent it.
        if pd.isna(score) or (isinstance(score, str) and score == 'None'):
            continue
        new_scores.append((dataset, round(float(score), 3)))

    return new_scores

In [14]:
# (dataset, score) pairs for the 'img-full' results folder.
print(collect_new_scores(mode='img-full'))

[('1567_poker_hand_MIN_METADATA', 0.273), ('185_baseball_MIN_METADATA', 0.688), ('299_libras_move_MIN_METADATA', 0.981), ('LL0_186_braziltourism_MIN_METADATA', 0.145), ('LL1_GS_process_classification_tabular_MIN_METADATA', 0.788)]


In [15]:
# (dataset, score) pairs for the 'img-no_autogrammar' results folder.
print(collect_new_scores(mode='img-no_autogrammar'))

[('299_libras_move_MIN_METADATA', 0.98), ('LL0_186_braziltourism_MIN_METADATA', 0.159)]


In [16]:
# (dataset, score) pairs for the 'img-no_priorization' results folder.
print(collect_new_scores(mode='img-no_priorization'))

[('1567_poker_hand_MIN_METADATA', 0.131), ('185_baseball_MIN_METADATA', 0.699), ('299_libras_move_MIN_METADATA', 0.975), ('LL0_186_braziltourism_MIN_METADATA', 0.145), ('LL1_GS_process_classification_tabular_MIN_METADATA', 0.788)]


In [17]:
# (dataset, score) pairs for the 'img-no_tuning' results folder.
print(collect_new_scores(mode='img-no_tuning'))

[('1567_poker_hand_MIN_METADATA', 0.273), ('185_baseball_MIN_METADATA', 0.613), ('299_libras_move_MIN_METADATA', 0.981), ('LL0_186_braziltourism_MIN_METADATA', 0.145), ('LL1_GS_process_classification_tabular_MIN_METADATA', 0.803)]


In [18]:
# Box plot of the scores per method for the selected dataset.
# A box plot draws the median and quartiles, not the mean +/- standard
# deviation, so the title now describes what is actually shown.
alt.Chart(all_performances[all_performances['dataset'] == dataset]).mark_boxplot().encode(
    x=alt.X('method:N'),
    y=alt.Y('score:Q', scale=alt.Scale(zero=False))
).properties(width=300, height=300, title='Score Distribution per Method')

In [52]:
# Filter once and share the subset between both layers instead of evaluating
# the same boolean mask twice.
dataset_performances = all_performances[all_performances['dataset'] == dataset]

# Standard-deviation error bars of the scores per method.
error_bars = alt.Chart(dataset_performances).mark_errorbar(extent='stdev').encode(
  x=alt.X('method:N'),
  y=alt.Y('score:Q', scale=alt.Scale(zero=False))
)

# Mean score per method, drawn as a filled point over the error bar.
points = alt.Chart(dataset_performances).mark_point(filled=True).encode(
  x=alt.X('method:N'),
  y=alt.Y('score:Q', aggregate='mean'),
)

# This is the chart that shows mean +/- standard deviation, so it carries a
# title saying so.
(error_bars + points).properties(title='Mean +/- Standard Deviation')

In [55]:
# Count the rows recorded for every (dataset, method) pair and expose the count
# as a regular 'pipelines' column.
pipelines_counter = (
    all_performances
    .groupby(['dataset', 'method'])
    .size()
    .rename('pipelines')
    .reset_index()
)

In [56]:
# Show the per-dataset, per-method pipeline counts.
pipelines_counter

Unnamed: 0,dataset,method,pipelines
0,1567_poker_hand_MIN_METADATA,Full,50
1,1567_poker_hand_MIN_METADATA,No Automatic Grammar,50
2,1567_poker_hand_MIN_METADATA,No Prioritization,50
3,1567_poker_hand_MIN_METADATA,No Tuning,50
4,185_baseball_MIN_METADATA,Full,50
5,185_baseball_MIN_METADATA,No Automatic Grammar,1
6,185_baseball_MIN_METADATA,No Prioritization,50
7,185_baseball_MIN_METADATA,No Tuning,50
8,299_libras_move_MIN_METADATA,Full,50
9,299_libras_move_MIN_METADATA,No Automatic Grammar,50


In [57]:
# One bar per method with the number of pipelines for the selected dataset.
bars = (
    alt.Chart(pipelines_counter[pipelines_counter['dataset'] == dataset])
    .mark_bar()
    .encode(x='method:N', y='pipelines:Q')
)
# Reuse the bar encoding and print the count as a text mark on each bar.
text = bars.mark_text(align='center', baseline='middle').encode(text='pipelines:Q')
alt.layer(bars, text).properties(height=200, title='Number of Pipelines')