In [13]:
from alphad3m.metalearning.grammar_builder import load_related_pipelines, extract_patterns
from IPython.display import IFrame

# From Manual to Automatic Grammar

#### Manual grammar:

S -> IMPUTATION ENCODERS FEATURE_SCALING FEATURE_SELECTION CLASSIFICATION
ENCODERS -> TEXT_FEATURIZER CATEGORICAL_ENCODER
IMPUTATION -> 'd3m.primitives.data_cleaning.imputer.SKlearn'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.generic_univariate_select.SKlearn'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.select_fwe.SKlearn'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.select_percentile.SKlearn'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.variance_threshold.SKlearn'
FEATURE_SELECTION -> 'E'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.max_abs_scaler.SKlearn'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.min_max_scaler.SKlearn'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.robust_scaler.SKlearn'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.standard_scaler.SKlearn'
FEATURE_SCALING -> 'E'
CLASSIFICATION -> 'd3m.primitives.classification.ada_boost.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.bagging.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.bernoulli_naive_bayes.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.decision_tree.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.extra_trees.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.gaussian_naive_bayes.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.gradient_boosting.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.linear_discriminant_analysis.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.linear_svc.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.logistic_regression.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.multinomial_naive_bayes.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.passive_aggressive.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.quadratic_discriminant_analysis.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.random_forest.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.sgd.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.svc.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.light_gbm.Common'
CLASSIFICATION -> 'd3m.primitives.classification.xgboost_dart.Common'
CLASSIFICATION -> 'd3m.primitives.classification.xgboost_gbtree.Common'
CATEGORICAL_ENCODER -> 'd3m.primitives.data_transformation.one_hot_encoder.SKlearn'
TEXT_FEATURIZER -> 'd3m.primitives.feature_extraction.tfidf_vectorizer.SKlearn'
TEXT_FEATURIZER -> 'd3m.primitives.data_transformation.encoder.DistilTextEncoder'
TEXT_FEATURIZER -> 'd3m.primitives.feature_construction.corex_text.DSBOX'

#### Steps to build the automatic grammar:

1. Select pipelines that solve similar problems (task-oriented)
    - Info from problem: task keywords
    - Info from dataset: semantic types, missing values
2. Create a portfolio of pipelines
    - Auto-Sklearn uses a metric named "distance to minimum" (each performance is transformed into its distance
      to the best observed performance)
3. Transform the pipelines to their types
    - 'imputer.SKlearn' 'random_forest.SKlearn' -> IMPUTATION CLASSIFICATION
4. Use "primitive importance" to prioritize primitives
    - Try first Random Forest, then xgboost, etc.

In [39]:
# Description of the problem for which related pipelines are looked up.
# NOTE(review): absolute, machine-specific path; adjust it before re-running elsewhere.
dataset_path = '/Users/rlopez/D3M/datasets/seed_datasets_current/185_baseball_MIN_METADATA/TRAIN/dataset_TRAIN/tables/learningData.csv'
target_column = 'Hall_of_Fame'
task_keywords = ['classification', 'multiClass', 'tabular']
# Fetch the pipelines related to this dataset/task (the log output below shows
# them being read from the metalearning database).
pipelines = load_related_pipelines(dataset_path, target_column, task_keywords)

2021-09-28 23:15:05,785 INFO Loading primitives info from file
2021-09-28 23:15:05,788 INFO Loading primitives info from file
2021-09-28 23:15:05,789 INFO Loading pipelines from metalearning database...
2021-09-28 23:15:28,564 INFO Found 917890 pipelines in metalearning database
2021-09-28 23:15:28,566 INFO File size: 119402 bytes
2021-09-28 23:15:28,568 INFO Loading dataframe...
2021-09-28 23:15:28,591 INFO Dataframe loaded, 1073 rows, 19 columns
2021-09-28 23:15:28,593 INFO Setting column names from header
2021-09-28 23:15:28,594 INFO Identifying types, 19 columns...
2021-09-28 23:15:28,596 INFO Processing column 0 'd3mIndex'...
2021-09-28 23:15:28,677 INFO Column type http://schema.org/Integer [http://schema.org/identifier]
2021-09-28 23:15:28,678 INFO Processing column 1 'Player'...
2021-09-28 23:15:28,719 INFO Column type http://schema.org/Text []
2021-09-28 23:15:28,719 INFO Processing column 2 'Number_seasons'...
2021-09-28 23:15:28,787 INFO Column type http://schema.org/Integer

#### Automatic grammar:

S -> NUMERICAL_METHOD DATA_CONVERSION CATEGORICAL_ENCODER DATA_CONVERSION IMPUTATION FEATURE_SCALING DATA_CONVERSION FEATURE_SELECTION CLASSIFICATION
S -> TEXT_FEATURIZER CATEGORICAL_ENCODER FEATURE_SCALING IMPUTATION FEATURE_SELECTION CLASSIFICATION
S -> IMPUTATION TEXT_FEATURIZER CATEGORICAL_ENCODER FEATURE_SCALING CLASSIFICATION
S -> IMPUTATION CATEGORICAL_ENCODER DATA_CONVERSION FEATURE_SCALING CLASSIFICATION
S -> IMPUTATION CATEGORICAL_ENCODER DIMENSIONALITY_REDUCTION CLASSIFICATION
S -> IMPUTATION CATEGORICAL_ENCODER FEATURE_SELECTION CLASSIFICATION
S -> FEATURE_SELECTION IMPUTATION CLASSIFICATION
CATEGORICAL_ENCODER -> 'd3m.primitives.data_transformation.encoder.DSBOX'
CATEGORICAL_ENCODER -> 'd3m.primitives.data_transformation.one_hot_encoder.DistilOneHotEncoder'
CATEGORICAL_ENCODER -> 'd3m.primitives.data_transformation.one_hot_encoder.SKlearn'
CATEGORICAL_ENCODER -> 'd3m.primitives.data_transformation.unary_encoder.DSBOX'
CATEGORICAL_ENCODER -> 'E'
CLASSIFICATION -> 'd3m.primitives.classification.ada_boost.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.bagging.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.decision_tree.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.extra_trees.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.gradient_boosting.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.k_neighbors.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.light_gbm.Common'
CLASSIFICATION -> 'd3m.primitives.classification.linear_discriminant_analysis.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.linear_svc.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.logistic_regression.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.mlp.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.nearest_centroid.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.random_forest.Common'
CLASSIFICATION -> 'd3m.primitives.classification.random_forest.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.search_hybrid.Find_projections'
CLASSIFICATION -> 'd3m.primitives.classification.sgd.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.svc.SKlearn'
CLASSIFICATION -> 'd3m.primitives.classification.xgboost_dart.Common'
CLASSIFICATION -> 'd3m.primitives.classification.xgboost_gbtree.Common'
DATA_CONVERSION -> 'd3m.primitives.data_cleaning.cleaning_featurizer.DSBOX'
DATA_CONVERSION -> 'd3m.primitives.data_transformation.to_numeric.DSBOX'
DATA_CONVERSION -> 'E'
DIMENSIONALITY_REDUCTION -> 'd3m.primitives.feature_extraction.pca.SKlearn'
DIMENSIONALITY_REDUCTION -> 'd3m.primitives.feature_selection.pca_features.Pcafeatures'
DIMENSIONALITY_REDUCTION -> 'E'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.min_max_scaler.SKlearn'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.quantile_transformer.SKlearn'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.robust_scaler.SKlearn'
FEATURE_SCALING -> 'd3m.primitives.data_cleaning.standard_scaler.SKlearn'
FEATURE_SCALING -> 'd3m.primitives.normalization.iqr_scaler.DSBOX'
FEATURE_SCALING -> 'E'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.joint_mutual_information.AutoRPI'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.rffeatures.Rffeatures'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.select_fwe.SKlearn'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.select_percentile.SKlearn'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.simultaneous_markov_blanket.AutoRPI'
FEATURE_SELECTION -> 'd3m.primitives.feature_selection.variance_threshold.SKlearn'
FEATURE_SELECTION -> 'E'
IMPUTATION -> 'd3m.primitives.data_cleaning.imputer.SKlearn'
IMPUTATION -> 'd3m.primitives.data_cleaning.iterative_regression_imputation.DSBOX'
IMPUTATION -> 'd3m.primitives.data_cleaning.mean_imputation.DSBOX'
IMPUTATION -> 'd3m.primitives.data_preprocessing.random_sampling_imputer.BYU'
IMPUTATION -> 'E'
NUMERICAL_METHOD -> 'd3m.primitives.data_transformation.do_nothing_for_dataset.DSBOX'
TEXT_FEATURIZER -> 'd3m.primitives.data_transformation.encoder.DistilTextEncoder'
TEXT_FEATURIZER -> 'E'

# Coverage of Patterns

#### Simple dataset:

In [24]:
# Inputs for the "simple dataset" coverage experiment.
# task_name selects the rule read by load_manual_patterns; an empty encoders
# list makes that helper drop the ENCODERS symbol from the rule.
task_name = 'CLASSIFICATION_TASK'
encoders = [] 
use_imputer = True
# NOTE(review): absolute, machine-specific path; adjust it before re-running elsewhere.
dataset_path = '/Users/rlopez/D3M/datasets/seed_datasets_current/LL0_186_braziltourism_MIN_METADATA/TRAIN/dataset_TRAIN/tables/learningData.csv'
target_column = 'Trips'
task_keywords = ['classification', 'multiClass', 'tabular']

In [26]:
# Expand the hand-written grammar rule into its list of patterns.
# NOTE(review): load_manual_patterns is defined in the last cell of this
# notebook, so that cell has to be executed before this one.
manual_patterns = load_manual_patterns(task_name, encoders, use_imputer)

In [25]:
# Patterns extracted from the pipelines related to this dataset/task.
# NOTE(review): load_automatic_patterns is defined in the last cell of this
# notebook, so that cell has to be executed before this one.
automatic_patterns = load_automatic_patterns(dataset_path, target_column, task_keywords)

2021-09-28 16:10:57,737 INFO Loading primitives info from file
2021-09-28 16:10:57,740 INFO Loading primitives info from file
2021-09-28 16:10:57,741 INFO Loading pipelines from metalearning database...
2021-09-28 16:11:36,990 INFO Found 917890 pipelines in metalearning database
2021-09-28 16:11:37,004 INFO File size: 16646 bytes
2021-09-28 16:11:37,007 INFO Loading dataframe...
2021-09-28 16:11:37,025 INFO Dataframe loaded, 330 rows, 10 columns
2021-09-28 16:11:37,026 INFO Setting column names from header
2021-09-28 16:11:37,027 INFO Identifying types, 10 columns...
2021-09-28 16:11:37,028 INFO Processing column 0 'd3mIndex'...
2021-09-28 16:11:37,059 INFO Column type http://schema.org/Integer [http://schema.org/identifier]
2021-09-28 16:11:37,059 INFO Processing column 1 'Age'...
2021-09-28 16:11:37,088 INFO Column type http://schema.org/Integer []
2021-09-28 16:11:37,089 INFO Processing column 2 'Sex'...
2021-09-28 16:11:37,111 INFO Column type http://schema.org/Integer []
2021-09-2

In [27]:
# Percentage of manual patterns that also occur among the automatic patterns
# (symbol order is ignored because `ordered` keeps its default of False).
calculate_coverage(manual_patterns, automatic_patterns)

Coverage: 100%


#### Regular dataset:

In [47]:
# Inputs for the "regular dataset" coverage experiment.
# Here the ENCODERS symbol of the manual rule is replaced by the two listed encoders.
task_name = 'CLASSIFICATION_TASK'
encoders = ['TEXT_FEATURIZER', 'CATEGORICAL_ENCODER'] 
use_imputer = True
# NOTE(review): absolute, machine-specific path; adjust it before re-running elsewhere.
dataset_path = '/Users/rlopez/D3M/datasets/seed_datasets_current/185_baseball_MIN_METADATA/TRAIN/dataset_TRAIN/tables/learningData.csv'
target_column = 'Hall_of_Fame'
task_keywords = ['classification', 'multiClass', 'tabular']

In [48]:
# Expand the hand-written grammar rule into its list of patterns.
# NOTE(review): load_manual_patterns is defined in the last cell of this
# notebook, so that cell has to be executed before this one.
manual_patterns = load_manual_patterns(task_name, encoders, use_imputer)

In [49]:
# Patterns extracted from the pipelines related to this dataset/task.
# NOTE(review): load_automatic_patterns is defined in the last cell of this
# notebook, so that cell has to be executed before this one.
automatic_patterns = load_automatic_patterns(dataset_path, target_column, task_keywords)

2021-09-29 10:35:06,860 INFO Loading primitives info from file
2021-09-29 10:35:06,875 INFO Loading primitives info from file
2021-09-29 10:35:06,895 INFO Loading pipelines from metalearning database...
2021-09-29 10:35:35,179 INFO Found 917890 pipelines in metalearning database
2021-09-29 10:35:35,181 INFO File size: 119402 bytes
2021-09-29 10:35:35,185 INFO Loading dataframe...
2021-09-29 10:35:35,251 INFO Dataframe loaded, 1073 rows, 19 columns
2021-09-29 10:35:35,252 INFO Setting column names from header
2021-09-29 10:35:35,253 INFO Identifying types, 19 columns...
2021-09-29 10:35:35,255 INFO Processing column 0 'd3mIndex'...
2021-09-29 10:35:35,362 INFO Column type http://schema.org/Integer [http://schema.org/identifier]
2021-09-29 10:35:35,363 INFO Processing column 1 'Player'...
2021-09-29 10:35:35,414 INFO Column type http://schema.org/Text []
2021-09-29 10:35:35,415 INFO Processing column 2 'Number_seasons'...
2021-09-29 10:35:35,503 INFO Column type http://schema.org/Integer

In [23]:
# Percentage of manual patterns that also occur among the automatic patterns
# (symbol order ignored).
# NOTE(review): this cell's execution counter is lower than that of the cells
# that build its inputs, so the stored output may come from an earlier run; re-run to confirm.
calculate_coverage(manual_patterns, automatic_patterns)

Not found: ['IMPUTATION', 'TEXT_FEATURIZER', 'CATEGORICAL_ENCODER', 'FEATURE_SELECTION', 'CLASSIFICATION']
Coverage: 75%


In [50]:
# Reverse direction: percentage of automatic patterns that also occur among
# the manual patterns (symbol order ignored).
calculate_coverage(automatic_patterns, manual_patterns)

Not found: ['CLASSIFICATION']
Not found: ['IMPUTATION', 'DIMENSIONALITY_REDUCTION', 'CLASSIFICATION']
Not found: ['FEATURE_SELECTION', 'CLASSIFICATION']
Not found: ['IMPUTATION', 'FEATURE_SELECTION', 'CLASSIFICATION']
Not found: ['NUMERICAL_METHOD', 'DATA_CONVERSION', 'CATEGORICAL_ENCODER', 'DATA_CONVERSION', 'IMPUTATION', 'FEATURE_SCALING', 'DATA_CONVERSION', 'FEATURE_SELECTION', 'CLASSIFICATION']
Not found: ['IMPUTATION', 'CATEGORICAL_ENCODER', 'DATA_CONVERSION', 'FEATURE_SCALING', 'CLASSIFICATION']
Not found: ['FEATURE_SELECTION', 'IMPUTATION', 'CLASSIFICATION']
Not found: ['IMPUTATION', 'CATEGORICAL_ENCODER', 'DIMENSIONALITY_REDUCTION', 'CLASSIFICATION']
Not found: ['IMPUTATION', 'CLASSIFICATION']
Not found: ['IMPUTATION', 'CATEGORICAL_ENCODER', 'FEATURE_SELECTION', 'CLASSIFICATION']
Not found: ['IMPUTATION', 'CATEGORICAL_ENCODER', 'FEATURE_SCALING', 'CLASSIFICATION']
Not found: ['IMPUTATION', 'TEXT_FEATURIZER', 'CLASSIFICATION']
Coverage: 25%


# Correlation Between Primitives Usage and Performance

#### Global Correlations:

In [None]:
{
   "DATA_CONVERSION":[
      "(""d3m.primitives.data_transformation.to_numeric.DSBOX",
      0.5471),
      "(""d3m.primitives.data_cleaning.cleaning_featurizer.DSBOX",
      0.4494)
   ],
   "FEATURE_SELECTION":[
      "(""d3m.primitives.feature_selection.rffeatures.Rffeatures",
      0.5631),
      "(""d3m.primitives.feature_selection.variance_threshold.SKlearn",
      0.5476),
      "(""d3m.primitives.feature_selection.select_percentile.SKlearn",
      0.5244),
      "(""d3m.primitives.feature_selection.select_fwe.SKlearn",
      0.455),
      "(""d3m.primitives.feature_selection.joint_mutual_information.AutoRPI",
      0.4189),
      "(""d3m.primitives.feature_selection.simultaneous_markov_blanket.AutoRPI",
      0.3965)
   ],
   "CLASSIFICATION":[
      "(""d3m.primitives.classification.xgboost_gbtree.Common",
      0.6204),
      "(""d3m.primitives.classification.decision_tree.SKlearn",
      0.5925),
      "(""d3m.primitives.classification.bagging.SKlearn",
      0.5611),
      "(""d3m.primitives.classification.random_forest.Common",
      0.556),
      "(""d3m.primitives.classification.random_forest.SKlearn",
      0.5482),
      "(""d3m.primitives.classification.sgd.SKlearn",
      0.5159),
      "(""d3m.primitives.classification.mlp.SKlearn",
      0.5134),
      "(""d3m.primitives.classification.xgboost_dart.Common",
      0.5101),
      "(""d3m.primitives.classification.search_hybrid.Find_projections",
      0.5088),
      "(""d3m.primitives.classification.light_gbm.Common",
      0.5008),
      "(""d3m.primitives.classification.gradient_boosting.SKlearn",
      0.4941),
      "(""d3m.primitives.classification.linear_discriminant_analysis.SKlearn",
      0.4897),
      "(""d3m.primitives.classification.ada_boost.SKlearn",
      0.4739),
      "(""d3m.primitives.classification.svc.SKlearn",
      0.4723),
      "(""d3m.primitives.classification.nearest_centroid.SKlearn",
      0.4642),
      "(""d3m.primitives.classification.extra_trees.SKlearn",
      0.4471),
      "(""d3m.primitives.classification.k_neighbors.SKlearn",
      0.4405),
      "(""d3m.primitives.classification.logistic_regression.SKlearn",
      0.4085),
      "(""d3m.primitives.classification.linear_svc.SKlearn",
      0.3967)
   ],
   "FEATURE_SCALING":[
      "(""d3m.primitives.data_cleaning.robust_scaler.SKlearn",
      0.7428),
      "(""d3m.primitives.data_cleaning.min_max_scaler.SKlearn",
      0.4961),
      "(""d3m.primitives.data_cleaning.standard_scaler.SKlearn",
      0.483),
      "(""d3m.primitives.normalization.iqr_scaler.DSBOX",
      0.4482),
      "(""d3m.primitives.data_cleaning.quantile_transformer.SKlearn",
      0.3811)
   ],
   "CATEGORICAL_ENCODER":[
      "(""d3m.primitives.data_transformation.one_hot_encoder.DistilOneHotEncoder",
      0.5897),
      "(""d3m.primitives.data_transformation.one_hot_encoder.SKlearn",
      0.5784),
      "(""d3m.primitives.data_transformation.unary_encoder.DSBOX",
      0.545),
      "(""d3m.primitives.data_transformation.encoder.DSBOX",
      0.4897)
   ],
   "TEXT_FEATURIZER":[
      "(""d3m.primitives.data_transformation.encoder.DistilTextEncoder",
      0.5876)
   ],
   "IMPUTATION":[
      "(""d3m.primitives.data_cleaning.imputer.SKlearn",
      0.5957),
      "(""d3m.primitives.data_cleaning.iterative_regression_imputation.DSBOX",
      0.545),
      "(""d3m.primitives.data_preprocessing.random_sampling_imputer.BYU",
      0.4017),
      "(""d3m.primitives.data_cleaning.mean_imputation.DSBOX",
      0.3878)
   ],
   "NUMERICAL_METHOD":[
      "(""d3m.primitives.data_transformation.do_nothing_for_dataset.DSBOX",
      0.4494)
   ],
   "DIMENSIONALITY_REDUCTION":[
      "(""d3m.primitives.feature_selection.pca_features.Pcafeatures",
      0.5582),
      "(""d3m.primitives.feature_extraction.pca.SKlearn",
      0.5185)
   ]
}

#### Local Correlations:

In [None]:
{
"FEATURE_SELECTION IMPUTATION CLASSIFICATION":{
      "CLASSIFICATION":[
         "(""d3m.primitives.classification.random_forest.SKlearn",
         0.6661),
         "(""d3m.primitives.classification.extra_trees.SKlearn",
         0.4399),
         "(""d3m.primitives.classification.gradient_boosting.SKlearn",
         0.3366)
      ],
      "IMPUTATION":[
         "(""d3m.primitives.data_cleaning.imputer.SKlearn",
         1.0)
      ],
      "FEATURE_SELECTION":[
         "(""d3m.primitives.feature_selection.joint_mutual_information.AutoRPI",
         0.6891),
         "(""d3m.primitives.feature_selection.simultaneous_markov_blanket.AutoRPI",
         0.3109)
      ]
   },
"CLASSIFICATION":{
      "CLASSIFICATION":[
         "(""d3m.primitives.classification.xgboost_gbtree.Common",
         0.9418),
         "(""d3m.primitives.classification.random_forest.Common",
         0.4221),
         "(""d3m.primitives.classification.light_gbm.Common",
         0.0931)
      ]
   }
}

#### Use the Correlations on the MCTS:

In [92]:
# Embed the referenced arXiv PDF inline (requires network access when rendered).
IFrame('https://arxiv.org/pdf/1905.10345.pdf', width=1000, height=400)

# Comparing Strategies

In [1]:
import altair as alt
import pandas as pd
from performance_visualizer import load_search_performances, plot_search_performances

In [36]:
# Dataset used by the per-dataset plots and filters in the cells below.
dataset = 'LL1_GS_process_classification_tabular_MIN_METADATA'

In [37]:
# Search results obtained with the manual grammar.
# The second argument is presumably the label stored in the 'method' column - TODO confirm.
file_path = 'resource/grammar_manual.json'
manual_performances = load_search_performances(file_path, 'Manual')
plot_search_performances(manual_performances, dataset)

In [38]:
# Search results obtained with the automatic grammar.
# Note that `file_path` is rebound here, so it no longer points at the manual results.
file_path = 'resource/grammar_automatic.json'
automatic_performances = load_search_performances(file_path, 'Automatic')
plot_search_performances(automatic_performances, dataset)

In [39]:
# Stack both result tables (with a fresh 0..n-1 index) so the two grammars can
# be compared in a single plot.
all_performances = pd.concat([manual_performances, automatic_performances], ignore_index=True)
plot_search_performances(all_performances, dataset)

In [63]:
# Short aliases for the dataset identifiers; applied one after another, in this order.
dataset_aliases = {
    '185_baseball_MIN_METADATA': 'dataset_1',
    'LL0_186_braziltourism_MIN_METADATA': 'dataset_2',
    '299_libras_move': 'dataset_3',
    '1567_poker_hand_MIN_METADATA': 'dataset_4',
    'LL1_GS_process_classification_tabular_MIN_METADATA': 'dataset_5',
}
df = all_performances
for full_name, alias in dataset_aliases.items():
    df = df.replace(full_name, alias)

# Pieces shared by both layers.
grammar_colors = alt.Scale(range=['#1976b6', '#e41a1d'])
hidden_x_axis = alt.Axis(grid=False, title=None, labels=False, ticks=False)

# Layer 1: one filled point per grammar at its mean score.
bars = (
    alt.Chart()
    .mark_point(filled=True, size=40)
    .encode(
        x=alt.X('method', scale=alt.Scale(zero=True), axis=hidden_x_axis),
        y=alt.Y('score', axis=alt.Axis(grid=False), aggregate='mean'),
        color=alt.Color('method', scale=grammar_colors, legend=alt.Legend(title='Grammar')),
    )
)

# Layer 2: error bars with a standard-deviation extent.
limits = (
    alt.Chart()
    .mark_errorbar(extent='stdev')
    .encode(
        x=alt.X('method:N'),
        y=alt.Y('score:Q', scale=alt.Scale(zero=False), title='Scores'),
        color=alt.Color('method', scale=grammar_colors),
    )
)

# The data is attached at the layer level and the layered chart is repeated
# in one column per dataset.
(
    alt.layer(bars, limits, data=df)
    .facet(column=alt.Column('dataset:N', header=alt.Header(title=None, labelOrient='bottom')))
    .configure_view(strokeWidth=0.0, continuousWidth=10)
    .configure_title(fontSize=11, anchor='middle', color='black', orient='bottom')
    .properties(title='Datasets')
)

In [64]:
# Box plot of the scores obtained by each grammar on the selected dataset.
# The previous title ('Media +/- Standard Deviation') described a mean/stdev
# chart, which is not what a box plot shows, so the title now names the plot.
alt.Chart(all_performances[all_performances['dataset'] == dataset]).mark_boxplot().encode(
    x=alt.X('method:N'),
    y=alt.Y('score:Q', scale=alt.Scale(zero=False))
).properties(width=300, height=300, title='Score Distribution (Box Plot)')

In [41]:
# Filter once so both layers are guaranteed to draw exactly the same rows.
dataset_performances = all_performances[all_performances['dataset'] == dataset]

# Error bars with a standard-deviation extent, one per grammar.
error_bars = alt.Chart(dataset_performances).mark_errorbar(extent='stdev').encode(
  x=alt.X('method:N'),
  y=alt.Y('score:Q', scale=alt.Scale(zero=False))
)

# Mean score of each grammar, drawn as a filled point on top of its error bar.
points = alt.Chart(dataset_performances).mark_point(filled=True).encode(
  x=alt.X('method:N'),
  y=alt.Y('score:Q', aggregate='mean'),
)

# Title added so the figure can be read on its own.
(error_bars + points).properties(title='Mean +/- Standard Deviation')

In [29]:
# Number of rows per (dataset, method) pair, stored in a 'pipelines' column.
# Presumably each row of all_performances is one evaluated pipeline - TODO confirm.
pipelines_counter = all_performances.groupby(['dataset', 'method']).size().reset_index(name='pipelines')

In [21]:
# Display the per-dataset, per-method counts table.
pipelines_counter

Unnamed: 0,dataset,method,pipelines
0,185_baseball_MIN_METADATA,Automatic,52
1,185_baseball_MIN_METADATA,Manual,31
2,LL0_186_braziltourism_MIN_METADATA,Automatic,69
3,LL0_186_braziltourism_MIN_METADATA,Manual,74


In [22]:
# One bar per grammar with the number of pipelines for the selected dataset.
# Note that this rebinds the global name `bars`, which an earlier cell also assigns.
bars = alt.Chart(pipelines_counter[pipelines_counter['dataset'] == dataset]).mark_bar().encode(x='method:N', y='pipelines:Q')
# Same encoding rendered as text, so every bar is annotated with its count.
text = bars.mark_text(align='center', baseline='middle').encode(text='pipelines:Q')
(bars + text).properties(height=200, title='Number of Pipelines')

In [121]:
def load_automatic_patterns(dataset_path, target_column, task_keywords):
    """Return the patterns extracted from pipelines related to a problem.

    The three arguments are forwarded unchanged to ``load_related_pipelines``;
    its result is handed to ``extract_patterns``, which returns a pair whose
    first element is returned here and whose second element is discarded.
    """
    related_pipelines = load_related_pipelines(dataset_path, target_column, task_keywords)
    extracted_patterns, _discarded = extract_patterns(related_pipelines)

    return extracted_patterns

def load_manual_patterns(task_name, encoders, use_imputer, grammar_path=None):
    """Expand the manual grammar rule of a task into its candidate patterns.

    The grammar file is scanned for lines starting with ``"<task_name> -> "``;
    if several match, the last one is used. The rule is split into symbols and
    adapted as follows:

    * ``IMPUTATION`` is dropped when ``use_imputer`` is false;
    * ``ENCODERS`` is replaced by the symbols in ``encoders`` (dropped when
      ``encoders`` is empty).

    Symbols are edited token by token, so a symbol is handled correctly even
    when it is the last one of the rule.

    Parameters:
        task_name: left-hand side of the rule to load, e.g. 'CLASSIFICATION_TASK'.
        encoders: sequence of symbols substituted for ``ENCODERS``.
        use_imputer: whether to keep the ``IMPUTATION`` symbol.
        grammar_path: path of the grammar file; defaults to the location that
            was previously hard-coded in this function.

    Returns:
        A list of patterns (lists of symbols): first the full pattern, then one
        pattern per non-empty subset of the optional symbols ('FEATURE_SCALING',
        'FEATURE_SELECTION') present in the rule, with that subset removed
        (single removals first, then both).

    Raises:
        ValueError: if the grammar file contains no rule for ``task_name``.
    """
    # Local import keeps this cell self-contained.
    from itertools import combinations

    if grammar_path is None:
        # NOTE(review): machine-specific default kept for backward compatibility.
        grammar_path = '/Users/rlopez/D3M/alphad3m/resource/base_grammar.bnf'

    rule_prefix = task_name + ' -> '
    task_rule = None
    with open(grammar_path) as fin:
        for line in fin:
            if line.startswith(rule_prefix):
                # No early exit: the last matching rule wins.
                task_rule = line.replace(rule_prefix, '').rstrip()

    if task_rule is None:
        raise ValueError('No rule for %r found in grammar file %r' % (task_name, grammar_path))

    task_pattern = []
    for symbol in task_rule.split():
        if symbol == 'IMPUTATION' and not use_imputer:
            continue
        if symbol == 'ENCODERS':
            task_pattern.extend(encoders)
            continue
        task_pattern.append(symbol)

    patterns = [task_pattern]
    # These symbols can derive the empty string, so every way of omitting them
    # is a valid pattern too.
    optional_symbols = ['FEATURE_SCALING', 'FEATURE_SELECTION']
    present_optional = [symbol for symbol in optional_symbols if symbol in task_pattern]
    for subset_size in range(1, len(present_optional) + 1):
        for dropped in combinations(present_optional, subset_size):
            patterns.append([symbol for symbol in task_pattern if symbol not in dropped])

    return patterns

def calculate_coverage(target_patterns, base_patterns, ordered=False):
    """Print the percentage of target patterns that occur in the base patterns.

    Every target pattern without a match is reported on a 'Not found:' line
    (printed in its original symbol order), followed by one 'Coverage: N%' line.

    Parameters:
        target_patterns: list of patterns (sequences of symbols) to look up.
        base_patterns: list of patterns to search in.
        ordered: when False (default) two patterns match if they contain the
            same symbols regardless of order; when True the order must match.

    Returns:
        None; the result is only printed.

    An empty ``target_patterns`` is reported as 0% coverage instead of raising
    ZeroDivisionError.
    """
    def _key(pattern):
        # Hashable, order-insensitive (unless `ordered`) identity of a pattern.
        return tuple(pattern) if ordered else tuple(sorted(pattern))

    # Set lookup instead of scanning the whole base list for every target pattern.
    base_keys = {_key(pattern) for pattern in base_patterns}

    found = 0
    for pattern in target_patterns:
        if _key(pattern) in base_keys:
            found += 1
        else:
            print('Not found:', pattern)

    total = len(target_patterns)
    coverage = found * 100 / total if total else 0.0

    print('Coverage: %2.f%%' % coverage)