In [1]:

import os
import yaml
import time

from datasets import load_dataset, load_from_disk
from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor

from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode, DatasetTargetTransformerRegressor
from sklearn.utils import estimator_html_repr


import warnings
warnings.filterwarnings('ignore', module='sklearn')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Read the notebook configuration. Its "global" section provides the dataset
# location and the name of the training split used below.
with open("config.yml") as f:
    config = yaml.safe_load(f)

global_params = config["global"]

# Tunables for this cell, named so they are easy to spot and change.
N_TRAIN_SAMPLES = 100  # keep only the first N ids of the training split
N_PROCESSES = 24  # forwarded as `processes_number`; os.cpu_count() is an alternative

# perf_counter() is monotonic, which makes it the right clock for elapsed time.
start = time.perf_counter()
# To pull from the HuggingFace Hub instead of a local copy:
# hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples")
hf_dataset = load_from_disk(global_params['dataset_path'])
# The dataset is read from disk here, so the message says so.
print(f"Loading dataset from disk took: {time.perf_counter() - start:.2g} seconds")

# Rebuild the problem definition from the description stored with the dataset.
prob_def = huggingface_description_to_problem_definition(hf_dataset.description)

# Convert only the selected training samples to a plaid dataset; the second
# return value of the bridge function is not needed here.
train_split = prob_def.get_split(global_params['train_split_name'])[:N_TRAIN_SAMPLES]
dataset_train, _ = huggingface_dataset_to_plaid(
    hf_dataset,
    ids=train_split,
    processes_number=N_PROCESSES,
)

Loading dataset from HuggingFace Hub took: 0.055 seconds
Converting huggingface dataset to plaid dataset...


100%|██████████| 100/100 [00:00<00:00, 478.59it/s]


In [3]:
from sklearn.decomposition import PCA

# Route columns 0-7 through PCA and forward every other column unchanged.
# n_components equals the number of selected columns, so no dimension is dropped.
# NOTE: `preprocessor` is rebound by the following cell, so this
# ColumnTransformer is only displayed here.
feats_to_reduce = [*range(8)]
preprocessor = ColumnTransformer(
    [("pca", PCA(n_components=8), feats_to_reduce)],
    remainder="passthrough",
)
preprocessor

0,1,2
,transformers,"[('pca', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_components,8
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [4]:
# Input-side preprocessing: a scalar scaler followed by a PCA embedding node,
# each configured from its own section of `config`.
preprocessor = Pipeline(
    steps=[
        (
            "input_scalar_scaler",
            ScalarScalerNode(params=config["input_scalar_scaler"]),
        ),
        (
            "pca_nodes",
            PCAEmbeddingNode(
                params=config["pca_nodes"],
                # Also exposed as a top-level argument so it can be addressed
                # as `pca_nodes__n_components` in a parameter grid.
                n_components=config["pca_nodes"]["n_components"],
            ),
        ),
    ]
)
preprocessor

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'scalar_names': ['angle_in', 'mach_out'], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3


In [5]:
# Target-side processing: a scalar scaler followed by a PCA embedding node,
# each configured from its own section of `config`.
postprocessor = Pipeline(
    steps=[
        (
            "output_scalar_scaler",
            ScalarScalerNode(params=config["output_scalar_scaler"]),
        ),
        (
            "pca_mach",
            PCAEmbeddingNode(
                params=config["pca_mach"],
                # Also exposed as a top-level argument so it can be addressed
                # as `pca_mach__n_components` in a parameter grid.
                n_components=config["pca_mach"]["n_components"],
            ),
        ),
    ]
)
postprocessor

0,1,2
,steps,"[('output_scalar_scaler', ...), ('pca_mach', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'scalar_names': ['Q', 'power', ...], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [6]:
# Pair the GP regressor node with the target-side pipeline.
# NOTE(review): presumably analogous to sklearn's TransformedTargetRegressor
# (targets go through `transformer` before the regressor sees them) - confirm
# against ml_pipeline_nodes.
gp_node = GPRegressorNode(params=config["regressor_mach"])
regressor = DatasetTargetTransformerRegressor(regressor=gp_node, transformer=postprocessor)
regressor

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'scalar_names': ['Q', 'power', ...], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [7]:
# Full model: input preprocessing followed by the target-transforming regressor.
# The step names define the prefixes used in nested parameter names
# (`preprocessor__...`, `regressor__...`).
pipeline = Pipeline([("preprocessor", preprocessor), ("regressor", regressor)])
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'scalar_names': ['angle_in', 'mach_out'], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'scalar_names': ['Q', 'power', ...], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [8]:
# The same dataset object is supplied as both features and targets.
# NOTE(review): presumably each node selects the fields it needs by name - confirm.
pipeline.fit(X=dataset_train, y=dataset_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'scalar_names': ['angle_in', 'mach_out'], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'scalar_names': ['Q', 'power', ...], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [9]:
# pipeline.get_params()

In [10]:
# Show the current component counts of both PCA steps, addressed through
# sklearn's nested `<step>__<param>` names; one value per line.
print(
    *(
        pipeline.get_params()[param_name]
        for param_name in (
            'preprocessor__pca_nodes__n_components',
            'regressor__transformer__pca_mach__n_components',
        )
    ),
    sep="\n",
)

3
5


In [None]:

# Candidate component counts for the two PCA steps: `pca_nodes` in the
# preprocessor and `pca_mach` inside the regressor's target transformer.
param_grid = {
    'preprocessor__pca_nodes__n_components': [2, 3],
    'regressor__transformer__pca_mach__n_components': [4, 5],
}

# Exhaustive search over the grid with 3-fold cross-validation.
# error_score='raise' makes a failing fit abort the search instead of being
# recorded as a NaN score.
search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, verbose=3, error_score='raise')

# Supply the dataset as the target as well, matching the earlier direct
# `pipeline.fit` call that passes it as both X and y. Without `y`, the folds
# are fitted and scored with no targets.
search.fit(dataset_train, dataset_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits




[CV 1/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=4;, score=nan total time=   0.9s




[CV 2/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=4;, score=nan total time=   1.0s




[CV 3/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=4;, score=nan total time=   0.9s




[CV 1/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=5;, score=nan total time=   0.9s




[CV 2/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=5;, score=nan total time=   1.1s




[CV 3/3] END preprocessor__pca_nodes__n_components=2, regressor__transformer__pca_mach__n_components=5;, score=nan total time=   0.9s




[CV 1/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=4;, score=nan total time=   0.9s




[CV 2/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=4;, score=nan total time=   0.9s




[CV 3/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=4;, score=nan total time=   0.8s




[CV 1/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=5;, score=nan total time=   0.9s




[CV 2/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=5;, score=nan total time=   1.0s




[CV 3/3] END preprocessor__pca_nodes__n_components=3, regressor__transformer__pca_mach__n_components=5;, score=nan total time=   0.8s


0,1,2
,estimator,Pipeline(step...s': 5}))])))])
,param_grid,"{'preprocessor__pca_nodes__n_components': [2, 3], 'regressor__transformer__pca_mach__n_components': [4, 5]}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'scalar_names': ['angle_in', 'mach_out'], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,2

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'scalar_names': ['Q', 'power', ...], 'type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,4
