In [1]:

import os
import yaml
import time
import optuna

from datasets import load_dataset, load_from_disk
from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.base import clone
from sklearn.model_selection import KFold

from ml_pipeline_nodes import PLAIDTransformer, ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode, DatasetTargetTransformerRegressor
from sklearn.utils import estimator_html_repr

import numpy as np

import warnings
warnings.filterwarnings('ignore', module='sklearn')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
with open("config.yml") as f:
    config = yaml.safe_load(f)

global_params = config["global"]

print(config['test_0']['features'])
print(config['test_1']['features'])

1./0.

[{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}]


ZeroDivisionError: float division by zero

In [None]:
start = time.time()
hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples")
# hf_dataset = load_from_disk(global_params['dataset_path'])
print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds")

prob_def = huggingface_description_to_problem_definition(hf_dataset.description)

train_split = prob_def.get_split(global_params['train_split_name'])[:24]
dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split, processes_number = 24)#os.cpu_count())

test_split = prob_def.get_split(global_params['test_split_name'])[:24]
dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split, processes_number = 24)#os.cpu_count())

del hf_dataset

In [None]:
pipeline = Pipeline([
    ('test_0', PLAIDTransformer(features_param = config['test_0']['feature'])),
    ('test_1', PLAIDTransformer(features_param = config['test_1']['feature'])),
])
pipeline

0,1,2
,steps,"[('test_0', ...), ('test_1', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features_param,"{'name': 'angle_in', 'type': 'scalar'}"

0,1,2
,features_param,"{'base_name': 'Base_2_2', 'name': 'mach', 'type': 'field'}"


In [None]:
test = PLAIDTransformer(features_param = config['test_0']['feature'])
print(test.features_param)
scalars = test.get_features(dataset_train)
print(scalars)

{'type': 'scalar', 'name': 'angle_in'}
{0: 45.5066666666667, 1: 47.8, 2: 46.68, 3: 23.08, 4: 45.72, 5: 34.9733333333333, 6: 34.1466666666667, 7: 23.96, 8: 22.3066666666667, 9: 35.7733333333333, 10: 23.8, 11: 34.3066666666667, 12: 45.6133333333333, 13: 29.4, 14: 29.0533333333333, 15: 29.2666666666667, 16: 11.2666666666667, 17: 15.5333333333333, 18: 16.3333333333333, 19: 15.0, 20: 38.2533333333333, 21: 16.9733333333333, 22: 16.5733333333333, 23: 48.3866666666667}


In [None]:
scalars_doubled = {id:2*scalar for id, scalar in scalars.items()}
print(scalars_doubled)
print(dataset_train.get_sample_ids())
dataset_train_updated = test.set_features(dataset_train, scalars_doubled)
print(scalars)
print(test.get_features(dataset_train))
print(test.get_features(dataset_train_updated))

{0: 91.0133333333334, 1: 95.6, 2: 93.36, 3: 46.16, 4: 91.44, 5: 69.9466666666666, 6: 68.2933333333334, 7: 47.92, 8: 44.6133333333334, 9: 71.5466666666666, 10: 47.6, 11: 68.6133333333334, 12: 91.2266666666666, 13: 58.8, 14: 58.1066666666666, 15: 58.5333333333334, 16: 22.5333333333334, 17: 31.0666666666666, 18: 32.6666666666666, 19: 30.0, 20: 76.5066666666666, 21: 33.9466666666666, 22: 33.1466666666666, 23: 96.7733333333334}
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
{0: 45.5066666666667, 1: 47.8, 2: 46.68, 3: 23.08, 4: 45.72, 5: 34.9733333333333, 6: 34.1466666666667, 7: 23.96, 8: 22.3066666666667, 9: 35.7733333333333, 10: 23.8, 11: 34.3066666666667, 12: 45.6133333333333, 13: 29.4, 14: 29.0533333333333, 15: 29.2666666666667, 16: 11.2666666666667, 17: 15.5333333333333, 18: 16.3333333333333, 19: 15.0, 20: 38.2533333333333, 21: 16.9733333333333, 22: 16.5733333333333, 23: 48.3866666666667}
{0: 45.5066666666667, 1: 47.8, 2: 46.68, 3: 23.08, 4: 45.72

In [None]:
test2 = PLAIDTransformer(features_param = config['test_1']['feature'])
print(test2.features_param)
fields = test2.get_features(dataset_train)
print(fields)

{'type': 'field', 'name': 'mach', 'base_name': 'Base_2_2'}
{0: array([0.39387196, 0.39389698, 0.39392865, ..., 0.81002502, 0.81000822,
       0.80999194]), 1: array([0.33747828, 0.33749296, 0.33751143, ..., 0.85239712, 0.85217787,
       0.8519595 ]), 2: array([0.36552629, 0.36554822, 0.36557605, ..., 0.96237702, 0.96228836,
       0.96219921]), 3: array([0.34581938, 0.34585912, 0.34591116, ..., 0.87625769, 0.87625867,
       0.87625958]), 4: array([0.37936991, 0.37941227, 0.3794666 , ..., 0.96100806, 0.96101001,
       0.96101222]), 5: array([0.3411916 , 0.34125071, 0.34132835, ..., 0.78406136, 0.78400726,
       0.78394869]), 6: array([0.35181218, 0.35185764, 0.3519167 , ..., 0.77420869, 0.77421349,
       0.77421835]), 7: array([0.24698725, 0.2469949 , 0.24700469, ..., 0.84297804, 0.8427338 ,
       0.84249109]), 8: array([0.33178397, 0.33181125, 0.33184664, ..., 0.92125916, 0.92126807,
       0.92127686]), 9: array([0.30190886, 0.30194145, 0.30198381, ..., 0.89635259, 0.89610321,
 

In [None]:
fields_doubled = {id:2*field for id, field in fields.items()}
print(fields_doubled)
print(dataset_train.get_sample_ids())
dataset_train_updated2 = test2.set_features(dataset_train, fields_doubled)
print(scalars)
print(test2.get_features(dataset_train))
print(test2.get_features(dataset_train_updated2))

{0: array([0.78774392, 0.78779397, 0.7878573 , ..., 1.62005004, 1.62001645,
       1.61998388]), 1: array([0.67495657, 0.67498591, 0.67502285, ..., 1.70479425, 1.70435575,
       1.70391899]), 2: array([0.73105258, 0.73109644, 0.7311521 , ..., 1.92475403, 1.92457672,
       1.92439843]), 3: array([0.69163876, 0.69171823, 0.69182232, ..., 1.75251538, 1.75251733,
       1.75251916]), 4: array([0.75873982, 0.75882454, 0.7589332 , ..., 1.92201612, 1.92202001,
       1.92202444]), 5: array([0.68238319, 0.68250142, 0.68265671, ..., 1.56812272, 1.56801452,
       1.56789738]), 6: array([0.70362436, 0.70371529, 0.7038334 , ..., 1.54841739, 1.54842698,
       1.5484367 ]), 7: array([0.49397451, 0.4939898 , 0.49400938, ..., 1.68595608, 1.6854676 ,
       1.68498217]), 8: array([0.66356794, 0.6636225 , 0.66369329, ..., 1.84251831, 1.84253614,
       1.84255372]), 9: array([0.60381772, 0.60388291, 0.60396762, ..., 1.79270518, 1.79220642,
       1.79170926]), 10: array([0.51125158, 0.51128046, 0.51