In [1]:

import os
import yaml
import time
import optuna

from datasets import load_dataset, load_from_disk
from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from ml_pipeline_nodes_2 import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode, PLAIDTransformedTargetRegressor, PLAIDColumnTransformer, PlaidSklearnTransformWrapper
from sklearn.utils import estimator_html_repr

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import numpy as np

import warnings
warnings.filterwarnings('ignore', module='sklearn')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# --- Tunable constants --------------------------------------------------------
# Number of sample ids kept from each split, and number of worker processes
# handed to the HuggingFace -> PLAID conversion. Both were previously the
# literal 24 repeated inline.
N_SAMPLES_PER_SPLIT = 24
N_CONVERSION_PROCESSES = 24  # alternative: os.cpu_count()

# --- Configuration ------------------------------------------------------------
with open("config_2.yml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

global_params = config["global"]

# --- Load the HuggingFace dataset ---------------------------------------------
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples")
# Alternative for a dataset stored locally:
# hf_dataset = load_from_disk(global_params['dataset_path'])
print(f"Loading dataset from HuggingFace Hub took: {time.perf_counter() - start:.2g} seconds")

prob_def = huggingface_description_to_problem_definition(hf_dataset.description)

# --- Convert a subset of each split to PLAID datasets -------------------------
# Only the first N_SAMPLES_PER_SPLIT ids of each split are converted.
train_split = prob_def.get_split(global_params['train_split_name'])[:N_SAMPLES_PER_SPLIT]
dataset_train, _ = huggingface_dataset_to_plaid(
    hf_dataset, ids=train_split, processes_number=N_CONVERSION_PROCESSES
)

test_split = prob_def.get_split(global_params['test_split_name'])[:N_SAMPLES_PER_SPLIT]
dataset_test, _ = huggingface_dataset_to_plaid(
    hf_dataset, ids=test_split, processes_number=N_CONVERSION_PROCESSES
)

# Drop the reference to the HuggingFace dataset; the visible cells that follow
# only use the converted PLAID datasets.
del hf_dataset

Loading dataset from HuggingFace Hub took: 2.2 seconds
Converting huggingface dataset to plaid dataset...


100%|██████████| 24/24 [00:01<00:00, 19.72it/s]

Converting huggingface dataset to plaid dataset...



100%|██████████| 24/24 [00:00<00:00, 34.64it/s]


In [3]:
# Wrap a scikit-learn MinMaxScaler, parameterised by the 'input_scalar_scaler'
# section of the config, and fit it on the training dataset.
test = PlaidSklearnTransformWrapper(
    MinMaxScaler(),
    params=config['input_scalar_scaler'],
)
test.fit(dataset_train)

# Bare last expression: display the wrapper's notebook representation.
test

0,1,2
,sklearn_block,MinMaxScaler()
,params,"{'in_features_identifiers': [{'name': 'angle_in', 'type': 'scalar'}, {'name': 'mach_out', 'type': 'scalar'}], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [4]:
# Apply the `test` wrapper to the training dataset and print both datasets.
dd = test.transform(dataset_train)

print(dataset_train)
print(dd)

# Map the transformed dataset back through the wrapper.
dataset_2 = test.inverse_transform(dd)

# Print "angle_in" of the first sample at each stage:
# original, transformed, inverse-transformed.
for stage_dataset in (dataset_train, dd, dataset_2):
    print(stage_dataset[0].get_scalar("angle_in"))

X_transformed = [[0.92241379 0.26369863]
 [0.9841954  0.63630137]
 [0.95402299 0.95958904]
 [0.31824713 0.55958904]
 [0.92816092 0.96027397]
 [0.63864943 0.15684932]
 [0.61637931 0.10342466]
 [0.34195402 0.61164384]
 [0.29741379 0.76369863]
 [0.66020115 0.81849315]
 [0.33764368 0.68287671]
 [0.62068966 0.        ]
 [0.92528736 0.85205479]
 [0.48850575 0.43082192]
 [0.47916667 0.76643836]
 [0.48491379 1.        ]
 [0.         0.34520548]
 [0.11494253 0.93561644]
 [0.13649425 0.75616438]
 [0.10057471 0.7609589 ]
 [0.72701149 0.36232877]
 [0.15373563 0.10616438]
 [0.14295977 0.10890411]
 [1.         0.59589041]] (24, 2)


AttributeError: 'PlaidSklearnTransformWrapper' object has no attribute 'in_features_identifier'

In [None]:
# Same round trip as a dedicated node: fit, transform, inverse-transform.
int_scal = ScalarScalerNode(params=config['input_scalar_scaler'])
int_scal.fit(dataset_train)

dd = int_scal.transform(dataset_train)
dataset_2 = int_scal.inverse_transform(dd)

# Print "angle_in" of the first sample at each stage:
# original, transformed, inverse-transformed.
for stage_dataset in (dataset_train, dd, dataset_2):
    print(stage_dataset[0].get_scalar("angle_in"))

feature_identifiers, features[id] = [{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}] [0.92241379 0.26369863]
feature_identifiers, features[id] = [{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}] [0.9841954  0.63630137]
feature_identifiers, features[id] = [{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}] [0.95402299 0.95958904]
feature_identifiers, features[id] = [{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}] [0.31824713 0.55958904]
feature_identifiers, features[id] = [{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}] [0.92816092 0.96027397]
feature_identifiers, features[id] = [{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}] [0.63864943 0.15684932]
feature_identifiers, features[id] = [{'type': 'scalar', 'name': 'angle_in'}, {'type': 'scalar', 'name': 'mach_out'}] [0.61637931 0.10342466]
feature_ident

In [None]:
# Wrap a scikit-learn PCA whose number of components and wrapper parameters
# both come from the 'pca_nodes' section of the config.
pca = PlaidSklearnTransformWrapper(
    PCA(n_components=config['pca_nodes']['n_components']),
    params=config['pca_nodes'],
)
pca.fit(dataset_train)
dd = pca.transform(dataset_train)
# Inverse transform is left disabled here:
# dataset_2 = pca.inverse_transform(dd)

# First sample: node data from the original dataset, then the scalar(s)
# requested as "reduced_nodes_*" from the transformed dataset.
print(dataset_train[0].get_nodes(base_name="Base_2_2"))
print(dd[0].get_scalar("reduced_nodes_*"))
# print(dataset_2[0].get_nodes(base_name="Base_2_2"))

X_.shape = (24, 1, 72842)
X_transformed = [[ 10.31073591 -15.88660491   0.92208477]
 [-13.7963278  -15.27100515   2.11542326]
 [  4.057714   -11.54493442   2.85894598]
 [ 24.57478102  11.4062509   -0.95613333]
 [-22.30623273   2.08888051   0.34767748]
 [-31.13256395  26.35027297  -0.97654601]
 [ 15.65949984   7.2960071    0.19731855]
 [  9.73760827  -9.13312963   0.78711268]
 [ 15.38885453   2.48955557  -1.79519196]
 [ -2.04131811  12.21830161   2.17934148]
 [-29.405104     7.22812868  -1.42930117]
 [ 12.17746223  10.33976245   2.21941646]
 [-32.08856184   8.22728802   2.49105522]
 [  3.47741558  -9.86059058  -0.29224886]
 [ 30.00011818  16.27188505   1.11756669]
 [-43.60751317   1.56954109  -2.06398987]
 [-10.19988578  -8.53729419  -2.85098959]
 [ 18.85277279  15.10003244  -1.51598828]
 [-10.65884417 -16.45497099  -0.58976676]
 [ 32.1992026    9.09340093  -1.21907649]
 [  1.57614443 -16.91871528   0.27230488]
 [ 25.89302238  -8.44149951  -0.99407859]
 [  3.24372937 -19.23980468  -2.20

In [None]:
# Input preprocessing assembled from wrapped scikit-learn blocks:
# a MinMaxScaler step and a PCA step, each configured from its config section.
preprocessor = PLAIDColumnTransformer(
    [
        (
            'input_scalar_scaler',
            PlaidSklearnTransformWrapper(
                MinMaxScaler(),
                params=config['input_scalar_scaler'],
            ),
        ),
        (
            'pca_nodes',
            PlaidSklearnTransformWrapper(
                PCA(n_components=config['pca_nodes']['n_components']),
                params=config['pca_nodes'],
            ),
        ),
    ]
)

# Bare last expression: display the transformer's notebook representation.
preprocessor

0,1,2
,transformers,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_components,3
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [None]:
# Input preprocessing assembled from the dedicated node classes.
# NOTE: this rebinds `preprocessor`, replacing any earlier definition.
#
# Earlier variant kept for reference (plain Pipeline instead of
# PLAIDColumnTransformer):
# preprocessor = Pipeline([
#     ('input_scalar_scaler', ScalarScalerNode(params = config['input_scalar_scaler'])),
#     ('pca_nodes', PCAEmbeddingNode(params = config['pca_nodes'], n_components = config['pca_nodes']['n_components'])),
# ])
preprocessor = PLAIDColumnTransformer(
    [
        (
            'input_scalar_scaler',
            ScalarScalerNode(params=config['input_scalar_scaler']),
        ),
        (
            'pca_nodes',
            PCAEmbeddingNode(
                params=config['pca_nodes'],
                n_components=config['pca_nodes']['n_components'],
            ),
        ),
    ]
)

# Bare last expression: display the transformer's notebook representation.
preprocessor

KeyError: 'field_name'

0,1,2
,params,"{'feature_names': ['angle_in', 'mach_out'], 'scaler_type': 'MinMaxScaler'}"


In [None]:
# Fit the preprocessor on the training dataset. As the cell's last expression,
# the value returned by fit() is what the notebook displays.
preprocessor.fit(dataset_train)

AssertionError: feature type not specified in feature_identifier

In [None]:
# Target-side transformer: a scalar scaler node and a PCA embedding node, each
# configured from its own config section.
#
# Earlier variant kept for reference (plain Pipeline instead of
# PLAIDColumnTransformer):
# postprocessor = Pipeline(
#     [
#     ('output_scalar_scaler', ScalarScalerNode(params = config['output_scalar_scaler'])),
#     ('pca_mach', PCAEmbeddingNode(params = config['pca_mach'], n_components = config['pca_mach']['n_components'])),
#     ]
# )
postprocessor = PLAIDColumnTransformer(
    [
        (
            'output_scalar_scaler',
            ScalarScalerNode(params=config['output_scalar_scaler']),
        ),
        (
            'pca_mach',
            PCAEmbeddingNode(
                params=config['pca_mach'],
                n_components=config['pca_mach']['n_components'],
            ),
        ),
    ]
)

# Bare last expression: display the transformer's notebook representation.
postprocessor

0,1,2
,transformers,"[('output_scalar_scaler', ...), ('pca_mach', ...)]"

0,1,2
,params,"{'feature_names': ['Q', 'power', ...], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [None]:
# Regressor node configured from 'regressor_mach', paired with `postprocessor`
# as its target transformer.
regressor = PLAIDTransformedTargetRegressor(
    regressor=GPRegressorNode(params=config['regressor_mach']),
    transformer=postprocessor,
)

# Bare last expression: display the estimator's notebook representation.
regressor

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,PLAIDColumnTr..._features>')])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'feature_names': ['Q', 'power', ...], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [None]:
# Full model: input preprocessing followed by the target-transformed regressor.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", regressor),
])

# Bare last expression: display the pipeline's notebook representation.
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"

0,1,2
,params,"{'feature_names': ['angle_in', 'mach_out'], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,PLAIDColumnTr..._features>')])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'feature_names': ['Q', 'power', ...], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [None]:
# Print the current values of the two nested parameters that the
# hyper-parameter search tunes.
for param_key in (
    'preprocessor__pca_nodes__n_components',
    'regressor__transformer__pca_mach__n_components',
):
    print(pipeline.get_params()[param_key])

3
5


In [None]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated score of `pipeline`.

    Reads the notebook globals `pipeline` (the unfitted template) and
    `dataset_train` (the dataset that is split into folds).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to suggest the two PCA component counts.

    Returns
    -------
    float
        Mean over the folds of ``pipeline.score(validation_fold, prediction)``.
        NOTE(review): whether lower or higher is better depends on the score
        implemented by the pipeline's final estimator; confirm that it matches
        the study's optimisation direction.
    """
    # Suggest hyperparameters. The trial parameter names are the pipeline's
    # nested parameter names, so `study.best_params` can be passed straight
    # to `set_params`.
    nodes_n_components = trial.suggest_int("preprocessor__pca_nodes__n_components", 2, 5)
    mach_n_components = trial.suggest_int("regressor__transformer__pca_mach__n_components", 4, 12)

    # Configure an unfitted template once; each fold gets its own clone of it.
    pipeline_template = clone(pipeline)
    pipeline_template.set_params(
        preprocessor__pca_nodes__n_components=nodes_n_components,
        regressor__transformer__pca_mach__n_components=mach_n_components
    )

    # Fixed seed so every trial is evaluated on the same folds.
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    indices = np.arange(len(dataset_train))

    scores = []
    for train_idx, val_idx in cv.split(indices):
        # Fresh estimator per fold, so nothing fitted on a previous fold can
        # leak into this one even if a custom node keeps state across fit().
        pipeline_run = clone(pipeline_template)

        dataset_cv_train_ = dataset_train[train_idx]
        dataset_cv_val_ = dataset_train[val_idx]

        pipeline_run.fit(dataset_cv_train_)

        predicted_dataset_cv_val_ = pipeline_run.predict(dataset_cv_val_)

        scores.append(pipeline_run.score(dataset_cv_val_, predicted_dataset_cv_val_))

    # Plain Python float rather than a NumPy scalar.
    return float(np.mean(scores))


In [None]:
# Print the "mach" field (base "Base_2_2") of the first training sample.
print(
    "dataset[0].get_field() =",
    dataset_train[0].get_field("mach", base_name="Base_2_2"),
)

dataset[0].get_field() = [0.39387196 0.39389698 0.39392865 ... 0.81002502 0.81000822 0.80999194]


In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is
# reproducible on Restart & Run All. TPESampler is Optuna's default sampler for
# single-objective studies, so only the seeding changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=2)

[I 2025-07-20 09:57:17,358] A new study created in memory with name: no-name-17da65ca-e1b6-4b3b-be5b-c227eedd7a79
[W 2025-07-20 09:57:17,361] Trial 0 failed with parameters: {'preprocessor__pca_nodes__n_components': 3, 'regressor__transformer__pca_mach__n_components': 12} because of the following error: ValueError('too many values to unpack (expected 2)').
Traceback (most recent call last):
  File "/home/fabien/miniconda3/envs/plaid-dev/lib/python3.11/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_8095/1440474491.py", line 7, in objective
    pipeline_run = clone(pipeline)
                   ^^^^^^^^^^^^^^^
  File "/home/fabien/miniconda3/envs/plaid-dev/lib/python3.11/site-packages/sklearn/base.py", line 94, in clone
    return estimator.__sklearn_clone__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/fabien/miniconda3/envs/plaid-dev/lib/python3.11/site-packages/sklea

ValueError: too many values to unpack (expected 2)

In [None]:
# Clone the unfitted pipeline and apply the best hyper-parameters of the study;
# the trial parameter names are the pipeline's nested parameter names.
optimized_pipeline = clone(pipeline).set_params(**study.best_params)

# Refit on the whole training dataset; the value returned by fit() is displayed.
optimized_pipeline.fit(dataset_train)

In [None]:
# Predict on the test dataset with the refitted pipeline.
dataset_pred = optimized_pipeline.predict(dataset_test)

# Print the "mach" field (base "Base_2_2") of the first predicted sample.
print(
    dataset_pred[0].get_field("mach", base_name="Base_2_2")
)

In [None]:
# Predict on the training dataset itself, then score that prediction against
# the training dataset.
dataset_pred_2 = optimized_pipeline.predict(dataset_train)

# Bare last expression: display the score.
optimized_pipeline.score(
    dataset_train,
    dataset_pred_2,
)