In [1]:
import os
import yaml
import time
import optuna

from datasets import load_dataset, load_from_disk
from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from ml_pipeline_nodes_2 import PLAIDTransformedTargetRegressor, WrappedPlaidSklearnTransformer, WrappedPlaidSklearnRegressor
# from ml_pipeline_nodes_2 import PLAIDColumnTransformer

from sklearn.preprocessing import StandardScaler, MinMaxScaler

import numpy as np

import warnings
warnings.filterwarnings('ignore', module='sklearn')

# os.cpu_count() may return None when the number of CPUs cannot be determined;
# fall back to 1 so the worker-count arithmetic further down always gets an int.
nb_cpus = os.cpu_count() or 1
print("Number of CPUs:", nb_cpus)

  from .autonotebook import tqdm as notebook_tqdm


Number of CPUs: 12


In [None]:

# Read the experiment configuration; its "global" section holds the dataset
# path and the names of the train/test splits used below.
with open("config_2.yml") as f:
    config = yaml.safe_load(f)

global_params = config["global"]

# Only the first few ids of each split are converted, to keep the notebook fast.
n_samples_per_split = 24

# Use about a quarter of the available cores for the conversion, but never
# fewer than one worker: int(nb_cpus / 4) evaluates to 0 on machines with
# fewer than 4 CPUs, and os.cpu_count() may return None.
n_processes = max(1, (nb_cpus or 1) // 4)

start = time.time()
hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples")
# hf_dataset = load_from_disk(global_params['dataset_path'])
print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds")

prob_def = huggingface_description_to_problem_definition(hf_dataset.description)

train_split = prob_def.get_split(global_params['train_split_name'])[:n_samples_per_split]
dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids = train_split, processes_number = n_processes)

test_split = prob_def.get_split(global_params['test_split_name'])[:n_samples_per_split]
dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids = test_split, processes_number = n_processes)

# The HuggingFace dataset is not used after the conversion; drop the reference.
del hf_dataset

Loading dataset from HuggingFace Hub took: 0.4 seconds
Converting huggingface dataset to plaid dataset...


100%|██████████| 24/24 [00:10<00:00,  2.29it/s]


Converting huggingface dataset to plaid dataset...


100%|██████████| 24/24 [00:12<00:00,  1.90it/s]


In [3]:
# Input-side pipeline, built step by step:
#   1. MinMaxScaler block configured by config['input_scalar_scaler'];
#   2. PCA block configured by config['pca_nodes'].
# NOTE(review): the PCA is created without a random_state; if sklearn selects a
# randomized solver the reduced coordinates may vary between runs. Confirm
# whether a seed should be set here.
input_scalar_scaler_step = WrappedPlaidSklearnTransformer(
    MinMaxScaler(),
    params=config['input_scalar_scaler'],
)
pca_nodes_step = WrappedPlaidSklearnTransformer(
    PCA(n_components=config['pca_nodes']['n_components']),
    params=config['pca_nodes'],
)

preprocessor = Pipeline([
    ('input_scalar_scaler', input_scalar_scaler_step),
    ('pca_nodes', pca_nodes_step),
])
preprocessor

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,sklearn_block,MinMaxScaler()
,params,"{'in_features_identifiers': [{'name': 'angle_in', 'type': 'scalar'}, {'name': 'mach_out', 'type': 'scalar'}], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,sklearn_block,PCA(n_components=3)
,params,"{'in_features_identifiers': [{'base_name': 'Base_2_2', 'type': 'nodes'}], 'n_components': 3, 'out_features_identifiers': [{'name': 'reduced_nodes_*', 'type': 'scalar'}]}"

0,1,2
,n_components,3
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [4]:
# Output-side pipeline: a single PCA block configured by config['pca_mach'].
# It is later passed as the target transformer of the regressor.
pca_mach_step = WrappedPlaidSklearnTransformer(
    PCA(n_components=config['pca_mach']['n_components']),
    params=config['pca_mach'],
)

postprocessor = Pipeline([('pca_mach', pca_mach_step)])
postprocessor

0,1,2
,steps,"[('pca_mach', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,sklearn_block,PCA(n_components=5)
,params,"{'in_features_identifiers': [{'base_name': 'Base_2_2', 'name': 'mach', 'type': 'field'}], 'n_components': 5, 'out_features_identifiers': [{'name': 'reduced_mach_*', 'type': 'scalar'}]}"

0,1,2
,n_components,5
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [5]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
from sklearn.multioutput import MultiOutputRegressor


def sklearn_regressor_constructor(X):
    """Build a multi-output Gaussian process regressor from the notebook config.

    The settings are read from ``config['regressor_mach']['options']``:
    ``kernel`` (only "Matern" is registered), ``anisotropic``,
    ``kernel_options``, ``optim``, ``num_restarts`` and ``random_state``.

    Parameters
    ----------
    X : array-like
        Input features; only ``X.shape[1]`` is read, to size the per-dimension
        length scales of the anisotropic kernel.
        (Presumably a 2-D ``(n_samples, n_features)`` array; confirm with caller.)

    Returns
    -------
    MultiOutputRegressor
        An unfitted ``MultiOutputRegressor`` wrapping a ``GaussianProcessRegressor``.

    Raises
    ------
    KeyError
        If the configured kernel name is not registered, or an option is missing.
    """
    opts = config['regressor_mach']['options']

    # Registry of supported kernel names -> sklearn kernel classes.
    kernel_cls = {"Matern": Matern}[opts['kernel']]

    bounds = (1e-8, 1e8)
    noise_term = WhiteKernel(noise_level_bounds=bounds)

    if opts["anisotropic"]:
        # One length scale per input dimension, scaled by a constant amplitude.
        signal_term = ConstantKernel() * kernel_cls(
            length_scale=np.ones(X.shape[1]),
            length_scale_bounds=bounds,
            **opts["kernel_options"],
        )
    else:
        # Single shared length scale; this branch has no ConstantKernel factor.
        signal_term = kernel_cls(length_scale_bounds=bounds, **opts["kernel_options"])

    return MultiOutputRegressor(
        GaussianProcessRegressor(
            kernel=signal_term + noise_term,
            optimizer=opts["optim"],
            n_restarts_optimizer=opts["num_restarts"],
            random_state=opts["random_state"],
        )
    )

# Regressor block: the sklearn estimator is produced by
# sklearn_regressor_constructor, and the target side goes through `postprocessor`.
wrapped_gp_regressor = WrappedPlaidSklearnRegressor(
    sklearn_regressor_constructor,
    params=config['regressor_mach'],
)

regressor = PLAIDTransformedTargetRegressor(
    regressor=wrapped_gp_regressor,
    transformer=postprocessor,
)
regressor

0,1,2
,regressor,WrappedPlaidS...egressor(None)
,transformer,Pipeline(step...ponents=5)))])

0,1,2
,sklearn_block_constructor,<function skl...002C1F7B1FEC0>
,params,"{'in_features_identifiers': [{'name': 'angle_in', 'type': 'scalar'}, {'name': 'mach_out', 'type': 'scalar'}, ...], 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'out_features_identifiers': [{'name': 'reduced_mach_*', 'type': 'scalar'}], 'type': 'GaussianProcessRegressor'}"

0,1,2
,sklearn_block,PCA(n_components=5)
,params,"{'in_features_identifiers': [{'base_name': 'Base_2_2', 'name': 'mach', 'type': 'field'}], 'n_components': 5, 'out_features_identifiers': [{'name': 'reduced_mach_*', 'type': 'scalar'}]}"

0,1,2
,n_components,5
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [6]:
# Full model: input preprocessing followed by the target-transformed regressor.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", regressor),
]
pipeline = Pipeline(pipeline_steps)
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,sklearn_block,MinMaxScaler()
,params,"{'in_features_identifiers': [{'name': 'angle_in', 'type': 'scalar'}, {'name': 'mach_out', 'type': 'scalar'}], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,sklearn_block,PCA(n_components=3)
,params,"{'in_features_identifiers': [{'base_name': 'Base_2_2', 'type': 'nodes'}], 'n_components': 3, 'out_features_identifiers': [{'name': 'reduced_nodes_*', 'type': 'scalar'}]}"

0,1,2
,n_components,3
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,regressor,WrappedPlaidS...egressor(None)
,transformer,Pipeline(step...ponents=5)))])

0,1,2
,sklearn_block_constructor,<function skl...002C1F7B1FEC0>
,params,"{'in_features_identifiers': [{'name': 'angle_in', 'type': 'scalar'}, {'name': 'mach_out', 'type': 'scalar'}, ...], 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'out_features_identifiers': [{'name': 'reduced_mach_*', 'type': 'scalar'}], 'type': 'GaussianProcessRegressor'}"

0,1,2
,sklearn_block,PCA(n_components=5)
,params,"{'in_features_identifiers': [{'base_name': 'Base_2_2', 'name': 'mach', 'type': 'field'}], 'n_components': 5, 'out_features_identifiers': [{'name': 'reduced_mach_*', 'type': 'scalar'}]}"

0,1,2
,n_components,5
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [7]:
# Fit on the training samples and predict on those same samples (in-sample check).
dataset_pred = pipeline.fit(dataset_train).predict(dataset_train)

# Display the predicted "mach" field of the first sample.
first_pred_sample = dataset_pred[0]
first_pred_sample.get_field("mach", base_name="Base_2_2")

array([0.37542525, 0.37545782, 0.37549965, ..., 0.79032101, 0.7902825 ,
       0.79024335])

In [8]:
# Look up the two tunable PCA sizes through the nested sklearn parameter names.
pipeline_params = pipeline.get_params()
# print(pipeline_params)
for param_name in (
    'preprocessor__pca_nodes__sklearn_block__n_components',
    'regressor__transformer__pca_mach__sklearn_block__n_components',
):
    print(pipeline_params[param_name])

3
5


In [9]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated score of the pipeline.

    Two hyperparameters are sampled from ``trial``: the number of PCA components
    for the node coordinates (2 to 5) and for the mach field (4 to 12). For each
    fold, a fresh clone of the global ``pipeline`` is configured with them, fitted
    on the training part of ``dataset_train`` and scored on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold values returned by ``pipeline.score``.
    """
    # Suggest hyperparameters
    nodes_n_components = trial.suggest_int("preprocessor__pca_nodes__sklearn_block__n_components", 2, 5)
    mach_n_components = trial.suggest_int("regressor__transformer__pca_mach__sklearn_block__n_components", 4, 12)

    # Fixed seed so every trial is evaluated on the same folds.
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    indices = np.arange(len(dataset_train))

    scores = []

    for train_idx, val_idx in cv.split(indices):

        # Fresh, unfitted clone for every fold so that no fitted state from a
        # previous fold can carry over into this one.
        pipeline_run = clone(pipeline)
        pipeline_run.set_params(
            preprocessor__pca_nodes__sklearn_block__n_components=nodes_n_components,
            regressor__transformer__pca_mach__sklearn_block__n_components=mach_n_components
        )

        dataset_cv_train_ = dataset_train[train_idx]
        dataset_cv_val_   = dataset_train[val_idx]

        pipeline_run.fit(dataset_cv_train_)

        predicted_dataset_cv_val_ = pipeline_run.predict(dataset_cv_val_)

        scores.append(pipeline_run.score(dataset_cv_val_, predicted_dataset_cv_val_))

    return np.mean(scores)


In [10]:
# Reference "mach" field of the first training sample, for comparison with predictions.
reference_mach_first = dataset_train[0].get_field("mach", base_name="Base_2_2")
print("dataset[0].get_field() =", reference_mach_first)

dataset[0].get_field() = [0.39387196 0.39389698 0.39392865 ... 0.81002502 0.81000822 0.80999194]


In [11]:
# Seed the sampler (TPE is Optuna's default) so the hyperparameter search
# gives the same trials after a kernel restart.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=2)

[I 2025-07-21 15:33:38,310] A new study created in memory with name: no-name-09784b7c-142c-4330-aaaf-47987343a707
[I 2025-07-21 15:33:41,636] Trial 0 finished with value: 0.0703610979780891 and parameters: {'preprocessor__pca_nodes__sklearn_block__n_components': 4, 'regressor__transformer__pca_mach__sklearn_block__n_components': 12}. Best is trial 0 with value: 0.0703610979780891.
[I 2025-07-21 15:33:44,270] Trial 1 finished with value: 0.07039780307349214 and parameters: {'preprocessor__pca_nodes__sklearn_block__n_components': 4, 'regressor__transformer__pca_mach__sklearn_block__n_components': 8}. Best is trial 0 with value: 0.0703610979780891.


In [12]:
# Hand-picked hyperparameters (the study's best_params alternative is kept below).
params = {
    'preprocessor__pca_nodes__sklearn_block__n_components': 4,
    'regressor__transformer__pca_mach__sklearn_block__n_components': 6,
}
index = 5

# optimized_pipeline = clone(pipeline).set_params(**study.best_params)
optimized_pipeline = clone(pipeline).set_params(**params)
optimized_pipeline.fit(dataset_train)
dataset_pred = optimized_pipeline.predict(dataset_train)

# Compare prediction and reference for one training sample.
sample_pred = dataset_pred[index]
sample_ref = dataset_train[index]
print("mach_pred =", sample_pred.get_field("mach", base_name="Base_2_2"))
print("mach_ref =", sample_ref.get_field("mach", base_name="Base_2_2"))
print("--")
print("angle_in =", sample_pred.get_scalar("angle_in"))
print("mach_out =", sample_pred.get_scalar("mach_out"))

n_nodes_components = params['preprocessor__pca_nodes__sklearn_block__n_components']
print("reduced_nodes_* =", [sample_pred.get_scalar("reduced_nodes_*")[j] for j in range(n_nodes_components)])

mach_pred = [0.3389678  0.33901469 0.3390761  ... 0.7710585  0.77117304 0.77128376]
mach_ref = [0.3411916  0.34125071 0.34132835 ... 0.78406136 0.78400726 0.78394869]
--
angle_in = 0.6386494252873546
mach_out = 0.15684931506849198
reduced_nodes_* = [-31.132563952426977, 26.35027296690032, -0.9765460147618203, -0.9322534183224569]


In [13]:
# Fitted kernel of the GP for the first output. With the anisotropic kernel
# (ConstantKernel * Matern) + WhiteKernel, "k1__k2" addresses the Matern factor.
wrapped_regressor = optimized_pipeline.named_steps["regressor"].regressor
first_output_gp = wrapped_regressor.sklearn_block.estimators_[0]
fitted_kernel_params = first_output_gp.kernel_.get_params()
print(fitted_kernel_params['k1__k2__length_scale'])

[4.57950198e+00 2.42415860e+00 2.50716442e+02 5.97813291e+01
 1.86570867e+05 3.93160802e+01]


In [14]:
# Score of the tuned pipeline on the training samples themselves.
dataset_pred_2 = optimized_pipeline.predict(dataset_train)
train_score = optimized_pipeline.score(dataset_train, dataset_pred_2)
train_score

0.026628778052345453

In [15]:
# Run the preprocessor alone (4 node components) and inspect one transformed sample.
params = {'pca_nodes__sklearn_block__n_components': 4}
index = 5

preprocessor_ = clone(preprocessor).set_params(**params)
dataset_pred = preprocessor_.fit_transform(dataset_train)

sample_transformed = dataset_pred[index]
print("angle_in =", sample_transformed.get_scalar("angle_in"))
print("mach_out =", sample_transformed.get_scalar("mach_out"))
print("reduced_nodes_* =", sample_transformed.get_scalar("reduced_nodes_*"))

angle_in = 0.6386494252873546
mach_out = 0.15684931506849198
reduced_nodes_* = [-31.13256395  26.35027297  -0.97654601  -0.93225342]


# COMMENTED TESTS

In [16]:
# test = WrappedPlaidSklearnTransformer(MinMaxScaler(), params = config['input_scalar_scaler'])
# test.fit(dataset_train)

In [17]:
# dd = test.transform(dataset_train)

# print(dataset_train)
# print(dd)

# dataset_2 = test.inverse_transform(dd)

# print(dataset_train[0].get_scalar("angle_in"))
# print(dd[0].get_scalar("angle_in"))
# print(dataset_2[0].get_scalar("angle_in"))
# print('===')
# print(dataset_train[0].get_scalar("mach_out"))
# print(dd[0].get_scalar("mach_out"))
# print(dataset_2[0].get_scalar("mach_out"))

In [18]:
# pca = WrappedPlaidSklearnTransformer(PCA(n_components = config['pca_nodes']['n_components']), params = config['pca_nodes'])
# pca.fit(dataset_train)
# dd = pca.transform(dataset_train)
# dataset_2 = pca.inverse_transform(dd)

# print(dataset_train[0].get_nodes(base_name = "Base_2_2"))
# print(dd[0].get_scalar("reduced_nodes_*"))
# print(dataset_2[0].get_nodes(base_name = "Base_2_2"))

In [19]:
# pca2 = WrappedPlaidSklearnTransformer(PCA(n_components = config['pca_mach']['n_components']), params = config['pca_mach'])
# pca2.fit(dataset_train)
# dd = pca2.transform(dataset_train)
# dataset_2 = pca2.inverse_transform(dd)

# print(dataset_train[0].get_field("mach", base_name = "Base_2_2"))
# print(dd[0].get_scalar("reduced_mach_*"))
# print(dataset_2[0].get_field("mach", base_name = "Base_2_2"))