In [1]:

import os
import yaml
import time
import optuna

from datasets import load_dataset, load_from_disk
from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.model_selection import KFold

from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode, PLAIDTransformedTargetRegressor

import numpy as np

import warnings
warnings.filterwarnings('ignore', module='sklearn')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the experiment configuration; every pipeline node below is configured from it.
# The encoding is given explicitly so parsing does not depend on the platform default.
with open("config.yml", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

global_params = config["global"]

# Each split is truncated to this many sample ids (keeps the notebook fast).
N_SAMPLES_PER_SPLIT = 24
# Worker processes used for the HuggingFace -> PLAID conversion
# (os.cpu_count() is a possible alternative).
N_PROCESSES = 24


def _load_plaid_split(hf_dataset, prob_def, split_name):
    """Convert the first N_SAMPLES_PER_SPLIT samples of a named split to PLAID.

    Parameters
    ----------
    hf_dataset : the HuggingFace dataset returned by ``load_dataset``.
    prob_def : the problem definition holding the split ids.
    split_name : name of the split to look up in ``prob_def``.

    Returns
    -------
    tuple
        ``(split_ids, plaid_dataset)`` where ``split_ids`` are the truncated
        ids and ``plaid_dataset`` is the first element returned by
        ``huggingface_dataset_to_plaid``.
    """
    split_ids = prob_def.get_split(split_name)[:N_SAMPLES_PER_SPLIT]
    plaid_dataset, _ = huggingface_dataset_to_plaid(
        hf_dataset, ids=split_ids, processes_number=N_PROCESSES
    )
    return split_ids, plaid_dataset


start = time.time()
hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples")
# Local alternative: hf_dataset = load_from_disk(global_params['dataset_path'])
print(f"Loading dataset from HuggingFace Hub took: {time.time() - start:.2g} seconds")

prob_def = huggingface_description_to_problem_definition(hf_dataset.description)

train_split, dataset_train = _load_plaid_split(
    hf_dataset, prob_def, global_params['train_split_name']
)
test_split, dataset_test = _load_plaid_split(
    hf_dataset, prob_def, global_params['test_split_name']
)

# The HuggingFace dataset is no longer referenced once both PLAID datasets are built.
del hf_dataset

Loading dataset from HuggingFace Hub took: 2.3 seconds
Converting huggingface dataset to plaid dataset...


100%|██████████| 24/24 [00:01<00:00, 17.12it/s]


Converting huggingface dataset to plaid dataset...


100%|██████████| 24/24 [00:00<00:00, 30.86it/s]


In [3]:
# Input-side pipeline: scale the input scalars, then embed the "nodes" field with PCA.
# n_components is passed explicitly (in addition to the params dict) so that it is
# exposed as `pca_nodes__n_components` and can be changed through set_params.
pca_nodes_config = config['pca_nodes']

input_scalar_scaler_step = ScalarScalerNode(params=config['input_scalar_scaler'])
pca_nodes_step = PCAEmbeddingNode(
    params=pca_nodes_config,
    n_components=pca_nodes_config['n_components'],
)

preprocessor = Pipeline(steps=[
    ('input_scalar_scaler', input_scalar_scaler_step),
    ('pca_nodes', pca_nodes_step),
])
preprocessor

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'feature_names': ['angle_in', 'mach_out'], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3


In [4]:
# Target-side pipeline: embed the "mach" output field with PCA.
# n_components is passed explicitly so that it is exposed as `pca_mach__n_components`.
pca_mach_config = config['pca_mach']

pca_mach_step = PCAEmbeddingNode(
    params=pca_mach_config,
    n_components=pca_mach_config['n_components'],
)

postprocessor = Pipeline(steps=[
    # Output scalar scaling is currently disabled:
    # ('output_scalar_scaler', ScalarScalerNode(params = config['output_scalar_scaler'])),
    ('pca_mach', pca_mach_step),
])

postprocessor

0,1,2
,steps,"[('pca_mach', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [5]:
# Wrap the Gaussian-process regressor node with the target-side pipeline
# (`postprocessor`) acting as the target transformer.
mach_gp_regressor = GPRegressorNode(params=config['regressor_mach'])

regressor = PLAIDTransformedTargetRegressor(
    transformer=postprocessor,
    regressor=mach_gp_regressor,
)
regressor

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [6]:
# Full model: input preprocessing followed by the target-transformed regressor.
# The step names define the prefixes of the nested hyperparameter keys
# ("preprocessor__..." and "regressor__...") used further down.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", regressor),
])
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'feature_names': ['angle_in', 'mach_out'], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [7]:
# Check that both PCA sizes are reachable through the nested parameter names
# that the hyperparameter search uses.
pipeline_params = pipeline.get_params()
for tunable_name in (
    'preprocessor__pca_nodes__n_components',
    'regressor__transformer__pca_mach__n_components',
):
    print(pipeline_params[tunable_name])

3
5


In [8]:
def objective(trial, n_splits=3, random_state=42):
    """Optuna objective: mean K-fold validation score for one hyperparameter draw.

    Two hyperparameters are sampled from the trial: the number of PCA
    components for the "nodes" embedding (2..5) and for the "mach" embedding
    (4..12). The global `pipeline` is cloned and configured with them, then
    evaluated by shuffled K-fold cross-validation on the global `dataset_train`.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    n_splits : int, default 3
        Number of cross-validation folds.
    random_state : int, default 42
        Seed of the fold shuffling, so every trial sees the same folds.

    Returns
    -------
    float
        Mean over the folds of `pipeline.score(validation_fold, predictions)`.
    """
    # Suggest hyperparameters
    nodes_n_components = trial.suggest_int("preprocessor__pca_nodes__n_components", 2, 5)
    mach_n_components = trial.suggest_int("regressor__transformer__pca_mach__n_components", 4, 12)

    # Unfitted template carrying this trial's hyperparameters.
    pipeline_template = clone(pipeline)
    pipeline_template.set_params(
        preprocessor__pca_nodes__n_components=nodes_n_components,
        regressor__transformer__pca_mach__n_components=mach_n_components,
    )

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    indices = np.arange(len(dataset_train))

    scores = []
    for train_idx, val_idx in cv.split(indices):
        # Fit a fresh clone on every fold: re-fitting one shared instance would
        # let any state kept by the custom nodes leak from one fold to the next.
        pipeline_fold = clone(pipeline_template)

        dataset_cv_train_ = dataset_train[train_idx]
        dataset_cv_val_ = dataset_train[val_idx]

        pipeline_fold.fit(dataset_cv_train_)
        predicted_dataset_cv_val_ = pipeline_fold.predict(dataset_cv_val_)

        scores.append(pipeline_fold.score(dataset_cv_val_, predicted_dataset_cv_val_))

    # Return a plain Python float rather than a NumPy scalar.
    return float(np.mean(scores))


In [9]:
# Sanity check: look at the raw "mach" field of the first training sample.
first_sample_mach = dataset_train[0].get_field("mach", base_name="Base_2_2")
print("dataset[0].get_field() =", first_sample_mach)

dataset[0].get_field() = [0.39387196 0.39389698 0.39392865 ... 0.81002502 0.81000822 0.80999194]


In [10]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every Restart & Run All. TPESampler is Optuna's default sampler for
# single-objective studies, so only the seeding changes.
# NOTE(review): the regressor itself may still be stochastic (e.g. optimizer restarts).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=2)

[I 2025-07-20 21:39:36,505] A new study created in memory with name: no-name-d2d0fd23-5c0b-4d1c-9bb0-fe849cb28443
[I 2025-07-20 21:39:41,993] Trial 0 finished with value: 0.06984716312776772 and parameters: {'preprocessor__pca_nodes__n_components': 4, 'regressor__transformer__pca_mach__n_components': 4}. Best is trial 0 with value: 0.06984716312776772.
[I 2025-07-20 21:39:50,454] Trial 1 finished with value: 0.07562731608857037 and parameters: {'preprocessor__pca_nodes__n_components': 2, 'regressor__transformer__pca_mach__n_components': 11}. Best is trial 0 with value: 0.06984716312776772.


In [11]:
# Hand-picked hyperparameters (the study's best_params line is kept below,
# commented out, as the alternative).
params = {
    'preprocessor__pca_nodes__n_components': 4,
    'regressor__transformer__pca_mach__n_components': 6,
}
# Sample inspected in the printouts below.
index = 5

optimized_pipeline = clone(pipeline)
# optimized_pipeline.set_params(**study.best_params)
optimized_pipeline.set_params(**params)
optimized_pipeline.fit(dataset_train)

# Predictions are made on the training set itself.
dataset_pred = optimized_pipeline.predict(dataset_train)

# Predicted vs. reference "mach" field for the inspected sample.
mach_pred = dataset_pred[index].get_field("mach", base_name="Base_2_2")
mach_ref = dataset_train[index].get_field("mach", base_name="Base_2_2")
print("mach_pred =", mach_pred)
print("mach_ref =", mach_ref)
print("--")

# Scalars carried by the predicted sample, including the reduced "nodes" coordinates.
n_reduced_nodes = params['preprocessor__pca_nodes__n_components']
reduced_nodes = [
    dataset_pred[index].get_scalar(f"reduced_nodes_{j}") for j in range(n_reduced_nodes)
]
print("angle_in =", dataset_pred[index].get_scalar("angle_in"))
print("mach_out =", dataset_pred[index].get_scalar("mach_out"))
print("reduced_nodes_* =", reduced_nodes)

mach_pred = [0.3389678  0.33901468 0.33907609 ... 0.77105852 0.77117306 0.77128378]
mach_ref = [0.3411916  0.34125071 0.34132835 ... 0.78406136 0.78400726 0.78394869]
--
angle_in = 0.6386494252873546
mach_out = 0.15684931506849198
reduced_nodes_* = [-31.13256395242712, 26.35027296690056, -0.9765460147620642, -0.9322534183222402]


In [12]:
# Inspect the fitted kernel of the first underlying estimator and print its
# length-scale hyperparameter (stored under the 'k1__k2__length_scale' key).
# NOTE(review): presumably one estimator per output component -- confirm in ml_pipeline_nodes.
gp_regressor_node = optimized_pipeline.named_steps["regressor"].regressor
first_estimator = gp_regressor_node.model.estimators_[0]
fitted_kernel_params = first_estimator.kernel_.get_params()
print(fitted_kernel_params['k1__k2__length_scale'])

[4.57950198e+00 2.42415860e+00 2.50716442e+02 5.97813291e+01
 1.86570867e+05 3.93160802e+01]


In [13]:
# Score the fitted pipeline on the data it was trained on (not a held-out estimate).
dataset_pred_2 = optimized_pipeline.predict(dataset_train)
train_score = optimized_pipeline.score(dataset_train, dataset_pred_2)
train_score

0.02662877806105311

In [14]:
# Run the preprocessor alone with 4 PCA components on the "nodes" field and
# print the resulting scalars for one sample.
# Note: `params`, `index` and `dataset_pred` are re-bound here; `dataset_pred`
# now holds the transformed training set, not predictions.
params = {'pca_nodes__n_components': 4}
index = 5

preprocessor_ = clone(preprocessor)
preprocessor_.set_params(**params)
dataset_pred = preprocessor_.fit_transform(dataset_train)

n_reduced_nodes = params['pca_nodes__n_components']
reduced_nodes = [
    dataset_pred[index].get_scalar(f"reduced_nodes_{j}") for j in range(n_reduced_nodes)
]
print("angle_in =", dataset_pred[index].get_scalar("angle_in"))
print("mach_out =", dataset_pred[index].get_scalar("mach_out"))
print("reduced_nodes_* =", reduced_nodes)

angle_in = 0.6386494252873546
mach_out = 0.15684931506849198
reduced_nodes_* = [-31.132563952427148, 26.350272966900548, -0.9765460147620555, -0.932253418322242]
