In [1]:

import os
import time
import warnings

import numpy as np
import optuna
import yaml
from datasets import load_dataset, load_from_disk
from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.utils import estimator_html_repr

from ml_pipeline_nodes import ScalarScalerNode, GPRegressorNode, PCAEmbeddingNode, DatasetTargetTransformerRegressor

warnings.filterwarnings('ignore', module='sklearn')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Tunables for this cell, named once instead of repeating the literal 24:
#   N_TRAIN_SAMPLES          - how many ids of the training split are kept
#   MAX_CONVERSION_PROCESSES - upper bound on worker processes for the
#                              HuggingFace -> plaid conversion
N_TRAIN_SAMPLES = 24
MAX_CONVERSION_PROCESSES = 24

# All settings come from config.yml; safe_load only builds plain Python objects.
with open("config.yml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

global_params = config["global"]

# Do not request more workers than the machine has CPUs
# (os.cpu_count() may return None, hence the fallback to 1).
n_processes = min(MAX_CONVERSION_PROCESSES, os.cpu_count() or 1)

# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments.
start = time.perf_counter()
hf_dataset = load_dataset(global_params['dataset_path'], split="all_samples")
# Alternative for a copy saved on disk: load_from_disk(global_params['dataset_path'])
print(f"Loading dataset from HuggingFace Hub took: {time.perf_counter() - start:.2g} seconds")

prob_def = huggingface_description_to_problem_definition(hf_dataset.description)

# Only the first N_TRAIN_SAMPLES ids of the training split are converted.
train_split = prob_def.get_split(global_params['train_split_name'])[:N_TRAIN_SAMPLES]
dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, ids=train_split, processes_number=n_processes)

# The test split is converted in full.
test_split = prob_def.get_split(global_params['test_split_name'])
dataset_test, _ = huggingface_dataset_to_plaid(hf_dataset, ids=test_split, processes_number=n_processes)

# The HuggingFace dataset is no longer needed once both plaid datasets exist.
del hf_dataset

Loading dataset from HuggingFace Hub took: 2.7 seconds
Converting huggingface dataset to plaid dataset...


100%|██████████| 24/24 [00:01<00:00, 17.73it/s]

Converting huggingface dataset to plaid dataset...



100%|██████████| 168/168 [00:04<00:00, 39.58it/s]


In [3]:
# Plain scikit-learn ColumnTransformer: PCA on columns 0-7, every other column
# passed through unchanged.
# It is bound to its own name because the next cell assigns `preprocessor` to
# the Pipeline of plaid nodes that the final `pipeline` is built from; sharing
# the name would let an out-of-order re-run of this cell overwrite it.
# PCA is imported in the import cell at the top of the notebook.
feats_to_reduce = list(range(8))
column_pca_preprocessor = ColumnTransformer(
    transformers=[
        ("pca", PCA(n_components=8), feats_to_reduce),
    ],
    remainder="passthrough",
)
column_pca_preprocessor

0,1,2
,transformers,"[('pca', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,n_components,8
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [4]:
# Two-step preprocessing pipeline: a ScalarScalerNode followed by a
# PCAEmbeddingNode, each configured from its own section of `config`.
input_scaler_node = ScalarScalerNode(params=config['input_scalar_scaler'])
nodes_pca_node = PCAEmbeddingNode(
    params=config['pca_nodes'],
    n_components=config['pca_nodes']['n_components'],
)

preprocessor = Pipeline(
    steps=[
        ('input_scalar_scaler', input_scaler_node),
        ('pca_nodes', nodes_pca_node),
    ]
)
preprocessor

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'feature_names': ['angle_in', 'mach_out'], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3


In [5]:
# Two-step pipeline used later as the regressor's `transformer`: a
# ScalarScalerNode followed by a PCAEmbeddingNode, each configured from its
# own section of `config`.
output_scaler_node = ScalarScalerNode(params=config['output_scalar_scaler'])
mach_pca_node = PCAEmbeddingNode(
    params=config['pca_mach'],
    n_components=config['pca_mach']['n_components'],
)

postprocessor = Pipeline(
    steps=[
        ('output_scalar_scaler', output_scaler_node),
        ('pca_mach', mach_pca_node),
    ]
)
postprocessor

0,1,2
,steps,"[('output_scalar_scaler', ...), ('pca_mach', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'feature_names': ['Q', 'power', ...], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [6]:
# Wrap the GPRegressorNode (configured from config['regressor_mach']) together
# with `postprocessor`, which is handed over as the `transformer` argument.
gp_node = GPRegressorNode(params=config['regressor_mach'])
regressor = DatasetTargetTransformerRegressor(regressor=gp_node, transformer=postprocessor)
regressor

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'feature_names': ['Q', 'power', ...], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [7]:
# Full model: the preprocessing pipeline followed by the wrapped regressor.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", regressor),
]
pipeline = Pipeline(steps=pipeline_steps)
pipeline

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'feature_names': ['angle_in', 'mach_out'], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,3

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'feature_names': ['Q', 'power', ...], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,5


In [8]:
# Show the current values of the two hyperparameters tuned further below,
# addressed through their nested `step__param` names.
pipeline_params = pipeline.get_params()
for param_name in (
    'preprocessor__pca_nodes__n_components',
    'regressor__transformer__pca_mach__n_components',
):
    print(pipeline_params[param_name])

3
5


In [9]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validation score of `pipeline`.

    Two integer hyperparameters are drawn from the trial:
    ``preprocessor__pca_nodes__n_components`` in [2, 5] and
    ``regressor__transformer__pca_mach__n_components`` in [4, 12].

    For every fold of a shuffled, seeded 3-fold split of ``dataset_train``,
    a fresh unfitted clone of the module-level ``pipeline`` is configured
    with these values, fitted on the training part and scored on the
    validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to suggest the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold values returned by ``pipeline.score``.
    """
    # Suggest hyperparameters
    nodes_n_components = trial.suggest_int("preprocessor__pca_nodes__n_components", 2, 5)
    mach_n_components = trial.suggest_int("regressor__transformer__pca_mach__n_components", 4, 12)

    # Fixed random_state: every trial is evaluated on the same folds.
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    indices = np.arange(len(dataset_train))

    scores = []
    for train_idx, val_idx in cv.split(indices):
        # A new clone per fold guarantees that no fitted state from a
        # previous fold can leak into this one (same policy as sklearn's
        # own cross-validation helpers).
        pipeline_run = clone(pipeline)
        pipeline_run.set_params(
            preprocessor__pca_nodes__n_components=nodes_n_components,
            regressor__transformer__pca_mach__n_components=mach_n_components,
        )

        dataset_cv_train_ = dataset_train[train_idx]
        dataset_cv_val_ = dataset_train[val_idx]

        pipeline_run.fit(dataset_cv_train_)

        predicted_dataset_cv_val_ = pipeline_run.predict(dataset_cv_val_)

        # score() is called with the validation data and the predictions
        # computed from it.
        scores.append(pipeline_run.score(dataset_cv_val_, predicted_dataset_cv_val_))

    # Return a plain Python float rather than a NumPy scalar.
    return float(np.mean(scores))


In [10]:
# Peek at the "mach" field of the first training sample.
first_train_sample = dataset_train[0]
mach_field = first_train_sample.get_field("mach", base_name="Base_2_2")
print("dataset[0].get_field() =", mach_field)

dataset[0].get_field() = [0.39387196 0.39389698 0.39392865 ... 0.81002502 0.81000822 0.80999194]


In [11]:
# Search settings. Seeding the sampler makes the sequence of suggested
# hyperparameters identical on every run; TPESampler is also what Optuna uses
# by default for single-objective studies, so the search algorithm is unchanged.
# NOTE(review): the objective values themselves can still vary between runs if
# fitting the regressor is stochastic - confirm if exact reproducibility matters.
SAMPLER_SEED = 42
N_TRIALS = 2

study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

[I 2025-07-19 19:46:38,601] A new study created in memory with name: no-name-52f6e941-54fc-4916-8787-2f78c488e30e
[I 2025-07-19 19:46:42,342] Trial 0 finished with value: 0.07254803556807561 and parameters: {'preprocessor__pca_nodes__n_components': 5, 'regressor__transformer__pca_mach__n_components': 11}. Best is trial 0 with value: 0.07254803556807561.
[I 2025-07-19 19:46:45,207] Trial 1 finished with value: 0.07039775520839063 and parameters: {'preprocessor__pca_nodes__n_components': 4, 'regressor__transformer__pca_mach__n_components': 8}. Best is trial 1 with value: 0.07039775520839063.


In [12]:
# Take an unfitted copy of the base pipeline, apply the best hyperparameters
# found by the study, then fit it on `dataset_train`.
optimized_pipeline = clone(pipeline).set_params(**study.best_params)
optimized_pipeline.fit(dataset_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('input_scalar_scaler', ...), ('pca_nodes', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,params,"{'feature_names': ['angle_in', 'mach_out'], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'nodes', 'n_components': 3}"
,n_components,4

0,1,2
,regressor,GPRegressorNo...ssRegressor'})
,transformer,Pipeline(step...nents': 5}))])

0,1,2
,params,"{'input': {'scalar_names': ['angle_in', 'mach_out'], 'vector_names': ['reduced_nodes']}, 'options': {'anisotropic': True, 'kernel': 'Matern', 'kernel_options': {'nu': 2.5}, 'num_restarts': 2, ...}, 'output': {'vector_names': ['reduced_mach']}, 'type': 'GaussianProcessRegressor'}"

0,1,2
,params,"{'feature_names': ['Q', 'power', ...], 'scaler_type': 'MinMaxScaler'}"

0,1,2
,params,"{'base_name': 'Base_2_2', 'field_name': 'mach', 'n_components': 5}"
,n_components,8


In [13]:
# Predict on the test dataset and print the "mach" field of the first
# predicted sample.
dataset_pred = optimized_pipeline.predict(dataset_test)
first_pred_sample = dataset_pred[0]
print(first_pred_sample.get_field("mach", base_name="Base_2_2"))

[0.3254099  0.32544421 0.32548872 ... 0.83439263 0.83426845 0.83414451]


In [14]:
# Score the fitted pipeline on the data it was trained on: predictions are
# computed for `dataset_train` and handed to score() together with it.
dataset_pred_2 = optimized_pipeline.predict(dataset_train)
train_score = optimized_pipeline.score(dataset_train, dataset_pred_2)
train_score

0.020513075924941802