# Pipeline Examples

This notebook demonstrates the end-to-end process of building a machine learning pipeline using PLAID datasets and PLAID’s scikit-learn-compatible blocks.

### 📦 Imports

In [None]:
import warnings
warnings.filterwarnings('ignore', module='sklearn')
warnings.filterwarnings("ignore", message=".*IProgress not found.*")

import os
from pathlib import Path

import yaml
import numpy as np
import optuna

from datasets.utils.logging import disable_progress_bar
from datasets import load_dataset

from sklearn.base import clone
from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.multioutput import MultiOutputRegressor

from sklearn.model_selection import KFold, GridSearchCV

from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from plaid.pipelines.sklearn_block_wrappers import WrappedPlaidSklearnTransformer, WrappedPlaidSklearnRegressor
from plaid.pipelines.plaid_blocks import PlaidTransformedTargetRegressor, PlaidColumnTransformer

# Turn off the progress bars of the Hugging Face `datasets` library
# (`disable_progress_bar` is imported from `datasets.utils.logging` above).
disable_progress_bar()
# Worker-count budget for the notebook: the CPU count clamped to [1, 6].
# `os.cpu_count()` returns None when the count cannot be determined, and
# `max(1, None)` would raise a TypeError, so fall back to a single process.
n_processes = min(max(1, os.cpu_count() or 1), 6)

## MMGP for `U1` field prediction of `Tensile2d` dataset

Key steps covered:

- **Loading and preparing the PLAID dataset** using Hugging Face integration and PLAID’s dataset classes  
- **Standardizing features** with PLAID-wrapped scikit-learn transformers for scalars and fields  
- **Dimensionality reduction** of the mesh node coordinates and of the `U1` field via Principal Component Analysis (PCA) to reduce output complexity  
- **Regression modeling** of PCA coefficients from scalar inputs using Gaussian Process regression  
- **Pipeline assembly** combining transformations and regressors into a single scikit-learn-compatible workflow  
- **Hyperparameter tuning** using Optuna and scikit-learn’s `GridSearchCV`
- **Model evaluation** using cross-validation and appropriate metrics  
- **Best practices** for working with PLAID datasets and pipelines in a reproducible and modular manner

### 📥 Load Dataset

We load the `Tensile2d` dataset from Hugging Face and restrict ourselves to the first 8 samples of the `all_samples` split.

In [None]:
# Only the first 8 samples of the `all_samples` split are requested.
hf_dataset = load_dataset("PLAID-datasets/Tensile2d", split="all_samples[:8]")

# Use the notebook-wide `n_processes` budget (defined in the imports cell)
# instead of a hard-coded process count, so the conversion adapts to the
# number of CPUs actually available.
# The second returned value is not needed in this notebook and is discarded.
dataset_train, _ = huggingface_dataset_to_plaid(
    hf_dataset,
    processes_number=n_processes,
    verbose=False,
)

In [None]:
# Show the summary representation of the converted PLAID dataset.
print("dataset_train =", dataset_train)

In [None]:
# Locate the YAML configuration file. When `__file__` is not defined (the
# NameError branch, e.g. inside a notebook kernel) the file is looked up
# relative to the current working directory instead.
try:
    filename = Path(__file__).parent.parent.parent.joinpath(
        "docs", "source", "notebooks", "config_pipeline2.yml"
    )
except NameError:
    filename = "config_pipeline2.yml"

with open(filename, 'r') as f:
    config = yaml.safe_load(f)

# Concatenate the input feature identifiers declared for the scalar scaler
# and for the two PCA blocks (nodes and U1).
all_feature_id = (
    config["input_scalar_scaler"]["in_features_identifiers"]
    + config["pca_nodes"]["in_features_identifiers"]
    + config["pca_u1"]["in_features_identifiers"]
)

In [None]:
# Rebuild the dataset from the feature identifiers collected from the config.
# NOTE: this rebinds `dataset_train`, so the unfiltered dataset is no longer
# reachable under that name after this cell has run.
dataset_train = dataset_train.from_features_identifier(all_feature_id)
print("dataset_train:", dataset_train)
# List the scalar and field names exposed by the rebuilt dataset.
print("scalar names =", dataset_train.get_scalar_names())
print("field names =", dataset_train.get_field_names())

In [None]:
from MMGP_node import MMGPPreparer, MMGPTransformer
# mmgp_preparator = MMGPPreparer(common_mesh_id = 0)

In [None]:
# mmgp_preparator.fit(dataset_train)

In [None]:
# transformed_dataset = mmgp_preparator.transform(dataset_train)
# print(transformed_dataset)
# print(transformed_dataset[0]._extra_data)


In [None]:
# inv_transformed_dataset = mmgp_preparator.inverse_transform(transformed_dataset)
# print(inv_transformed_dataset[0]._extra_data)

In [None]:
# mmnt = MMGPTransformer(**config['mmgp_nodes_transf'])
# mmft = MMGPTransformer(**config['mmgp_u1_transf'])

In [None]:
# mmnt.fit(transformed_dataset)
# mmft.fit(transformed_dataset)
# transformed_dataset2 = mmnt.transform(transformed_dataset)
# transformed_dataset3 = mmft.fit_transform(transformed_dataset)

In [None]:
# print(transformed_dataset2[1].get_nodes().shape)
# print(transformed_dataset3[1].get_field("U1").shape)

In [None]:
# invtranfdataset2 = mmnt.inverse_transform(transformed_dataset2)
# invtranfdataset3 = mmft.inverse_transform(transformed_dataset3)

In [None]:
# print(invtranfdataset2[2].get_nodes().shape)
# print(invtranfdataset3[2].get_field("U1").shape)

In [None]:
# test_pipeline = Pipeline(
#     steps=[
#         ("mmgp_prep", mm),
#         ("mmgp_transf_nodes", mmnt),
#     ]
# )
# test_pipeline

In [None]:
# transformed_dataset[0].get_field("U1")
# 1./0.

In [None]:
# from Muscat.Bridges.CGNSBridge import CGNSToMesh

# print(transformed_dataset)
# index = 1
# print(transformed_dataset[index].get_all_mesh_times())
# print(transformed_dataset[index].get_field("coords_X", time = 0.).shape)
# print(transformed_dataset.extra_data[index].get_field("coords_X").shape)
# print(transformed_dataset[index].get_field("U1", time = 0.).shape)
# print(transformed_dataset.extra_data[index].get_field("U1"))
# mesh = CGNSToMesh(transformed_dataset[index].get_mesh(0.))
# mesh_o = CGNSToMesh(transformed_dataset.extra_data[index].get_mesh())
# print(mesh)
# print(mesh_o)
# print(">>", transformed_dataset[0].get_field("U1", time = 0.).shape)
# print(">>", transformed_dataset[1].get_field("U1", time = 0.).shape)
# print(">>", transformed_dataset[0].get_nodes().shape)
# print(">>", transformed_dataset[1].get_nodes().shape)


In [None]:
# print(CGNSToMesh(dataset_train[0].get_mesh(0.)))
# print(CGNSToMesh(dataset_train[1].get_mesh(0.)))

In [None]:
# inv_transformed_dataset = mm.inverse_transform(transformed_dataset)

In [None]:
# index = 0
# print(inv_transformed_dataset[index].get_scalar('P'))
# print(dataset_train[index].get_scalar('P'))
# print("==")
# print(np.linalg.norm(inv_transformed_dataset[index].get_nodes()-dataset_train[index].get_nodes()))
# print("==")
# print(inv_transformed_dataset[index].get_field("U1"))
# print(dataset_train[index].get_field("U1"))
# print(np.linalg.norm(inv_transformed_dataset[index].get_field("U1")-dataset_train[index].get_field("U1")))


In [None]:

# preparator
# Pipeline(
#     steps=[
#         ("mmgp_preparator", MMGPPreparation(common_mesh_id = 0)),
#         ('mmgp_nodes_transf', MMGPTransformer(**config['mmgp_nodes_transf'])),
#     ]
# )

In [None]:
# First stage of the preprocessing pipeline; sample 1 is used as common mesh.
preparator = MMGPPreparer(common_mesh_id=1)

# Min-max scaling block, configured from config["input_scalar_scaler"].
input_scalar_scaler = WrappedPlaidSklearnTransformer(
    MinMaxScaler(),
    **config["input_scalar_scaler"],
)

# Nodes branch: MMGP transformation followed by a 4-component PCA.
nodes_preprocessor = Pipeline(
    [
        ("mmgp_nodes_transf", MMGPTransformer(**config["mmgp_nodes_transf"])),
        (
            "pca_nodes",
            WrappedPlaidSklearnTransformer(PCA(n_components=4), **config["pca_nodes"]),
        ),
    ]
)

# Combine the scalar branch and the nodes branch.
column_preprocessor = PlaidColumnTransformer(
    [
        ("input_scalar_scaler", input_scalar_scaler),
        ("nodes_preprocessor", nodes_preprocessor),
    ]
)
# column_preprocessor = PlaidColumnTransformer(
#                 [
#                     ('input_scalar_scaler', input_scalar_scaler, config['input_scalar_scaler']['in_features_identifiers']),
#                     ('nodes_preprocessor', nodes_preprocessor, config['mmgp_nodes_transf']['in_features_identifiers']),
#                 ]
#             )

# Full preprocessing chain: preparation first, then the column-wise blocks.
preprocessor = Pipeline(
    [
        ("preparator", preparator),
        ("column_preprocessor", column_preprocessor),
    ]
)

# Last expression of the cell: display the assembled preprocessor.
preprocessor

In [None]:

# preparator = Pipeline(
#     steps=[
#         ("mmgp_preparator", MMGPPreparation(common_mesh_id = 0)),
#         ('mmgp_nodes_transf', MMGPTransformer(**config['mmgp_nodes_transf'])),
#     ]
# )

# preprocessor = Pipeline(
#     steps=[
#         ("preparator", preparator),
#         ('preprocessor', PlaidColumnTransformer(
#                 [
#                     ('input_scalar_scaler', WrappedPlaidSklearnTransformer(MinMaxScaler(), **config['input_scalar_scaler'])),
#                     ('pca_nodes', WrappedPlaidSklearnTransformer(PCA(n_components=4), **config['pca_nodes'])),
#                 ]
#             )
#         ),
#     ]
# )

# preprocessor

In [None]:
# dataset_transf = preparator.fit_transform(dataset_train)

In [None]:
# print(dataset_transf[0].get_nodes().shape)
# print(dataset_transf[1].get_nodes().shape)


In [None]:

# Matern kernel (nu = 2.5) with wide bounds for the length-scale search.
kernel = Matern(nu=2.5, length_scale_bounds=(1e-8, 1e8))

# Gaussian process regressor with a fixed seed and one optimizer restart.
gpr = GaussianProcessRegressor(
    kernel=kernel,
    optimizer="fmin_l_bfgs_b",
    n_restarts_optimizer=1,
    random_state=42,
)

# Wrap the single-output regressor for multi-output regression.
reg = MultiOutputRegressor(gpr)

def length_scale_init(X):
    """Return a vector of ones with one entry per column of ``X``.

    Args:
        X: Two-dimensional array-like exposing a ``shape`` attribute; only
            ``X.shape[1]`` is read.

    Returns:
        A 1-D ``numpy`` array of ones of length ``X.shape[1]``.
    """
    n_columns = X.shape[1]
    return np.ones(n_columns)

# Maps a nested parameter name to a callable producing its value.
# NOTE(review): presumably the wrapper calls it with the regressor inputs to
# size the kernel length scale - confirm against WrappedPlaidSklearnRegressor.
dynamics_params_factory = {"estimator__kernel__length_scale": length_scale_init}

regressor = WrappedPlaidSklearnRegressor(
    reg,
    **config["regressor_mach"],
    dynamics_params_factory=dynamics_params_factory,
)

# Target-side transformation: MMGP transformation of U1, then a 4-component PCA.
postprocessor = Pipeline(
    [
        ("mmgp_u1_transf", MMGPTransformer(**config["mmgp_u1_transf"])),
        (
            "pca_u1",
            WrappedPlaidSklearnTransformer(PCA(n_components=4), **config["pca_u1"]),
        ),
    ]
)


# Regressor paired with the target transformer defined just above.
target_regressor = PlaidTransformedTargetRegressor(
    regressor=regressor,
    transformer=postprocessor,
    # out_features_identifiers = config['pca_u1']['in_features_identifiers']
)
# Last expression of the cell: display the assembled estimator.
target_regressor

In [None]:
# End-to-end estimator: preprocessing followed by the target regressor.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", target_regressor),
    ]
)
# Last expression of the cell: display the full pipeline.
pipeline

In [None]:
# Fit every step of the pipeline on the training dataset.
pipeline.fit(dataset_train)

In [None]:
# Predict on the same samples the pipeline was fitted on.
dataset_pred = pipeline.predict(dataset_train)

In [None]:
# Compare predicted and reference U1 fields for every sample. The loop bound
# follows the dataset size (as in the later cells) instead of a hard-coded 4,
# so no sample is skipped and smaller datasets do not raise an IndexError.
# NOTE(review): despite the "rel_dif" label, the printed value is the absolute
# L2 norm of the difference; it is not divided by the reference norm.
for index in range(len(dataset_pred)):
    u1_pred = dataset_pred[index].get_field("U1")
    u1_ref = dataset_train[index].get_field("U1")
    print("rel_dif =", np.linalg.norm(u1_pred - u1_ref))

In [None]:
# Score of the fitted pipeline, evaluated on the training dataset itself.
pipeline.score(dataset_train)

In [None]:
# Field names carried by the first predicted sample.
print(dataset_pred[0].get_field_names())

In [None]:
# Field names carried by the first training sample, for comparison with the prediction.
print(dataset_train[0].get_field_names())

In [None]:
# Display the target regressor object again.
target_regressor

In [None]:
# Look up a nested parameter using the `step__substep__param` naming;
# the same key is used in the parameter grid of the search cell.
pipeline.get_params()['preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components']

In [None]:
# Search over the common mesh id only; both PCA sizes are pinned to 2 components.
param_grid = {
    "preprocessor__preparator__common_mesh_id": [0, 2],
    "regressor__transformer__pca_u1__sklearn_block__n_components": [2],
    "preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components": [2],
}

# Two shuffled folds with a fixed seed.
cv = KFold(n_splits=2, shuffle=True, random_state=42)

# error_score="raise" makes a failing fit raise instead of being scored.
search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    verbose=3,
    error_score="raise",
)
search.fit(dataset_train)

In [None]:
print("best_params =", search.best_params_)

# Refit an unfitted copy of the pipeline configured with the best parameters.
optimized_pipeline = clone(pipeline)
optimized_pipeline = optimized_pipeline.set_params(**search.best_params_)
optimized_pipeline.fit(dataset_train)

In [None]:
# Predict and score with the refitted pipeline, still on the training samples.
dataset_pred = optimized_pipeline.predict(dataset_train)
score = optimized_pipeline.score(dataset_train)
# The "error" printed here is simply 1 - score.
print("score =", score, ", error =", 1. - score)

In [None]:
# Per-sample L2 norm of the difference between predicted and reference U1.
for index in range(len(dataset_pred)):
    u1_pred = dataset_pred[index].get_field("U1")
    u1_ref = dataset_train[index].get_field("U1")
    u1_gap = np.linalg.norm(u1_pred - u1_ref)
    print("rel_dif =", u1_gap)

In [None]:
# Keep the common mesh id selected by the grid search.
optimal_common_mesh_id = search.best_params_["preprocessor__preparator__common_mesh_id"]
print("optimal_common_mesh_id =", optimal_common_mesh_id)

# Refit an unfitted copy with as many PCA components as there are samples.
optimized_pipeline = clone(pipeline)
optimized_pipeline = optimized_pipeline.set_params(
    **{
        "preprocessor__preparator__common_mesh_id": optimal_common_mesh_id,
        "regressor__transformer__pca_u1__sklearn_block__n_components": len(dataset_train),
        "preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components": len(dataset_train),
    }
)
optimized_pipeline.fit(dataset_train)

In [None]:
# Predict with the refitted pipeline and report the per-sample U1 difference norm.
dataset_pred = optimized_pipeline.predict(dataset_train)
for index in range(len(dataset_pred)):
    u1_pred = dataset_pred[index].get_field("U1")
    u1_ref = dataset_train[index].get_field("U1")
    u1_gap = np.linalg.norm(u1_pred - u1_ref)
    print(f"rel_dif(id={index}) =", u1_gap)
# Reminder of the expected outcome for the sample used as common mesh.
print(f"error at id {optimal_common_mesh_id } should be zero")