In [None]:
import warnings
warnings.filterwarnings('ignore', module='sklearn')
warnings.filterwarnings("ignore", message=".*IProgress not found.*")

import os
from pathlib import Path

import yaml
import numpy as np
import optuna

from datasets.utils.logging import disable_progress_bar
from datasets import load_dataset

from sklearn.base import clone
from pipefunc import Pipeline, pipefunc

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.multioutput import MultiOutputRegressor

from sklearn.model_selection import KFold, GridSearchCV

from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition
from plaid.pipelines.sklearn_block_wrappers import WrappedPlaidSklearnTransformer, WrappedPlaidSklearnRegressor
from plaid.pipelines.plaid_blocks import PlaidTransformedTargetRegressor, PlaidColumnTransformer

# Turn off the progress bars of the Hugging Face `datasets` library for the whole session.
disable_progress_bar()
# Number of worker processes, clamped to the range [1, 6].
# os.cpu_count() returns None when the CPU count cannot be determined;
# `or 1` keeps max() from raising TypeError in that case.
n_processes = min(max(1, os.cpu_count() or 1), 6)


In [None]:
# Load the "all_samples[:24]" slice of the VKI-LS59 dataset from the Hugging Face hub.
hf_dataset = load_dataset(
    "PLAID-datasets/VKI-LS59",
    split="all_samples[:24]",
)

# Convert the Hugging Face dataset to a PLAID dataset; the second returned
# value is not needed here and is discarded.
dataset_train, _ = huggingface_dataset_to_plaid(
    hf_dataset,
    processes_number=n_processes,
    verbose=False,
)

In [None]:
# Locate the pipeline configuration file.
# When `__file__` is defined, the path is built from the directory three
# levels above this file. When `__file__` is undefined (NameError, e.g. in an
# interactive kernel), the file is looked up relative to the working directory.
try:
    filename = Path(__file__).parent.parent.parent.joinpath(
        "docs", "source", "notebooks", "config_pipeline.yml"
    )
except NameError:
    filename = "config_pipeline.yml"

# Parse the YAML configuration into `config`.
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default.
with open(filename, 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Concatenate the input-feature identifiers declared for the scalar scaler,
# the node PCA and the Mach PCA blocks (presumably lists — verify against the YAML).
all_feature_id = (
    config['input_scalar_scaler']['in_features_identifiers']
    + config['pca_nodes']['in_features_identifiers']
    + config['pca_mach']['in_features_identifiers']
)

In [None]:
# Input-side preprocessing: two named transformers, each configured from the
# matching section of `config`.
preprocessor = PlaidColumnTransformer(
    [
        (
            'input_scalar_scaler',
            WrappedPlaidSklearnTransformer(
                MinMaxScaler(),
                **config['input_scalar_scaler'],
            ),
        ),
        (
            'pca_nodes',
            WrappedPlaidSklearnTransformer(
                PCA(),
                **config['pca_nodes'],
            ),
        ),
    ]
)
# Bare expression: shows the estimator's representation as the cell output.
preprocessor

In [None]:
# Target-side transformer: a PCA configured from the 'pca_mach' section of `config`.
postprocessor = WrappedPlaidSklearnTransformer(
    PCA(),
    **config['pca_mach'],
)
# Bare expression: shows the transformer's representation as the cell output.
postprocessor

In [None]:
# Matern kernel (nu = 2.5) with explicit length-scale bounds.
kernel = Matern(
    nu=2.5,
    length_scale_bounds=(1e-8, 1e8),
)

# Gaussian-process regressor using that kernel; the random state is fixed so
# that runs are repeatable.
gpr = GaussianProcessRegressor(
    kernel=kernel,
    optimizer='fmin_l_bfgs_b',
    n_restarts_optimizer=1,
    random_state=42,
)

# Wrap the single regressor in scikit-learn's multi-output meta-estimator.
reg = MultiOutputRegressor(gpr)

def length_scale_init(X):
    """Return a 1-D array of ones with one entry per column of ``X``.

    Parameters
    ----------
    X : array-like exposing ``shape``
        Only ``X.shape[1]`` is read.

    Returns
    -------
    numpy.ndarray
        Array of length ``X.shape[1]`` filled with ``1.0``.
    """
    n_columns = X.shape[1]
    return np.ones(n_columns)

# Maps a nested estimator parameter name to a callable that produces its value.
# NOTE(review): presumably the wrapper calls the factory with the training
# inputs before fitting — confirm against WrappedPlaidSklearnRegressor.
dynamics_params_factory = {
    'estimator__kernel__length_scale': length_scale_init,
}

# PLAID-aware wrapper around the multi-output regressor, configured from the
# 'regressor_mach' section of `config`.
regressor = WrappedPlaidSklearnRegressor(
    reg,
    **config['regressor_mach'],
    dynamics_params_factory=dynamics_params_factory,
)

# Combine the regressor with the target-side transformer.
target_regressor = PlaidTransformedTargetRegressor(
    transformer=postprocessor,
    regressor=regressor,
)
# Bare expression: shows the estimator's representation as the cell output.
target_regressor

In [None]:
@pipefunc(output_name="preprocessed_dataset")
def preprocess(dataset):
    """Fit the module-level ``preprocessor`` on ``dataset`` and return the transformed result."""
    transformed = preprocessor.fit_transform(dataset)
    return transformed

@pipefunc(output_name="fitted_regressor")
def regress(preprocessed_dataset):
    """Fit the module-level ``target_regressor`` on ``preprocessed_dataset`` and return what ``fit`` returns."""
    fitted = target_regressor.fit(preprocessed_dataset)
    return fitted

# Training pipeline: `preprocess` produces "preprocessed_dataset", which is
# the input parameter of `regress`.
pipeline_train = Pipeline(
    [preprocess, regress],
    cache_type="hybrid",  # optional: enable caching
    profile=True,  # optional: collect profiling data
    debug=True,  # optional: print debug information
)




In [None]:
# Run the training pipeline on the loaded dataset; the call's result is shown as the cell output.
pipeline_train(dataset=dataset_train)

In [None]:
# List the nodes of the training pipeline's graph, then draw the pipeline.
print("Graph nodes:", pipeline_train.graph.nodes)
pipeline_train.visualize()

In [None]:
@pipefunc(output_name="c")
def f(a, b):
    """Toy pipeline step: return ``a + b`` (published as output "c")."""
    total = a + b
    return total


@pipefunc(output_name="d")
def g(b, c, x=1):
    """Toy pipeline step: return ``b * c * x`` (published as output "d").

    ``c`` is the output of ``f``; ``x`` defaults to 1.
    """
    partial = b * c
    return partial * x


@pipefunc(output_name="e")
def h(c, d, x=1):
    """Toy pipeline step: return ``c * d * x`` (published as output "e").

    ``d`` is the output of ``g``; ``x`` defaults to 1.
    """
    partial = c * d
    return partial * x

In [None]:
# Toy pipeline chaining f -> g -> h through their output names "c", "d" and "e".
pipeline = Pipeline(
    [f, g, h],
    cache_type="hybrid",  # optional: enable caching
    profile=True,  # optional: collect profiling data
    debug=True,  # optional: print debug information
)

In [None]:
# Run the toy pipeline with a=1 and b=2 (x keeps its default of 1):
# c = a + b = 3, d = b * c * x = 6, e = c * d * x = 18.
# NOTE(review): the call is expected to return the final output "e" — confirm against pipefunc's call semantics.
pipeline(a=1, b=2)