# Example of a PCA-GP-PCA type pipeline

In [1]:
import os
from pathlib import Path
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, MetaEstimatorMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.datasets import make_regression
from sklearn.gaussian_process import GaussianProcessRegressor

from plaid.containers.dataset import Dataset
from plaid.containers.sample import Sample
from plaid.problem_definition import ProblemDefinition
from plaid.wrappers.sklearn import WrappedSklearnTransform, WrappedSklearnRegressor

## Load Rotor37 PLAID dataset

In [2]:
# Root directory of the Rotor37 PLAID dataset.
# The default below is a machine-specific cluster path; set the
# ROTOR37_PLAID_DATASET_PATH environment variable to run this notebook on
# another machine without editing the cell. An unset or empty variable
# falls back to the default.
_DEFAULT_ROTOR37_DIR = '/gpfs_new/cold-data/InputData/public_datasets/data_challenge/Rotor37'
data_dir = Path(os.environ.get('ROTOR37_PLAID_DATASET_PATH') or _DEFAULT_ROTOR37_DIR)

In [3]:
# Alternative kept for reference:
# dset = Dataset(data_dir/'dataset', processes_number=2)

# Start from an empty dataset and load from disk, requesting sample ids 0..12.
dataset_dir = data_dir / 'dataset'
sample_ids = np.arange(13)
dset = Dataset()
dset._load_from_dir_(dataset_dir, ids=sample_ids)

# The problem definition is stored next to the dataset directory.
problem_def = ProblemDefinition(data_dir / 'problem_definition')

FileNotFoundError: "\gpfs_new\cold-data\InputData\public_datasets\data_challenge\Rotor37\dataset" is not a directory or does not exist. Abort

In [None]:
# Show the scalar and field names reported by the loaded dataset.
# The `=` f-string specifier prints the expression text followed by its value.
print(f"{dset.get_scalar_names()=}")
print(f"{dset.get_field_names()=}")

## PCA-GP-PCA as a sklearn `Pipeline`

### 1. Define the PCA for the shape embedding

In this example we only apply PCA to the first 8 columns

The last two columns are unchanged

In [None]:
from sklearn.decomposition import PCA

# Number of principal components kept by the PCA below.
NB_PCA_MODES = 8

# One output key per PCA mode: 'scalar::pca0' ... 'scalar::pca7'.
pca_out_keys = [f'scalar::pca{i_mode}' for i_mode in range(NB_PCA_MODES)]

# Wrap sklearn's PCA so it can be fitted on a PLAID dataset.
# NOTE(review): 'field::all' presumably selects every field as input —
# confirm against the plaid wrapper documentation.
pca = WrappedSklearnTransform(
    PCA(n_components=NB_PCA_MODES),
    in_keys='field::all',
    # in_keys=['omega', 'compression_rate'],
    out_keys=pca_out_keys,
)

In [None]:
# Fit the wrapped PCA transform, passing the dataset and the problem definition.
pca.fit(dset, problem_def)

In [None]:
# Column indices 0..7: the columns handed to the PCA transformer.
feats_to_reduce = [*range(8)]

# (name, transformer, columns) triple, as expected by ColumnTransformer.
pca_on_leading_columns = ("pca", PCA(n_components=8), feats_to_reduce)

# Columns not listed in `feats_to_reduce` are forwarded unchanged.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[pca_on_leading_columns],
)
preprocessor

### 2. Define the output scaler for the output fields (MinMaxScaler + PCA)

In [None]:
# Output-side transform: min-max scaling (sklearn default range), followed by
# a PCA keeping 9 components.
target_steps = [
    ("scaler", MinMaxScaler()),
    ("pca", PCA(n_components=9)),
]
postprocessor = Pipeline(steps=target_steps)
postprocessor

### 3. Define the regressor

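Y = postprocessor⁻¹(GP(X)): the GP is fitted on the transformed targets `postprocessor(Y)`, and its predictions are mapped back to the original output space with `postprocessor.inverse_transform`.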

In [None]:
# Gaussian process whose hyper-parameter optimiser is restarted 3 extra times.
gp_estimator = GaussianProcessRegressor(n_restarts_optimizer=3)

# The GP is trained on targets transformed by `postprocessor`; predictions are
# mapped back through the transformer's inverse. `check_inverse=False` disables
# sklearn's check that the inverse transform round-trips at fit time.
regressor = TransformedTargetRegressor(
    regressor=gp_estimator,
    transformer=postprocessor,
    check_inverse=False,
)
regressor

### 4. Combine to make the pipeline

In [None]:
# Full chain: column-wise PCA -> standardisation -> target-transformed GP.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("scaler", StandardScaler()),
    ("regressor", regressor),
]
model = Pipeline(steps=pipeline_steps)
model

## Fit the model

In [None]:
# Fit the full pipeline.
# NOTE(review): `model` is a plain sklearn `Pipeline`, whose signature is
# `fit(X, y=None, **params)`, so `problem_def` is received as `y` here —
# confirm this is what is intended for a PLAID dataset.
model.fit(dset, problem_def)
# Bare expression: displays the estimator as the notebook cell output.
model

## Predict on the training data

In [None]:
# Predict on the same dataset object that was passed to `model.fit`.
y_pred = model.predict(dset)

## Another way to define the pipeline

### 1. Define the regressor

In [None]:
# Input-side chain ending in a bare GP: column-wise PCA -> standardisation -> GP.
# Unlike the first variant, the target transform is not part of this pipeline.
input_side_steps = [
    ("preprocessor", preprocessor),
    ("scaler", StandardScaler()),
    ("regressor", GaussianProcessRegressor(n_restarts_optimizer=3)),
]
regressor = Pipeline(input_side_steps)
regressor

### 2. Combine to make the pipeline

In [None]:
# Wrap the whole input-side pipeline so that it is trained on targets
# transformed by `postprocessor`, with predictions mapped back through the
# transformer's inverse. The inverse round-trip check is disabled.
model = TransformedTargetRegressor(
    transformer=postprocessor,
    regressor=regressor,
    check_inverse=False,
)
model