# Try to answer some questions about sklearn Pipelines ...

- does a pretrained block is trained again when use in a `Pipeline` and that the `Pipeline` is trained ? -> Yes, it seems 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

### Generate 2 random datasets

In [2]:
# Generate Data
X, y = make_regression(n_samples=100, n_features=5, noise=0.1, n_targets=3)
print(f"{X.shape=}")
print(f"{y.shape=}")
X_2, y_2 = make_regression(n_samples=217, n_features=5, noise=0.1, n_targets=3)

X.shape=(100, 5)
y.shape=(100, 3)


In [3]:
df = pd.DataFrame(np.concatenate([X,y], axis=1), )
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.101627,1.018820,0.470369,0.284386,1.113632,241.518602,171.042817,264.240745
1,-0.363111,0.179234,-0.360975,0.028255,0.815165,2.097236,-36.954122,48.925486
2,-1.202856,-0.385704,-0.396730,1.643193,1.324302,-72.970557,39.694763,87.766241
3,-0.412268,0.573827,1.563272,0.518470,-1.033196,14.909302,76.340839,-9.249414
4,-1.918432,-0.671207,0.888769,-0.932070,0.671807,-192.592773,-235.717073,-94.698003
...,...,...,...,...,...,...,...,...
95,1.160525,-0.401518,-0.197663,2.189675,-1.879300,33.279162,294.681735,-36.221280
96,0.364104,-1.169832,-0.290085,0.277554,-0.030041,-57.714625,34.438234,-32.178592
97,-1.098167,0.304946,1.262252,-0.143388,0.387497,-31.499173,-56.750180,29.314709
98,1.115154,-0.091066,0.551391,-0.953668,1.315091,150.790454,41.860030,183.094724


### Create a `PCA` block and train it

In [4]:
pca = PCA(n_components=2)
pca.fit(X,y)
pca

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [5]:
pca.components_

array([[-0.12175749, -0.45159136,  0.72958649, -0.46610308,  0.17802199],
       [-0.05659178, -0.18846783,  0.46317364,  0.78165884, -0.368452  ]])

### Use the pretrained block in a `Pipeline`

In [6]:
pipe = Pipeline([
    ('pca',pca),
    ('RF',RandomForestRegressor(n_estimators=10)),
    ])
pipe

0,1,2
,steps,"[('pca', ...), ('RF', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [7]:
pca.components_

array([[-0.12175749, -0.45159136,  0.72958649, -0.46610308,  0.17802199],
       [-0.05659178, -0.18846783,  0.46317364,  0.78165884, -0.368452  ]])

### Train the `Pipeline`, on an other training dataset

In [8]:
pipe.fit(X_2,y_2)

0,1,2
,steps,"[('pca', ...), ('RF', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


The PCA modes are different -> the PCA block has been retrained

In [9]:
pca.components_

array([[-0.25460414,  0.20443663, -0.15248206,  0.88159277, -0.30483735],
       [ 0.72626202, -0.12582964,  0.08201559,  0.43088328,  0.51412393]])

### Re-Train the `Pipeline`, on an other training dataset

In [10]:
pipe.fit(X,y)

0,1,2
,steps,"[('pca', ...), ('RF', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


The PCA modes are different -> the PCA block has been retrained

In [11]:
pca.components_

array([[-0.12175749, -0.45159136,  0.72958649, -0.46610308,  0.17802199],
       [-0.05659178, -0.18846783,  0.46317364,  0.78165884, -0.368452  ]])

### The block do not seem to contain any attribute stating the "trained/not trained" status

In [12]:
pca.__dict__

{'n_components': 2,
 'copy': True,
 'whiten': False,
 'svd_solver': 'auto',
 'tol': 0.0,
 'iterated_power': 'auto',
 'n_oversamples': 10,
 'power_iteration_normalizer': 'auto',
 'random_state': None,
 'n_features_in_': 5,
 '_fit_svd_solver': 'covariance_eigh',
 'mean_': array([ 0.21524085, -0.0456532 ,  0.14250158, -0.04409483,  0.13925886]),
 'noise_variance_': 0.8521014630535676,
 'n_samples_': 100,
 'n_components_': 2,
 'components_': array([[-0.12175749, -0.45159136,  0.72958649, -0.46610308,  0.17802199],
        [-0.05659178, -0.18846783,  0.46317364,  0.78165884, -0.368452  ]]),
 'explained_variance_': array([1.42164061, 1.00586068]),
 'explained_variance_ratio_': array([0.28525201, 0.20182582]),
 'singular_values_': array([11.86349109,  9.97898831])}

In [13]:
sorted([v for v in dir(pca) if v[0]!='_'])

['components_',
 'copy',
 'explained_variance_',
 'explained_variance_ratio_',
 'fit',
 'fit_transform',
 'get_covariance',
 'get_feature_names_out',
 'get_metadata_routing',
 'get_params',
 'get_precision',
 'inverse_transform',
 'iterated_power',
 'mean_',
 'n_components',
 'n_components_',
 'n_features_in_',
 'n_oversamples',
 'n_samples_',
 'noise_variance_',
 'power_iteration_normalizer',
 'random_state',
 'score',
 'score_samples',
 'set_output',
 'set_params',
 'singular_values_',
 'svd_solver',
 'tol',
 'transform',
 'whiten']

In [14]:
sorted([v for v in dir(pca) if (v[0]=='_' and v[1]!='_')])

['_abc_impl',
 '_build_request_for_signature',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_fit',
 '_fit_full',
 '_fit_svd_solver',
 '_fit_truncated',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_params_html',
 '_html_repr',
 '_n_features_out',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sklearn_auto_wrap_output_keys',
 '_transform',
 '_validate_params']

In [17]:
pca._get_doc_link()

'https://scikit-learn.org/1.7/modules/generated/sklearn.decomposition.PCA.html'

In [18]:
pca._html_repr()

'<style>#sk-container-id-5 {\n  /* Definition of color scheme common for light and dark mode */\n  --sklearn-color-text: #000;\n  --sklearn-color-text-muted: #666;\n  --sklearn-color-line: gray;\n  /* Definition of color scheme for unfitted estimators */\n  --sklearn-color-unfitted-level-0: #fff5e6;\n  --sklearn-color-unfitted-level-1: #f6e4d2;\n  --sklearn-color-unfitted-level-2: #ffe0b3;\n  --sklearn-color-unfitted-level-3: chocolate;\n  /* Definition of color scheme for fitted estimators */\n  --sklearn-color-fitted-level-0: #f0f8ff;\n  --sklearn-color-fitted-level-1: #d4ebff;\n  --sklearn-color-fitted-level-2: #b3dbfd;\n  --sklearn-color-fitted-level-3: cornflowerblue;\n\n  /* Specific color for light theme */\n  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n  --sklearn-color-bord

In [None]:
sorted([v for v in dir(pca) if v[:2]=='__'])