# Sklearn pipelines

An easy way to integrate your data preprocessing is to use sklearn pipelines. A pipeline lets you wrap all the steps that your data undergoes in a single object.

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.compose import TransformedTargetRegressor
import lightgbm as lgb
import numpy as np

## Regression pipeline

In [2]:
# Names of the input feature columns and of the column to predict.
features = [
    'rho_443_a', 'rho_492_a', 'rho_560_a', 'rho_665_a',
    'rho_704_a', 'rho_740_a', 'rho_783_a', 'rho_865_a',
]
target = 'CHL'

# Load the tutorial data and hold out 20% of the rows for evaluation.
# The fixed random_state makes the split reproducible.
df = pd.read_csv('data/chl_regression_tutorial.csv')
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Separate each partition into its feature matrix and target vector.
X_train, y_train = df_train[features], df_train[target]
X_test, y_test = df_test[features], df_test[target]

In [3]:
# Stage 1: standardise the features. Stage 2: LightGBM regressor
# (verbosity=-1 keeps LightGBM's log output quiet).
scaler = StandardScaler()
regressor = lgb.LGBMRegressor(verbosity=-1)

# Chain both stages so they are fitted and applied as a single estimator.
pipeline = Pipeline(steps=[('scaler', scaler), ('regressor', regressor)])

# A bare expression at the end of a cell renders the pipeline diagram.
pipeline

In [4]:
# Fit scaler + regressor on the training split, then predict the held-out rows
# (fit returns the fitted pipeline, so the calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Mean squared error between the true and predicted target values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'MSE: {mse}')

MSE: 3.0883706751400695


## Adding the target transformation

In [5]:
def inverse_log10(x: pd.Series) -> pd.Series:
    """Inverse of the log10 function, i.e. ``10 ** x`` element-wise.

    Used as the ``inverse_func`` of the target transformer, which needs a
    function that takes a single argument.

    A float base is used on purpose: with an integer base, NumPy refuses
    negative integer exponents (``ValueError``) and silently overflows for
    large integer exponents. For floating-point input the result is the same
    as with an integer base.

    Args:
        x (pd.Series): Log10-transformed values to map back to the original
            scale. Other array-likes accepted by ``np.power`` work as well.

    Returns:
        pd.Series: ``10 ** x`` computed element-wise in floating point.
    """
    return np.power(10.0, x)

# Train on log10(y) and map predictions back to the original scale.
# check_inverse=False skips sklearn's check that inverse_func undoes func.
label_transformer = FunctionTransformer(
    func=np.log10,
    inverse_func=inverse_log10,
    check_inverse=False,
)

# Wrap the feature pipeline so the target transform is applied around
# fit and predict automatically.
wrapped_model = TransformedTargetRegressor(regressor=pipeline, transformer=label_transformer)

# A bare expression at the end of a notebook cell displays the estimator
wrapped_model

In [6]:
# Fit on the transformed target; predictions are returned on the original
# target scale (fit returns the fitted estimator, so the calls can be chained).
y_pred = wrapped_model.fit(X_train, y_train).predict(X_test)

# Mean squared error between the true and predicted target values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'MSE: {mse}')

MSE: 3.078688884490468


## Classification pipeline

Classification follows a similar approach: the regressor is replaced with a classifier. In this case, transforming the target does not make sense, because the target is a categorical class label rather than a continuous value.

In [7]:
# Load the tutorial data and hold out 20% of the rows for evaluation.
# The fixed random_state makes the split reproducible.
df = pd.read_csv('data/benthic_classification_tutorial.csv')
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Names of the input feature columns and of the class-label column.
features = [
    'rhos_443', 'rhos_492', 'rhos_560', 'rhos_665', 'rhos_704',
    'rhos_740', 'rhos_783', 'rhos_833', 'rhos_865',
]
target = 'lev1_name'

# Keep only test rows whose class label also occurs in the training split:
# the classifier cannot predict a class it never saw during fitting.
# The filter uses `target` so it stays correct if the label column changes.
df_test = df_test[df_test[target].isin(df_train[target])]

# Separate each partition into its feature matrix and label vector.
X_train = df_train[features]
y_train = df_train[target]

X_test = df_test[features]
y_test = df_test[target]

In [8]:
# Stage 1: standardise the features. Stage 2: LightGBM classifier
# (verbosity=-1 keeps LightGBM's log output quiet).
scaler = StandardScaler()
classifier = lgb.LGBMClassifier(verbosity=-1)

# Chain both stages so they are fitted and applied as a single estimator.
pipeline = Pipeline(steps=[('scaler', scaler), ('classifier', classifier)])

# A bare expression at the end of a cell renders the pipeline diagram.
pipeline

In [9]:
# Fit scaler + classifier on the training split, then predict the held-out rows
# (fit returns the fitted pipeline, so the calls can be chained).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 plus the aggregate averages, as text.
clf_report = classification_report(y_true=y_test, y_pred=y_pred)
print(clf_report)

              precision    recall  f1-score   support

       ALGAE       0.71      0.82      0.76       192
       ANGIO       0.76      0.59      0.67        27
   BEACHCAST       0.00      0.00      0.00         3
       GRASS       0.94      0.89      0.92        19
        ROCK       0.68      0.62      0.65        73
    SEDIMENT       0.61      0.47      0.53        60

    accuracy                           0.71       374
   macro avg       0.62      0.57      0.59       374
weighted avg       0.70      0.71      0.70       374

