<a href="https://colab.research.google.com/github/StratagemGIS/notebooks/blob/main/best_practices/45_sklearn_pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Vaasudevan Srinivasan 🧑🏻‍💻  
StratagemGIS Solutions

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Raw CSV of the penguins table hosted in the StratagemGIS datasets repository.
penguins_url = 'https://raw.githubusercontent.com/StratagemGIS/datasets/main/table/penguins.csv'

# Read the whole table into memory.
df = pd.read_csv(filepath_or_buffer=penguins_url)

# Count missing values per column to see which features need imputation.
df.isna().sum(axis=0)

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [None]:
# Numeric measurement columns fed to the numeric branch of the preprocessor.
num_cols = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Categorical columns fed to the one-hot-encoding branch of the preprocessor.
cat_cols = [
    'island',
    'sex',
]

In [None]:
# Numeric branch: replace missing values with the column mean.
# No scaling step is applied here.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: fill gaps with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored at transform
# time instead of raising an error.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch. Columns listed in neither group
# are dropped; 'drop' is the default, spelled out here for clarity.
preprocessor = ColumnTransformer(
    [
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by a seeded random forest, so that
# fitting is repeatable.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Display the assembled pipeline.
pipeline

In [None]:
# Separate the predictors from the label column ('species').
features = df.drop(columns='species')
target = df['species']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=100,
)

In [None]:
# Fit preprocessing and classifier on the training split, then report the
# score on the held-out split. fit() returns the pipeline itself, so the
# two calls can be chained.
pipeline.fit(x_train, y_train).score(x_test, y_test)

0.9710144927536232