# Pipelines and composite estimators — [scikit-learn User Guide](https://scikit-learn.org/stable/modules/compose.html)

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

In [2]:
# Load the dataset, using the "name" column as the row index.
data = pd.read_csv("mpg.csv", index_col="name")

# Preview the first five rows.
data.head(5)

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa


In [3]:
# Separate the regression target ("mpg") from the feature columns.
y = data["mpg"]
X = data.drop(columns="mpg")

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

## Applying Transformations Manually

In [4]:
# Step 1: column-wise preprocessing, fitted by hand on the training split only.
# Non-numeric columns are one-hot encoded; numeric columns are standardized.
categorical_columns = make_column_selector(dtype_exclude="number")
numeric_columns = make_column_selector(dtype_include="number")
column_transformer = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(), categorical_columns),
        ("numeric", StandardScaler(), numeric_columns),
    ]
)
# Each stage needs its own intermediate variables for train and test.
X_train_transformed = column_transformer.fit_transform(X_train)
X_test_transformed = column_transformer.transform(X_test)

# Step 2: mean-impute the transformed features, again fitting on train only.
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train_transformed)
X_test_imputed = imputer.transform(X_test_transformed)

# Step 3: grid-search C for an SVR on the already-preprocessed training data.
# Note: the preprocessing above was fitted on all of X_train, so every CV fold
# is validated on rows the transformers have already seen.
model = SVR()
params = {"C": np.logspace(start=1, stop=5, num=5)}
best_model_without_pipeline = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    n_jobs=4,
)
best_model_without_pipeline.fit(X_train_imputed, y_train)

# Score the refitted best estimator on the preprocessed test split.
best_model_without_pipeline.score(X_test_imputed, y_test)

0.8157268404635538

## Using a Pipeline

In [5]:
# This pipeline encapsulates all transformations & the model
# Bundle preprocessing, imputation and the regressor into a single estimator.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(), make_column_selector(dtype_exclude="number")),
        ("numeric", StandardScaler(), make_column_selector(dtype_include="number")),
    ]
)
model_pipeline = make_pipeline(preprocessor, SimpleImputer(strategy="mean"), SVR())

# make_pipeline names each step after its lowercased class, hence the "svr__" prefix.
params = {"svr__C": np.logspace(start=1, stop=5, num=5)}
best_model_with_pipeline = GridSearchCV(
    estimator=model_pipeline,
    param_grid=params,
    cv=5,
    n_jobs=4,
)

# The raw frames go in directly; no intermediate variables are necessary.
best_model_with_pipeline.fit(X_train, y_train)
best_model_with_pipeline.score(X_test, y_test)

0.8157268404635538

## Making Predictions on New Data

In [6]:
# One unseen observation, with values listed in the same column order as X.
new_row_values = (9, 250.0, 150.0, 4100, 11.6, 84, "usa")
new_row = pd.DataFrame([new_row_values], columns=X.columns)
new_row

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,9,250.0,150.0,4100,11.6,84,usa


In [7]:
# The fitted search object takes the raw row directly; preprocessing happens inside it.
best_model_with_pipeline.predict(X=new_row)

array([22.08408388])

In [8]:
# Without a pipeline, each fitted preprocessing step must be replayed by hand,
# in the same order used for training, before the model can predict.
new_row_encoded = column_transformer.transform(new_row)
new_row_transformed = imputer.transform(new_row_encoded)

best_model_without_pipeline.predict(new_row_transformed)

array([22.08408388])