# Machine Learning Pipelines
Pipelines are a simple way to keep your data preprocessing and modeling code organized. Specifically, a pipeline bundles preprocessing and modeling steps so you can use the whole bundle as if it were a single step.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from matplotlib import pyplot as plt
from sklearn import metrics

In [2]:
df = pd.read_csv("../../Datasets/mpg data/auto-mpg.data.csv")
df.columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','car name']
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
1,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
2,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
3,17.0,8,302.0,140,3449,10.5,70,1,ford torino
4,15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


Separating features and target

In [3]:
y = df.mpg
X = df.drop(['mpg'], axis=1)#mpg drop
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,8,350.0,165,3693,11.5,70,1,buick skylark 320
1,8,318.0,150,3436,11.0,70,1,plymouth satellite
2,8,304.0,150,3433,12.0,70,1,amc rebel sst
3,8,302.0,140,3449,10.5,70,1,ford torino
4,8,429.0,198,4341,10.0,70,1,ford galaxie 500
...,...,...,...,...,...,...,...,...
392,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,4,97.0,52,2130,24.6,82,2,vw pickup
394,4,135.0,84,2295,11.6,82,1,dodge rampage
395,4,120.0,79,2625,18.6,82,1,ford ranger


Performing train test split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

Determining which columns are categorical (number of unique values are less than 10 equals categorical column)

In [5]:
categorical_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and 
                        X_train[cname].dtype == "object"]

Determining which columns are numerical (float and int data type columns are numerical)

In [6]:
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]

In [7]:
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_test = X_test[my_cols].copy()

In [8]:
X_train

Unnamed: 0,cylinders,displacement,weight,acceleration,model year,origin
16,6,200.0,2587,16.0,70,1
66,8,429.0,4633,11.0,72,1
379,4,120.0,2160,14.5,82,3
7,8,455.0,4425,10.0,70,1
19,4,110.0,2672,17.5,70,2
...,...,...,...,...,...,...
71,8,304.0,3892,12.5,72,1
106,6,232.0,2789,15.0,73,1
270,4,156.0,2745,16.7,78,1
348,4,91.0,1985,16.0,81,3


In [9]:
X_test

Unnamed: 0,cylinders,displacement,weight,acceleration,model year,origin
114,8,350.0,4082,13.0,73,1
278,4,98.0,2135,16.6,78,3
237,4,98.0,2075,15.9,77,1
57,4,97.5,2126,17.0,72,1
72,8,307.0,4098,14.0,72,1
...,...,...,...,...,...,...
255,6,225.0,3430,17.2,78,1
284,8,305.0,3840,15.4,79,1
75,4,121.0,2933,14.5,72,2
333,3,70.0,2420,12.5,80,3


Encode categorical features as a one-hot numeric array.

The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features.

In [10]:
# categorical_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

Encode categorical features as an integer array.

The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features

In [11]:
# categorical_transformer = Pipeline(steps=[
#     ('labelenc', OrdinalEncoder())
# ])

Bundle preprocessing for numerical and categorical data to create 1 single preprocesser

In [12]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', categorical_transformer, categorical_cols)
#     ])

In [13]:
# model = RandomForestClassifier( max_depth = 8, random_state=0)
model = RandomForestRegressor(random_state=0)

## We use the Pipeline class to define a **pipeline** that bundles the preprocessing and modeling steps

- With the pipeline, we preprocess the training data and fit the model in a single line of code. (In contrast, without a pipeline, we have to do imputation, one-hot encoding, and model training in separate steps. This becomes especially messy if we have to deal with both numerical and categorical variables!)

- With the pipeline, we supply the unprocessed features to the **predict()** command, and the pipeline automatically preprocesses the features before generating predictions. (However, without a pipeline, we have to remember to preprocess the validation data before making predictions.)

In [14]:
my_pipeline = Pipeline(steps=[('model', model)])

In [15]:
my_pipeline.fit(X_train, y_train)

Pipeline(steps=[('model', RandomForestRegressor(random_state=0))])

In [16]:
preds = my_pipeline.predict(X_test)

In [17]:
preds

array([13.53 , 32.896, 30.067, 26.92 , 14.79 , 35.063, 14.3  , 14.565,
       17.023, 27.173, 14.636, 21.418, 28.66 , 23.282, 28.85 , 25.368,
       12.33 , 20.429, 20.429, 29.174, 21.404, 19.195, 26.833, 13.947,
       12.34 , 24.741, 26.74 , 18.523, 24.415, 17.205, 13.595, 24.724,
       33.249, 25.169, 20.56 , 40.691, 18.859, 21.216, 33.05 , 24.989,
       12.975, 27.232, 29.185, 23.371, 24.785, 22.94 , 11.68 , 18.313,
       30.211, 26.375, 35.126, 13.815, 17.248, 13.45 , 24.828, 16.556,
       32.38 , 15.565, 24.45 , 14.565, 23.75 , 29.075, 20.641, 23.92 ,
       22.085, 25.898, 25.246, 25.814, 21.028, 25.558, 14.235, 21.175,
       34.547, 31.313, 15.809, 29.534, 26.042, 15.705, 16.332, 28.475,
       20.351, 25.564, 24.764, 14.015, 35.68 , 11.72 , 37.789, 21.224,
       31.08 , 26.926, 28.655, 34.741, 25.733, 36.595, 29.023, 19.779,
       16.736, 20.954, 36.091, 20.475])

## Calculating performance metrics of the trained model

In [18]:
metrics.max_error(preds, y_test)

17.359

Mean absolute error regression loss

In [19]:
metrics.mean_absolute_error(preds, y_test)

1.9735100000000005

Mean squared error regression loss

In [20]:
metrics.mean_squared_error(preds, y_test)

9.514220110000002

If **squared** = True, returns MSE value, if False returns RMSE value

In [21]:
metrics.mean_squared_error(preds, y_test, squared=False)

3.0845129453448563

R<sup>2</sup> (coefficient of determination) regression score function.

Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a
score of 0.0.

In [22]:
metrics.r2_score(preds, y_test, multioutput='variance_weighted')

0.8018904964621999

### Training using normalization

Normalize samples individually to unit norm.

Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples

In [23]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()

In [24]:
scaler.fit(X_train)

Normalizer()

In [25]:
norm_X = scaler.transform(X_train)

In [26]:
norm_X.shape

(297, 6)

In [27]:
Xn_train, Xn_test, yn_train, yn_test = train_test_split(norm_X,y_train,test_size = 0.15, random_state=42)

In [28]:
from sklearn.linear_model import LinearRegression
reg_norm = LinearRegression().fit(Xn_train, yn_train)

In [29]:
norm_preds = reg_norm.predict(Xn_test)

In [30]:
norm_preds.shape

(45,)

In [31]:
metrics.max_error(norm_preds, yn_test)

9.887051163662598

In [32]:
metrics.mean_absolute_error(norm_preds, yn_test)

1.9880152674212708

In [33]:
metrics.mean_squared_error(norm_preds, yn_test)

9.600990457779464

In [34]:
metrics.mean_squared_error(norm_preds, yn_test)

9.600990457779464

In [35]:
metrics.r2_score(norm_preds, yn_test, multioutput='variance_weighted')

0.828583668466186