# Building a scikit-learn pipeline

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn import __version__ as scikit_version

import pickle
import joblib

In [2]:
cali_data = fetch_california_housing(as_frame=True)
cali_data.data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [3]:
training_data = cali_data.data
target_value = cali_data.target

X_train, X_test, y_train, y_test = train_test_split(training_data, target_value, test_size = 0.2, random_state=5) 

In [4]:
pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('std_scaler', StandardScaler()),
    ('algorithm_regression', LinearRegression())
    ])

In [5]:
pipeline.fit(X_train, y_train)

In [6]:
train_score = pipeline.score(X_train,y_train)
test_score = pipeline.score(X_test,y_test)

print(f"Training set score: {train_score:.2f}")
print(f"Test  set score: {test_score:.2f}")

Training set score: 0.60
Test  set score: 0.61


## Checking different model types

In [7]:
alg_choices = [LinearRegression(),
               DecisionTreeRegressor(max_depth=5),
               RandomForestRegressor(max_depth=5)]

pipelines = {}

for alg in alg_choices:
    pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('std_scaler', StandardScaler()),
    ('algorithm', alg)
    ])
    
    pipeline.fit(X_train, y_train)
    
    train_score = pipeline.score(X_train,y_train)
    
    pipelines[type(alg).__name__] = pipeline
    
    print(f"{str(alg)} training set score: {train_score:0.2f}")

LinearRegression() training set score: 0.60
DecisionTreeRegressor(max_depth=5) training set score: 0.63
RandomForestRegressor(max_depth=5) training set score: 0.67


In [8]:
pipeline.steps

[('imputer', SimpleImputer()),
 ('std_scaler', StandardScaler()),
 ('algorithm', RandomForestRegressor(max_depth=5))]

In [9]:
pipeline.steps[0]

('imputer', SimpleImputer())

In [10]:
pipeline[0]

In [11]:
pipeline[0].strategy

'mean'

## Make_pipeline

In [12]:
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    (SimpleImputer(missing_values=np.nan, strategy='mean')),
    (StandardScaler()),
    (LinearRegression())
    )

## Using your testset

In [13]:
predictions_test = pipelines['DecisionTreeRegressor'].predict(X_test)

In [14]:
predictions = X_test.assign(predictions= predictions_test).assign(actual= y_test)
# new_frame["prediction"] = predictions_test
# new_frame["actual"] = y_test
predictions

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,predictions,actual
14772,2.7361,14.0,4.206612,0.983471,1224.0,3.371901,32.57,-117.07,1.665118,0.93600
10105,3.6106,18.0,4.393468,1.026439,1590.0,2.472784,33.92,-117.95,1.788680,1.53600
20094,2.1250,26.0,37.063492,7.185185,416.0,2.201058,38.19,-120.03,0.859253,1.32500
19261,2.6576,23.0,5.163004,1.120879,1644.0,3.010989,38.44,-122.74,1.271987,1.47900
14139,2.5583,31.0,4.606335,1.133484,1263.0,2.857466,32.75,-117.07,1.271987,1.20700
...,...,...,...,...,...,...,...,...,...,...
11352,2.6442,23.0,4.134259,1.032407,1149.0,5.319444,33.75,-117.92,1.665118,1.56300
16121,3.7419,52.0,5.121891,1.034826,999.0,2.485075,37.79,-122.46,1.788680,5.00001
8976,2.8797,33.0,3.577922,1.054113,2475.0,2.678571,34.01,-118.41,1.665118,2.85300
8439,3.0132,44.0,4.905660,1.094340,392.0,3.698113,33.93,-118.36,1.271987,2.02500


In [15]:
print(pipelines['DecisionTreeRegressor'].score(X_test,y_test))
print(pipelines['RandomForestRegressor'].score(X_test,y_test))
print(pipelines['LinearRegression'].score(X_test,y_test))

0.6300950669272196
0.6742898560305344
0.6112568432827636


In [16]:
pipelines['RandomForestRegressor']

## Finding optimal hyperparameters with GridSearchCV

In [17]:
pipelines = {}

pipelines['RandomForestRegressor']= Pipeline([
('imputer', SimpleImputer()),
('std_scaler', StandardScaler()),
('algorithm', RandomForestRegressor(max_depth=5))
])

In [18]:
pipelines['RandomForestRegressor'].get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('std_scaler', StandardScaler()),
  ('algorithm', RandomForestRegressor(max_depth=5))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'std_scaler': StandardScaler(),
 'algorithm': RandomForestRegressor(max_depth=5),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 'deprecated',
 'std_scaler__copy': True,
 'std_scaler__with_mean': True,
 'std_scaler__with_std': True,
 'algorithm__bootstrap': True,
 'algorithm__ccp_alpha': 0.0,
 'algorithm__criterion': 'squared_error',
 'algorithm__max_depth': 5,
 'algorithm__max_features': 1.0,
 'algorithm__max_leaf_nodes': None,
 'algorithm__max_samples': None,
 'algorithm__min_impurity_decrease': 0.0,
 'algorithm__min_samples_leaf': 1,
 'algorithm__min_samples_split': 2,
 'algorithm__min_weight_fraction_leaf': 0.0,
 'algorithm__n_estimators': 100,
 'algorithm__n_jobs':

In [19]:
grid_parameters = {'imputer__strategy':['mean','median'],
                  'algorithm__criterion':['squared_error', 'absolute_error'],
                   'algorithm__n_estimators':[10,100],
                   'algorithm__max_depth':[5,10,None]
                  }

In [20]:
grid = GridSearchCV(pipelines['RandomForestRegressor'], grid_parameters, cv = 3, n_jobs = -1,verbose=2)

In [21]:
%%time 
grid.fit(X_train,y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
CPU times: total: 5min 59s
Wall time: 21min 47s


In [22]:
print(f'The best parameters are:{grid.best_params_}')
print(f'The best score was:{grid.best_score_:.2f}')

The best parameters are:{'algorithm__criterion': 'absolute_error', 'algorithm__max_depth': None, 'algorithm__n_estimators': 100, 'imputer__strategy': 'median'}
The best score was:0.80


In [23]:
best_model = grid.best_estimator_
print(f'{best_model.score(X_test,y_test):.2f}')

0.82


## RandomizedSearchCV

In [24]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(
    pipelines['RandomForestRegressor'], grid_parameters, n_iter=10
)

In [25]:
%%time 
rs = rs.fit(X_train, y_train)

CPU times: total: 1h 3min 39s
Wall time: 1h 4min 6s


In [27]:
print(rs.best_estimator_)

Pipeline(steps=[('imputer', SimpleImputer()), ('std_scaler', StandardScaler()),
                ('algorithm',
                 RandomForestRegressor(criterion='absolute_error'))])


## Versioning your model

In [28]:
model_version = "1"

filename = f'pipeline_random_forest_v{model_version}-{scikit_version}.pk1'
pickle.dump(best_model, open(filename, 'wb'))

In [29]:
# Load the model itself
pickle_file = pickle.load(open(filename, 'rb'))
pickle_file

In [30]:
# Using joblib
model_version = "1"

filename = f'pipeline_random_forest_joblib_v{model_version}-{scikit_version}.pk1'
 
joblib.dump(best_model, filename, compress=7)

['pipeline_random_forest_joblib_v1-1.1.3.pk1']

In [31]:
# Using joblib
model_version = "1"

filename = f'random_forest_joblib_v{model_version}-{scikit_version}.pk1'
 
joblib.dump(pipelines['RandomForestRegressor'], filename, compress=7)

['random_forest_joblib_v1-1.1.3.pk1']

In [32]:
# Load the joblib file
pickle_file = joblib.load(filename)
pickle_file