# Scikit Learn Pipelines

Pipelines chains together multiple steps so that the output of each step is used as input to the next step. Pipelines makes it easy to apply the same preprocessing to train and test!

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("../Datasets/titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Without Using Pipeline
### Create a Model

In [3]:
df.drop(columns = ["PassengerId", "Name", "Ticket", "Cabin"], inplace = True)

In [4]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
# step 1 -> train/test/split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns = ["Survived"], axis = 1),
    df["Survived"],
    test_size = 0.2,
    random_state = 42
)

In [8]:
# applying imputation
si_age           = SimpleImputer()
si_embarked      = SimpleImputer(strategy = "most_frequent")

X_train_age      = si_age.fit_transform(X_train[["Age"]])
X_train_embarked = si_embarked.fit_transform(X_train[["Embarked"]])

X_test_age       = si_age.transform(X_test[["Age"]])
X_test_embarked  = si_embarked.transform(X_test[["Embarked"]])

In [9]:
# OneHotEncoding "Sex" & "EMbarked" column
ohe_sex          = OneHotEncoder(sparse = False, handle_unknown = "ignore")
ohe_embarked     = OneHotEncoder(sparse = False, handle_unknown = "ignore")

X_train_sex      = ohe_sex.fit_transform(X_train[["Sex"]])
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)

X_test_sex       = ohe_sex.transform(X_test[["Sex"]])
X_test_embarked  = ohe_embarked.transform(X_test_embarked)

In [10]:
# now time to create new array with transformed data

X_train_remaining   = X_train.drop(columns = ["Sex", "Age", "Embarked"], axis = 1)
X_test_remaining    = X_test.drop(columns = ["Sex", "Age", "Embarked"], axis = 1)

X_train_transformed = np.concatenate((X_train_remaining, X_train_age, X_train_sex, X_train_embarked), axis = 1)
X_test_transformed  = np.concatenate((X_test_remaining, X_test_age, X_test_sex, X_test_embarked), axis = 1)

In [11]:
print(X_train_transformed.shape)
print(X_test_transformed.shape)

(712, 10)
(179, 10)


In [12]:
# now build DecisionTree Classifier model

clf = DecisionTreeClassifier()

clf.fit(X_train_transformed, y_train)

DecisionTreeClassifier()

In [13]:
y_pred = clf.predict(X_test_transformed)

In [14]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.776536312849162

In [16]:
# save the required models for further use-case

import pickle

pickle.dump(ohe_sex, open("models/ohe_sex.pkl", "wb"))
pickle.dump(ohe_embarked, open("models/ohe_embarked.pkl", "wb"))
pickle.dump(clf, open("models/clf.pkl", "wb"))

### Use the Model

In [18]:
import numpy as np
import pickle

In [20]:
ohe_sex      = pickle.load(open("models/ohe_sex.pkl", "rb"))
ohe_embarked = pickle.load(open("models/ohe_embarked.pkl", "rb"))
clf          = pickle.load(open("models/clf.pkl", "rb"))

In [23]:
# assuming user input
# Pclass/gender/age/SibSp/Parch/Fare/Embarked
test_input = np.array([2, "male", 31.0, 0, 0, 10.5, "S"], dtype=object).reshape(1, 7)
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [24]:
test_input_sex         = ohe_sex.transform(test_input[:, 1].reshape(1, 1))
test_input_embarked    = ohe_embarked.transform(test_input[:, -1].reshape(1, 1))
test_input_age         = test_input[:, 2].reshape(1, 1)

test_input_transformed = np.concatenate(
    (test_input[:, [0, 3, 4, 5]], test_input_age, test_input_sex, test_input_embarked),
    axis = 1
)
test_input_transformed.shape

(1, 10)

In [25]:
clf.predict(test_input_transformed)

array([1], dtype=int64)

## Using Pipeline

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [41]:
df = pd.read_csv("../Datasets/titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [42]:
df.drop(columns = ["PassengerId", "Name", "Ticket", "Cabin"], inplace = True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### Create Transformer Chains

In [43]:
# Step 1 -> train/test/split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns = ["Survived"], axis = 1),
    df["Survived"],
    test_size = 0.2,
    random_state = 42
)

X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [44]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [45]:
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [46]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [47]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=8)

In [48]:
# train the model
trf5 = DecisionTreeClassifier()

### Create Pipeline

In [49]:
pipe = Pipeline([
    ("trf1", trf1),
    ("trf2", trf2),
    ("trf3", trf3),
    ("trf4", trf4),
    ("trf5", trf5)
])

### Pipeline vs make_pipeline

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [None]:
# alternate syntax
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [51]:
# train. If we will do only the transformation processes like OHE rather than creating the model,
# then we will call "fit_transform" rather than "fit" method. We can use "fit" that case but then 
# we have to transform explicetly by "transform" method.
from sklearn import set_config

set_config(display = "diagram")

pipe.fit(X_train, y_train)

### Explore the Pipeline

In [52]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x000001B1D0F91EE0>),
 'trf5': DecisionTreeClassifier()}

In [53]:
pipe.named_steps["trf1"]

In [54]:
pipe.named_steps["trf1"].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

In [55]:
pipe.named_steps["trf1"].transformers_[0]

('impute_age', SimpleImputer(), [2])

In [56]:
pipe.named_steps["trf1"].transformers_[0][1]

In [57]:
pipe.named_steps["trf1"].transformers_[0][1].statistics_

array([29.49884615])

In [58]:
pipe.named_steps["trf1"].transformers_[1]

('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])

In [59]:
pipe.named_steps["trf1"].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [60]:
# predict
y_pred = pipe.predict(X_test)

# accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6256983240223464

### Cross Validation using Pipeline

In [61]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score

cross_val_score(pipe, X_train, y_train, cv = 5, scoring = "accuracy").mean()

0.6391214419383433

### GridSearch using Pipeline

In [62]:
# gridsearchcv
params = {
    "trf5__max_depth": [1, 2, 3, 4, 5, None]
}

In [63]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, params, cv = 5, scoring = "accuracy")
grid.fit(X_train, y_train)

In [64]:
grid.best_score_

0.6391214419383433

In [65]:
grid.best_params_

{'trf5__max_depth': 2}

### Exporting the Pipeline

In [66]:
# export
import pickle

pickle.dump(pipe, open("./models/pipe.pkl", "wb"))

### Production Code

In [67]:
import numpy as np
import pickle

In [68]:
pipe = pickle.load(open("./models/pipe.pkl", "rb"))

In [69]:
# assuming user input
test_input2 = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'], dtype = object).reshape(1, 7)
test_input2

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [70]:
pipe.predict(test_input2)

array([0], dtype=int64)