# Scikit PipeLine

In [1]:
#Getting data ready
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#Modelling

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
#Import data and drop rows with missing values

df = pd.read_csv("car-sales-extended-missing-data.csv")
df

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
df.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
#checking the missing values
df.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [5]:
df.dropna(subset = ["Price"],inplace = True)

In [6]:
df.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [7]:
# Define different features and transformer pipeline

categorical_feature = ["Make","Colour"]
categorical_transformer = Pipeline(steps =[
    ("imputer",SimpleImputer(strategy = "constant",fill_value = "missing")),
    ("onehot",OneHotEncoder(handle_unknown = "ignore"))
])

door_feature = ["Doors"]
door_transformer = Pipeline(steps= [
    ("imputer",SimpleImputer(strategy = "constant",fill_value = 4))
])

num_feature = ["Odometer (KM)"]
num_transformer = Pipeline(steps = [
    ("imputer",SimpleImputer(strategy = "mean"))
])

# Setup Preprocessing steps(fill missing values then convert to numbers)

preprocessor = ColumnTransformer(
                        transformers = [
                            ("cat",categorical_transformer,categorical_feature),
                            ("door",door_transformer,door_feature),
                            ("num",num_transformer,num_feature)
                        ])

# Creating a preprocessor and modeling pipeline
model = Pipeline(steps = [
    ("preprocessor",preprocessor),
    ("model",RandomForestRegressor())
])

# Split the data 
X = df.drop(["Price"],axis = 1)
y = df["Price"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30,random_state  = 22)

# Fit the data 
model.fit(X_train,y_train)
model.score(X_test,y_test)


0.2128759424933917

In [8]:
# # Setup Preprocessing steps(fill missing values then convert to numbers)

# preprocessor = ColumnTransformer(
#                         transformers = [
#                             ("cat",categorical_transformer,categorical_feature),
#                             ("door",door_transformer,door_feature),
#                             ("num",num_transformer,num_feature)
#                         ])

# # Creating a preprocessor and modeling pipeline
# model = Pipeline(steps = [
#     ("preprocessor",preprocessor),
#     ("model",RandomForestRegressor())
# ])

In [10]:
# Split the data 
X = df.drop(["Price"],axis = 1)
y = df["Price"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30,random_state  = 22)

# Fit the data 
model.fit(X_train,y_train)
model.score(X_test,y_test)


0.2201482452831104

**It's also possible to use GridSearch CV and Randomized SearchCV with pipeline..**

In [13]:
from sklearn.model_selection import GridSearchCV

pipe_grid = {
    "preprocessor__num__imputer__strategy":["mean","median"],
    "model__n_estimators":[300,1000],
    "model__max_depth":[None,5],
    "model__max_features":["auto"],
    "model__min_samples_split":[2,4]
}

gs_model = GridSearchCV(model,pipe_grid,cv = 5,verbose = 2)
gs_model.fit(X_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean, total=   1.2s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean, total=   1.0s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean, total=   1.0s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean, total=   1.5s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=300, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, 

[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=300, preprocessor__num__imputer__strategy=median, total=   0.8s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=300, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=300, preprocessor__num__imputer__strategy=median, total=   0.8s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=300, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=300, preprocessor__num__imputer__strategy=median, total=   0.8s
[CV] model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_feat

[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean, total=   2.1s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median, total=   2.6s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median, total=   2.3s
[CV] model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median 
[CV]  model__max_depth=5, model__max_features=auto, model

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  2.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    

In [14]:
gs_model.score(X_test,y_test)

0.2576728947657382