# How Pipeline and GridSearchCV work together

In [53]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

import pandas as pd

## Load and Prepare Training Set

In [35]:
# Load the training set. A forward slash is used as the separator: the previous
# 'Datasets\Train.csv' relied on the invalid escape sequence '\T' and only
# resolved on Windows, whereas this form works on every platform.
train_df = pd.read_csv('Datasets/Train.csv')
train_df.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_0040R73,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,Black,TOYOTA,,,Car Classic,0
1,ID_0046BNK,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,Grey,TOYOTA,,,Car Classic,1
2,ID_005QMC3,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,Red,TOYOTA,,,Car Classic,0
3,ID_0079OHW,2010-08-21,2011-08-20,Male,2,2010-08-21,1,,,,,,CarSafe,0
4,ID_00BRP63,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,,Lagos,Lagos,Muuve,1


Fix column data types

In [36]:
# Column groups, keyed by the dtype each group should end up with
date_cols = ['Policy Start Date', 'Policy End Date', 'First Transaction Date']
num_cols = ['Age', 'No_Pol']
cat_cols = ['Gender', 'Car_Category', 'Subject_Car_Colour', 'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName']

# Parse the date columns; any listed column missing from the frame is skipped
for col in date_cols:
    if col in train_df.columns:
        train_df[col] = pd.to_datetime(train_df[col])

# Store the categorical columns with the pandas 'category' dtype
for col in cat_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype('category')

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12079 entries, 0 to 12078
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   ID                      12079 non-null  object        
 1   Policy Start Date       12079 non-null  datetime64[ns]
 2   Policy End Date         12079 non-null  datetime64[ns]
 3   Gender                  11720 non-null  category      
 4   Age                     12079 non-null  int64         
 5   First Transaction Date  12079 non-null  datetime64[ns]
 6   No_Pol                  12079 non-null  int64         
 7   Car_Category            8341 non-null   category      
 8   Subject_Car_Colour      5117 non-null   category      
 9   Subject_Car_Make        9603 non-null   category      
 10  LGA_Name                5603 non-null   category      
 11  State                   5591 non-null   category      
 12  ProductName             12079 non-null  catego

Map the non-standard Gender values to a single 'Other' category

In [37]:
# Every label listed here is collapsed into the single value 'Other'
mapper = dict.fromkeys(['Entity', 'Joint Gender', 'NOT STATED', 'NO GENDER', 'SEX'], 'Other')
train_df['Gender'] = train_df['Gender'].replace(mapper)

# Confirm mappings (dropna=False keeps the missing values in the tally)
train_df['Gender'].value_counts(dropna=False)

Male      7617
Female    3327
Other      776
NaN        359
Name: Gender, dtype: int64

Drop date columns for this exercise

In [38]:
# Remove the date columns and use the policy ID as the row index
train_df = train_df.drop(columns=date_cols).set_index('ID')
train_df.head()

Unnamed: 0_level_0,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ID_0040R73,Male,30,1,Saloon,Black,TOYOTA,,,Car Classic,0
ID_0046BNK,Female,79,1,JEEP,Grey,TOYOTA,,,Car Classic,1
ID_005QMC3,Male,43,1,Saloon,Red,TOYOTA,,,Car Classic,0
ID_0079OHW,Male,2,1,,,,,,CarSafe,0
ID_00BRP63,Other,20,3,,,,Lagos,Lagos,Muuve,1


Separate target variable from data

In [39]:
# Features: every column except the label
X = train_df.drop(columns=['target'])
# Label: the `target` column on its own
y = train_df['target']
X.head()

Unnamed: 0_level_0,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ID_0040R73,Male,30,1,Saloon,Black,TOYOTA,,,Car Classic
ID_0046BNK,Female,79,1,JEEP,Grey,TOYOTA,,,Car Classic
ID_005QMC3,Male,43,1,Saloon,Red,TOYOTA,,,Car Classic
ID_0079OHW,Male,2,1,,,,,,CarSafe
ID_00BRP63,Other,20,3,,,,Lagos,Lagos,Muuve


## Create Preprocessors

Preprocessors for different data types within the same dataset can be created and bundled together

In [40]:
# Create preprocessor for numerical data:
# missing values are replaced with the mean of their column
numerical_transformer = SimpleImputer(strategy='mean')

Categorical data often needs several preprocessing steps, e.g. imputation followed by encoding. Use **Pipeline** to bundle these steps together. This small pipeline is later nested inside the main pipeline used for training and testing, so you are effectively calling a pipeline within a pipeline

In [41]:
# Create preprocessor for categorical data: fill gaps with the most frequent
# value of each column, then one-hot encode the result as a dense array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ]
)

**ColumnTransformer** allows you to bundle each preprocessor together and apply each one only to a specific set of columns that you specify

In [42]:
# Route each column group to its own preprocessor: the numerical transformer
# sees only `num_cols`, the categorical transformer sees only `cat_cols`
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    n_jobs=-1,
)

Normalization (here, standardization with **StandardScaler**) is done last, as its own pipeline step, because it applies to all of the preprocessed columns

In [45]:
# Create normalization preprocessor: a StandardScaler that is used as the
# 'normalization' step of the pipeline
normalization = StandardScaler()

Create the model and set parameter values we want to test

In [46]:
from sklearn.linear_model import LogisticRegression

# Estimator whose C parameter is tuned by the grid search
model = LogisticRegression(n_jobs=-1)

# Grid keys follow the pattern '<pipeline step name>__<parameter name>'.
# 'model__C' therefore targets the C parameter of the step registered as
# 'model', and not 'preprocessor', 'normalization', etc.
parameters = {
    'model__C': [0.01, 0.1, 1, 10, 20, 50, 100],
}

Create Pipeline with preprocessing, normalization and model

In [47]:
# Chain preprocessing, scaling and the model so they are fitted and applied in order
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('normalization', normalization),
        ('model', model),
    ],
    verbose=True,  # print the time taken by each step while fitting
)

## Create classifier using GridSearchCV

Using GridSearchCV with the Pipeline as its estimator gives you cross-validation and parameter tuning in one step, and it also avoids data leakage. For every split, the preprocessors are fitted only on the training portion; those fitted preprocessors are then used to transform the test portion. The preprocessors are never fitted on the test portion of a split

In [48]:
from sklearn.model_selection import GridSearchCV

# Grid search over `parameters` with the whole pipeline as the estimator;
# candidates are ranked by their f1 score
clf = GridSearchCV(
    estimator=my_pipeline,
    param_grid=parameters,
    scoring='f1',
    n_jobs=-1,
)

## Train and test the classifier

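**fit** runs the whole search: for every parameter value it trains the pipeline on the training portion of each cross-validation split and scores it on the held-out portion of that split. By default it then refits the best-scoring configuration on all of the data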

In [49]:
# Run the grid search: every candidate in `parameters` is cross-validated on X and y
clf.fit(X, y)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.1s
[Pipeline] ..... (step 2 of 3) Processing normalization, total=   0.1s
[Pipeline] ............. (step 3 of 3) Processing model, total=   5.2s


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=-1,
                                                          transformers=[('num',
                                                                         SimpleImputer(),
                                                                         ['Age',
                                                                          'No_Pol']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignor

## View Results

In [50]:
# One row per candidate value of model__C, with timings, per-split f1 scores,
# their mean/std and the resulting rank
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.09744,0.06704,0.156603,0.018038,0.01,{'model__C': 0.01},0.117318,0.123894,0.135385,0.128834,0.172619,0.13561,0.019429,7
1,2.930864,0.103889,0.169631,0.038824,0.1,{'model__C': 0.1},0.121212,0.132184,0.151515,0.126888,0.176991,0.141758,0.020354,6
2,4.179509,0.162854,0.153841,0.020935,1.0,{'model__C': 1},0.121212,0.14245,0.151515,0.126888,0.176991,0.143811,0.019795,4
3,4.993836,0.193062,0.194853,0.08138,10.0,{'model__C': 10},0.121212,0.142045,0.151515,0.126506,0.176991,0.143654,0.019867,5
4,4.636549,0.765661,0.100982,0.051115,20.0,{'model__C': 20},0.126374,0.142045,0.151515,0.132132,0.176991,0.145812,0.017798,1
5,3.948898,0.285169,0.085959,0.059152,50.0,{'model__C': 50},0.126374,0.142045,0.151515,0.132132,0.176991,0.145812,0.017798,1
6,2.639292,0.88875,0.038583,0.012978,100.0,{'model__C': 100},0.126374,0.142045,0.151515,0.132132,0.176991,0.145812,0.017798,1


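You can use **clf.best_estimator_** to perform predictions directly. It is the pipeline configured with the top-ranked parameter value from the table above (here C = 20, 50 and 100 tie for rank 1; check **clf.best_params_** to see which value was selected)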

In [52]:
# Display the pipeline selected by the grid search
clf.best_estimator_

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('num', SimpleImputer(),
                                                  ['Age', 'No_Pol']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Gender', 'Car_Category',
                                                   'Subject_Car_Colour',
                                                   'Subject_Car_Make',
                              