In [1]:
#Hyperparameter Tuning in Pipelining Machine Learning
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


In [2]:
import seaborn as sns


In [3]:
df = sns.load_dataset('tips'
                      )

In [4]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
X = df.iloc[:,1:]
y = df['total_bill']

In [6]:
from sklearn.model_selection import train_test_split


In [7]:
#random_state is a parameter that controls the random number generator used to shuffle data before splitting it. 
X_train, X_test,y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [8]:
#Pipelining
numeric_processor = Pipeline(
    steps=[("Imputation_mean",SimpleImputer(missing_values=np.nan, strategy="mean")),
           ("scaler",StandardScaler()),          
           ]
)

In [10]:
#Categorical Processor
from sklearn.preprocessing import OneHotEncoder
categorical_processor = Pipeline(
    steps=[("Imputation_constant",SimpleImputer(fill_value = 'missing', strategy="constant")),
           ("one_hot",OneHotEncoder(handle_unknown="ignore")),         
           ]
)

In [11]:
preprocessor = ColumnTransformer(
    [('categorical',categorical_processor,['sex','smoker','day']),
     ('numerical',numeric_processor,['tip','size'])
     ]
)

In [12]:
pipe = Pipeline(
    steps = [("preprocessor",preprocessor),('regressor',RandomForestRegressor())]
)

In [13]:
from sklearn import set_config

In [14]:
set_config(display = 'diagram')

In [15]:
pipe

In [16]:
pipe.fit(X_train, y_train)

In [17]:
pipe.predict(X_test)

array([18.3725    , 13.38390381, 20.401     , 28.02382   , 12.98980694,
       14.03921881, 15.9754    , 15.39701429, 20.9336    , 19.78879167,
       19.1521    , 13.6721    , 10.51614   , 14.03921881, 10.89916262,
       15.1041    , 21.4478    , 18.9065    , 14.283705  , 25.6292    ,
       21.6746    , 20.2098    , 20.9       , 13.6721    , 25.4179    ,
       16.65545   , 14.05722   , 25.5741    , 20.401     , 23.6381    ,
       22.8276    , 13.4624    , 19.1993    , 18.83113333, 21.4062    ,
       21.418     , 12.9021    , 29.2938    , 18.26641429, 13.86439667,
       13.1508    , 11.90993755, 15.79857   , 15.5675    , 14.61315   ,
       11.19171667, 19.19892917, 17.6045    , 11.33606667])

In [18]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
##Hyperparamter Tuning
param_grid = {
    'regressor__n_estimators':[200,500],
    'regressor__max_features':['auto','sqrt','log2'],
    'regressor__max_depth':[4,5,6,7,8]

}

In [31]:
grid_search = GridSearchCV(pipe,param_grid=param_grid, n_jobs=-1)

In [32]:
grid_search.fit(X_train, y_train)

In [26]:
grid_search.best_params_


{'regressor__n_estimators': 500}

In [33]:
pipe = Pipeline(
    steps = [("preprocessor",preprocessor),('regressor',RandomForestRegressor(n_estimators = 500))]
)

In [34]:
pipe.fit(X_train,y_train)

In [35]:
pipe.predict(X_test)

array([17.47795333, 13.54884767, 20.089792  , 28.729116  , 12.95215627,
       14.322464  , 15.84898457, 15.2418014 , 21.10346   , 20.21347748,
       18.75902   , 13.70962   , 10.504412  , 14.322464  , 10.97926062,
       13.8196    , 22.0763    , 19.19176   , 14.56568555, 26.33448   ,
       20.33222   , 20.59248   , 20.33396   , 13.76966   , 23.52004   ,
       16.21761633, 13.41360686, 24.91924   , 20.089792  , 24.41578   ,
       23.0563    , 13.57634   , 19.03952   , 18.50346   , 21.07706   ,
       21.82734   , 13.156505  , 29.64328   , 18.47690286, 14.27319167,
       13.79654   , 12.08664737, 15.74776057, 14.16572   , 14.4786789 ,
       10.45471667, 18.81546548, 17.80012   , 11.182274  ])