In [1]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
import seaborn as sns

In [3]:
# Load seaborn's bundled "tips" example dataset into a DataFrame.
df=sns.load_dataset('tips')

In [4]:
# Preview the first rows to check the available columns and sample values.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Regression target: the bill amount. Features: every other column.
# The target is dropped by name rather than by position so the split stays
# correct even if the column order of `df` changes.
TARGET = "total_bill"
X = df.drop(columns=[TARGET])
y = df[TARGET]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and everything fitted/predicted downstream) reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
## Pipelining
# Numeric branch: fill NaN entries with the column mean, then standardize.
numeric_preprocessor = Pipeline(
    [
        ("imputation_mean", SimpleImputer(strategy="mean", missing_values=np.nan)),
        ("scaler", StandardScaler()),
    ]
)

In [8]:
from sklearn import set_config

In [9]:
# Ask scikit-learn to render estimators as diagrams when they are displayed.
set_config(display='diagram')

In [10]:
# Display the numeric pipeline (imputation step followed by the scaler).
numeric_preprocessor

In [11]:
# Categorical branch: replace missing entries with the literal "missing",
# then one-hot encode. Categories not seen during fit are ignored at
# transform time (handle_unknown="ignore") instead of raising.
categorical_preprocessor = Pipeline(
    [
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [16]:
# Route each group of columns through its own preprocessing pipeline.
# Columns not listed in either group are not passed to a transformer here.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["sex", "smoker", "day", "time"]),
        ("numerical", numeric_preprocessor, ["tip", "size"]),
    ]
)

In [17]:
# Display the combined column transformer (categorical + numerical branches).
preprocessor

In [18]:
# Baseline model: shared preprocessing followed by a random forest with
# default hyperparameters. random_state is fixed so that repeated runs (and
# the clones made during any later search over this pipeline) give the same
# forest and therefore the same predictions.
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

In [19]:
# Display the full pipeline (preprocessor followed by the regressor).
pipe

In [20]:
# Fit the preprocessing steps and the random forest on the training split.
pipe.fit(X_train,y_train)

In [21]:
# Predict total_bill for the held-out rows; the whole prediction array is shown.
# NOTE(review): no error metric is computed against y_test, so the quality of
# this baseline cannot be read off the output alone.
pipe.predict(X_test)

array([12.0876    ,  6.4253    , 17.5531    , 12.8388    , 19.5128    ,
       19.77123333, 18.56355   , 19.03335   , 26.78125833, 16.49377381,
       40.1381    , 20.02892   , 41.7549    , 11.80903333, 15.7223    ,
       11.06113333, 10.47965333, 21.67282   , 29.0696    , 20.631075  ,
       14.40357048, 13.8256    , 32.3702    , 26.6349    , 24.96228667,
       13.8256    , 18.03964   , 19.90042   , 29.2419    , 30.9844    ,
       14.90473667, 21.83233333, 15.16403333, 12.89863667, 10.9867    ,
       30.0964    , 12.6516477 , 18.90096   , 21.8559    , 28.7311    ,
       16.801     , 20.0977    , 10.9983    , 24.6132    , 15.88863333,
       20.92531905, 22.779175  , 21.66651905, 11.62104667])

In [22]:
import warnings
warnings.filterwarnings("ignore")

In [23]:
## Hyperparameter Tuning
# Search space for the pipeline's "regressor" step; keys use scikit-learn's
# "<step name>__<parameter>" convention.
# max_features: 1.0 (use all features) replaces the string "auto", which meant
# the same thing for regressors but was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it makes every candidate using it fail to fit.
param_grid = {
    "regressor__n_estimators": [200, 500],
    "regressor__max_features": [1.0, "sqrt", "log2"],
    "regressor__max_depth": [4, 5, 6, 7, 8],
}

In [24]:
# Exhaustive search over param_grid using the full pipeline as the estimator.
# No cv or scoring is passed, so the library defaults apply; n_jobs=1 keeps
# the search in a single process.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=1,
)

In [25]:
# Run the search on the training split only, trying every combination in param_grid.
grid_search.fit(X_train, y_train)

In [26]:
# Show the parameter combination the search selected as best.
grid_search.best_params_

{'regressor__max_depth': 5,
 'regressor__max_features': 'log2',
 'regressor__n_estimators': 500}

In [27]:
# Rebuild the pipeline with the hyperparameters the grid search selected.
# The values are read from grid_search.best_params_ rather than copied by
# hand from a previous run's output, so this cell stays correct when the
# search is re-run and picks a different combination. The keys already use
# the "regressor__<param>" form that Pipeline.set_params expects.
# random_state is fixed so the tuned forest is reproducible.
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
).set_params(**grid_search.best_params_)

In [28]:
# Fit the tuned pipeline on the training split.
pipe.fit(X_train,y_train)

In [29]:
# Predict total_bill for the held-out rows with the tuned pipeline.
# NOTE(review): as with the baseline, no metric against y_test is computed,
# so the two models cannot be compared from these arrays alone.
pipe.predict(X_test)

array([14.97919585, 13.53698637, 18.02431817, 17.04179455, 17.87109082,
       21.59231877, 16.74809274, 21.0195449 , 24.65954602, 17.35537403,
       29.05151568, 22.64586987, 35.51571748, 17.12574234, 17.03987715,
       12.39630682, 12.27425096, 19.30220493, 27.28383664, 19.69433022,
       16.75553678, 16.98715444, 27.05442066, 25.67539256, 24.53430761,
       16.98715444, 17.77079335, 19.07602268, 27.56128041, 27.13250075,
       15.62179513, 20.89963854, 17.72560836, 14.97058763, 14.37032951,
       32.14112514, 12.86319054, 16.35597227, 20.85399566, 26.4424758 ,
       18.09581212, 22.99820431, 14.70486718, 25.26051717, 14.9236427 ,
       21.29591138, 21.96848682, 21.34542683, 14.94736516])