In [2]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
import seaborn as sns

In [4]:
df=sns.load_dataset('tips')

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
X=df.iloc[:,1:]
y=df['total_bill']

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [18]:
## Pieplining
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
)

In [19]:
from sklearn import set_config

In [20]:
set_config(display='diagram')

In [21]:
numeric_preprocessor

In [22]:
categorical_preprocessor = Pipeline(
    steps=[
        (
            "imputation_constant",
            SimpleImputer(fill_value="missing", strategy="constant"),
        ),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [23]:
preprocessor=Pipeline(
    steps=[("categorical",categorical_preprocessor),("numerical",numeric_preprocessor)]

)

In [24]:
preprocessor

In [25]:
pipe=Pipeline(
    [("preprocessor",preprocessor),("regressor",RandomForestRegressor())]

)

In [26]:
pipe

In [27]:
pipe.fit(X_train,y_train)

In [28]:
pipe.predict(X_train)

array([11.99381476, 17.00800857, 17.8905    , 15.6754    , 13.45985   ,
       11.2465    , 12.09888   , 26.37066333, 13.32662667, 17.6671    ,
       14.37736667, 34.1225    , 13.481115  , 18.32921   , 13.3457    ,
       18.60443333, 15.1202    , 28.04366   , 12.5968    , 19.03246667,
       27.3577    , 11.24555   , 13.721425  , 26.46386333, 30.35666   ,
       17.0751    , 19.58246667, 17.41085167, 24.1508319 , 14.99195333,
       12.9746    , 27.71417667, 12.82863333, 28.597     , 19.461     ,
       18.10568333, 12.3327    ,  7.3938    , 21.406     , 26.9642    ,
       13.13967333, 11.9527    , 41.4601    ,  9.17466667, 12.26943571,
       31.9176    , 11.52658333, 18.4225    , 15.7853    , 20.6333019 ,
       21.5881    , 16.99473333, 12.87025   , 11.1457    , 15.58284167,
       25.6775    , 12.1814    , 20.94362   , 23.3018    , 12.41167   ,
       16.69623333, 32.3644    , 13.25136667, 11.7829    , 19.0635    ,
       20.3253    , 21.137     , 33.461     , 21.57916667, 17.42

In [29]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
#Hperparameter Tuning
param_grid = {
    'regressor__n_estimators': [200,500],
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
    'regressor__max_depth': [4,5,6,7,8]
}

In [31]:
grid_search = GridSearchCV(pipe,param_grid=param_grid, n_jobs=1)

In [32]:
grid_search.fit(X_train,y_train)

In [33]:
grid_search.best_params_

{'regressor__max_depth': 8,
 'regressor__max_features': 'sqrt',
 'regressor__n_estimators': 200}

In [35]:
pipe=Pipeline(
    [("preprocessor",preprocessor),("regressor",RandomForestRegressor(max_depth= 8, max_features= 'sqrt', n_estimators= 200))]
)

In [36]:
pipe.fit(X_train,y_train)

In [37]:
pipe.predict(X_train)

array([14.59208226, 17.6591216 , 16.32758672, 16.35956246, 16.76421178,
       13.85427207, 15.22672762, 25.3279354 , 16.60924412, 21.78701963,
       17.25389265, 32.70325266, 17.35644036, 16.25051842, 16.75156985,
       17.92283017, 17.93380816, 24.49211499, 14.202116  , 18.11081809,
       26.20410658, 17.93133837, 16.84263015, 25.08342934, 26.09034321,
       16.64806315, 17.53652281, 16.15065759, 19.33816074, 16.73865389,
       15.80229013, 25.11084629, 17.51387743, 26.81257574, 18.27794524,
       17.09254934, 16.85361828, 10.8025497 , 22.46638421, 22.83212777,
       15.4863703 , 16.44592023, 36.16517404, 15.71528032, 15.07191121,
       28.0812595 , 15.61539172, 21.6095277 , 17.06753199, 23.14990767,
       21.77351342, 17.34155732, 14.59798626, 15.28226621, 17.37764589,
       25.34477005, 16.38489618, 24.32312658, 21.91133286, 15.95239202,
       17.33556014, 24.01377135, 15.87675159, 16.45152575, 21.76475381,
       21.6962251 , 18.3132151 , 29.36192515, 17.33870393, 19.03