**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


**Import the required scikit-learn modules**

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the Tips CSV in the YBIFoundation/Dataset GitHub repository.
tips_csv_url = 'https://github.com/YBIFoundation/Dataset/raw/main/Tips.csv'

# Download and parse the CSV into the working DataFrame.
df = pd.read_csv(tips_csv_url)

In [None]:
# Preview the first rows to confirm the file loaded and to see the column layout.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(244, 7)

**Define X & y variables**

In [None]:
# List the column names so the feature/target selection below can be checked against them.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [None]:
# The model predicts the bill amount from every other column in the frame.
target_column = 'total_bill'
feature_columns = ['tip', 'sex', 'smoker', 'day', 'time', 'size']

X = df[feature_columns]   # feature matrix (DataFrame)
y = df[target_column]     # regression target (Series)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

**Pipeline**

**Imputation of missing values and scaling**

In [None]:
# Numeric branch: replace NaN with the column mean, then standardise.
numeric_preprocessor = Pipeline(steps=[
    ("imputation_mean", SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical branch: fill gaps with the literal label 'missing', then one-hot
# encode. handle_unknown='ignore' keeps transform from raising on categories
# that were not present when the encoder was fitted.
categorical_preprocessor = Pipeline(steps=[
    ("imputation_constant", SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route each column group to its own preprocessing branch; the categorical
# block is listed first, followed by the numeric block.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_preprocessor, ['sex', 'smoker', 'day', 'time']),
        ('numerical', numeric_preprocessor, ['tip', 'size']),
    ]
)

In [None]:
# End-to-end model: column preprocessing followed by a random-forest regressor.
# The forest is seeded (same seed as the train/test split) so that fitting this
# pipeline -- and any search that clones it -- gives the same result on every
# Restart & Run All instead of varying with the bootstrap/feature sampling.
pipe = Pipeline(
    steps=[
        ("Preprocessor", preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)

In [None]:
from sklearn import set_config

In [None]:
# Ask scikit-learn to render estimators as a diagram when they are displayed.
set_config(display='diagram')

In [None]:
# Display the (not yet fitted) pipeline to check its structure.
pipe

In [None]:
# Fit the baseline pipeline (default forest settings) on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Baseline predictions for the held-out test rows.
y_pred=pipe.predict(X_test)

**Hyperparameter Tuning**

In [None]:
# Search space for the forest. Keys follow the "<step name>__<parameter>"
# convention so each value is routed to the 'regressor' step of the pipeline.
param_grid = {
    "regressor__n_estimators": [200, 500],
    # 1.0 (consider all features at every split) replaces the legacy "auto"
    # option: for regressors "auto" meant the same thing, but it was deprecated
    # in scikit-learn 1.1 and removed in 1.3, where every "auto" candidate
    # fails to fit. 1.0 is accepted by both old and new versions.
    "regressor__max_features": [1.0, "sqrt", "log2"],
    "regressor__max_depth": [4, 5, 6, 7, 8],
}

In [None]:
# Exhaustive search over param_grid, run in a single process. Scoring and
# cross-validation are not passed, so GridSearchCV's defaults apply.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=1,
)

In [None]:
# Run the search on the training split only, so the test rows stay unseen.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best during the search.
grid_search.best_params_

{'regressor__max_depth': 6,
 'regressor__max_features': 'sqrt',
 'regressor__n_estimators': 500}

**These are the best parameters found by the grid search, so we use them to build the final pipeline**

In [None]:
# Final model: same preprocessing, forest configured with the best settings
# reported by the grid search (max_depth=6, max_features='sqrt',
# n_estimators=500). The forest is seeded (same seed as the train/test split)
# so the tuned model and its predictions are identical on every run.
# Note: this rebinds `pipe`, replacing the baseline pipeline defined earlier.
pipe = Pipeline(
    steps=[
        ("Preprocessor", preprocessor),
        (
            'regressor',
            RandomForestRegressor(
                n_estimators=500,
                max_depth=6,
                max_features='sqrt',
                random_state=42,
            ),
        ),
    ]
)

In [None]:
# Fit the tuned pipeline on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Predictions of the tuned pipeline for the held-out test rows
# (kept separate from the baseline predictions in y_pred).
y_pred2= pipe.predict(X_test)