In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer ## HAndle Missing Values
from sklearn.preprocessing import StandardScaler ## Feature Scaling
from sklearn.preprocessing import OneHotEncoder ## categorical to numerical
from sklearn.compose import ColumnTransformer # connecting pipelines 

In [2]:
# Load the tips dataset from the Excel workbook in the working directory.
df = pd.read_excel('tips.xlsx')

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# The bill amount is the regression target; every other column stays as a feature.
y = df['total_bill']
X = df.drop(columns='total_bill')

In [6]:
# Inspect the feature frame.
X

Unnamed: 0,tip,sex,smoker,day,time,size
0,1.01,Female,No,Sun,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.50,Male,No,Sun,Dinner,3
3,3.31,Male,No,Sun,Dinner,2
4,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,5.92,Male,No,Sat,Dinner,3
240,2.00,Female,Yes,Sat,Dinner,2
241,2.00,Male,Yes,Sat,Dinner,2
242,1.75,Male,No,Sat,Dinner,2


In [7]:
# Inspect the target series.
y

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64

In [23]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)
X_train.shape, X_test.shape

((183, 6), (61, 6))

In [24]:
# Column groups: numeric features and categorical (string-valued) features.
num_cols = ['tip', 'size']
cat_cols = ['sex', 'smoker', 'day', 'time']


In [25]:
# Numerical pipeline: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes a category that shows up
# only at transform time (e.g. in test rows) encode as all zeros instead of
# raising an error.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [26]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ]
)

In [27]:
# Show the assembled column transformer.
preprocessor

In [28]:
# Fit the preprocessing on the training rows only, then reuse the fitted
# transformer on the test rows.
# NOTE(review): this rebinds X_train / X_test to the transformed output, so
# re-running this cell on its own feeds already-transformed data back into
# the preprocessor; re-run the train/test split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [29]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [30]:
# Candidate regressors to compare. A fixed random_state makes the fitted
# models, and therefore the reported scores, repeatable between runs.
models = {
    'Gradient Boost': GradientBoostingRegressor(random_state=10),
    'Random Forest Regressor': RandomForestRegressor(random_state=10),
}

In [31]:
from sklearn.metrics import r2_score

In [32]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and targets used for scoring.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.

    Returns
    -------
    dict
        Maps each model name to its R^2 score on the test data, in the
        iteration order of ``models``. Empty when ``models`` is empty.
    """
    report = {}
    # Walk the name/estimator pairs directly rather than rebuilding the key
    # and value lists on every iteration.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        # R^2 (coefficient of determination) is a regression metric, not accuracy.
        report[name] = r2_score(y_test, y_test_pred)
    return report


In [33]:
# Compare the untuned candidates by their R^2 on the held-out split.
evaluate_model(X_train, y_train, X_test, y_test, models)

{'Gradient Boost': 0.3708544822994033,
 'Random Forest Regressor': 0.44705902350652615}

In [34]:
# Base estimator for the hyper-parameter search; seeded so that every
# candidate is fitted repeatably.
grad = GradientBoostingRegressor(random_state=10)

In [35]:
# Hyper-parameter grid for the search: 4 * 3 * 2 * 3 = 72 combinations.
params = dict(
    loss=['squared_error', 'absolute_error', 'huber', 'quantile'],
    learning_rate=[0.1, 0.01, 1],
    criterion=['friedman_mse', 'squared_error'],
    n_estimators=[100, 150, 200],
)

In [36]:
from sklearn.model_selection import GridSearchCV

In [37]:
# Exhaustive search over `params` with 6-fold cross-validation; verbose=3
# logs every individual fit.
clf = GridSearchCV(
    estimator=grad,
    param_grid=params,
    cv=6,
    verbose=3,
)

In [38]:
# Run the grid search on the training data.
clf.fit(X_train,y_train)

Fitting 6 folds for each of 72 candidates, totalling 432 fits
[CV 1/6] END criterion=friedman_mse, learning_rate=0.1, loss=squared_error, n_estimators=100;, score=0.228 total time=   0.1s
[CV 2/6] END criterion=friedman_mse, learning_rate=0.1, loss=squared_error, n_estimators=100;, score=0.396 total time=   0.1s
[CV 3/6] END criterion=friedman_mse, learning_rate=0.1, loss=squared_error, n_estimators=100;, score=0.645 total time=   0.1s
[CV 4/6] END criterion=friedman_mse, learning_rate=0.1, loss=squared_error, n_estimators=100;, score=0.554 total time=   0.1s
[CV 5/6] END criterion=friedman_mse, learning_rate=0.1, loss=squared_error, n_estimators=100;, score=0.560 total time=   0.1s
[CV 6/6] END criterion=friedman_mse, learning_rate=0.1, loss=squared_error, n_estimators=100;, score=0.449 total time=   0.1s
[CV 1/6] END criterion=friedman_mse, learning_rate=0.1, loss=squared_error, n_estimators=150;, score=0.180 total time=   0.1s
[CV 2/6] END criterion=friedman_mse, learning_rate=0.1, 

In [39]:
# Parameter combination selected by the search.
clf.best_params_

{'criterion': 'friedman_mse',
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'n_estimators': 100}

In [44]:
# Cross-validated score of the selected parameter combination.
clf.best_score_

0.47206951762350086

In [45]:
# Predict the held-out rows with the fitted search object.
y_predict = clf.predict(X_test)

In [46]:
# R^2 of the tuned model's predictions against the held-out targets.
r2_score(y_test, y_predict)

0.3732289807250976