# Selecting Best models with best hyperparameters

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

#Metrics
from sklearn.metrics import mean_squared_error, r2_score

# grid search
from sklearn.model_selection import GridSearchCV

# preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df=sns.load_dataset('Tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [4]:
# Designing the pipline
# 1. Encoding the categorical variables
Cat_features = df.select_dtypes(include=['object']).columns
for feature in Cat_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
models = { 
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor(),          
          'AdaBoostRegressor' : AdaBoostRegressor()
          }
best_model = None
best_model_accuracy = 0
model_scores = []
X_train, X_test, y_train, y_test = train_test_split(df.drop('tip', axis=1), df['tip'], test_size=0.2, random_state=42)
for model_name, model in models.items():
    pipe = model.fit(X_train, y_train)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    metric = mean_squared_error(y_test, y_pred)
    #print(model_name, metric)
    model_scores.append((model_name, metric))
sorts=sorted(model_scores, key=lambda x: x[1])
for sort in sorts:
    print('Mean Squared Error of', sort[0], 'is', sort[1])


Mean Squared Error of SVR is 0.538321847289585
Mean Squared Error of LinearRegression is 0.6948129686287711
Mean Squared Error of XGBRegressor is 0.7389215578875857
Mean Squared Error of GradientBoostingRegressor is 0.7996230587111121
Mean Squared Error of KNeighborsRegressor is 0.8382265306122448
Mean Squared Error of RandomForestRegressor is 0.9246167877551036
Mean Squared Error of AdaBoostRegressor is 1.0509067980637528
Mean Squared Error of DecisionTreeRegressor is 1.4656714285714285


# Best model with best hyperparameters

In [7]:
import pickle
import warnings
warnings.filterwarnings("ignore")

In [18]:
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          #'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),          
          }
best_model=[]
for name, (model,params) in models.items():
    grid = GridSearchCV(model, params, cv=5)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    print(name, 'MSE',mean_squared_error(y_test, y_pred))
    print(name, 'R2',r2_score(y_test, y_pred))
    best_model.append((name, grid.best_estimator_, mean_squared_error(y_test, y_pred)))


MSE = 2
model = None
for i in range(len(best_model)):
    if best_model[i][2] < MSE:
        MSE = best_model[i][2]
        model = best_model[i][1]

print('Best Model is', model)

LinearRegression MSE 0.6948129686287711
LinearRegression R2 0.4441368826121931
DecisionTreeRegressor MSE 0.8774153020453995
DecisionTreeRegressor R2 0.2980516670532908
RandomForestRegressor MSE 0.8754172657990968
RandomForestRegressor R2 0.2996501326932105
KNeighborsRegressor MSE 0.6437675304097399
KNeighborsRegressor R2 0.4849741693324664
GradientBoostingRegressor MSE 0.8366672932516618
GradientBoostingRegressor R2 0.33065082138418034
XGBRegressor MSE 0.7601696611425505
XGBRegressor R2 0.3918503299956485
Best Model is KNeighborsRegressor(n_neighbors=21, weights='distance')


In [20]:
# saving the best model in pickle
filename = './saved_models/best_model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [22]:
# loading the model
X_train, X_test, y_train, y_test = train_test_split(df.drop('tip', axis=1), df['tip'], test_size=0.2, random_state=101)
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
y_pred = loaded_model.predict(X_test)
print('MSE',mean_squared_error(y_test, y_pred))
print('R2',r2_score(y_test, y_pred))
print(result)

MSE 0.09437930712170399
R2 0.9605814130306976
0.9605814130306976
