#### Import necessary packages

In [56]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import make_scorer
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
import pickle

#### Load the datasets

In [20]:
# Training split: feature table and target table, read from the working directory
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

In [26]:
# Validation split: feature table and target table, read from the working directory
X_val = pd.read_csv("X_val.csv")
y_val = pd.read_csv("y_val.csv")

In [34]:
# Test split: feature table and target table, read from the working directory
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [59]:
# Kaggle submission template; its 'visitors' column is filled with model
# predictions before the file is written back out
sample_submission1 = pd.read_csv(
    "recruit-restaurant-visitor-forecasting/sample_submission.csv/sample_Submission.csv"
)

In [61]:
def rmsle_score(y, y_pred):
    """
    Compute the Root Mean Squared Log Error from log-transformed values.

    Both inputs are expected to already be on the log1p scale, i.e.
    ``log(1 + value)``.  The RMSLE of the underlying values is therefore the
    root mean squared difference of the inputs themselves.  Working directly
    in log space avoids an ``expm1`` -> ``log(x + 1)`` round trip, which
    overflows to ``inf`` (and then produces ``nan``) for large inputs and
    loses precision for small ones.

    Args:
        y - array-like containing log1p of targets
        y_pred - array-like containing log1p of predictions

    Returns:
        The RMSLE as a NumPy float (``nan`` for empty input).
    """
    # Coerce to plain arrays so the comparison is always position by position
    log_true = np.asarray(y, dtype=float)
    log_pred = np.asarray(y_pred, dtype=float)

    return np.sqrt(np.mean(np.square(log_pred - log_true)))

# Scorer handed to GridSearchCV: RMSLE is an error measure, so it is
# registered with greater_is_better=False (lower values are better)
score = make_scorer(score_func=rmsle_score, greater_is_better=False)

#### Use GridSearchCV to find the best hyperparameters

In [22]:
# Candidate values for each tuned hyperparameter (three per parameter)
parameters = {
    'learning_rate': [0.1, 0.01, 0.001],
    'min_child_weight': [0.8, 0.9, 1],
    'subsample': [0.5, 0.6, 0.7],
    'colsample_bytree': [0.3, 0.4, 0.5],
    'max_depth': [2, 4, 8],
}

# The model is fitted on log1p-transformed visitor counts
log_visitors_train = np.log1p(y_train['visitors'])

# DMatrix view of the training data; the grid search below is fitted on
# X_train directly and does not consume this object
train_matrix = xgb.DMatrix(data=X_train, label=log_visitors_train)

# Base estimator whose hyperparameters are searched
xgb_reg = xgb.XGBRegressor(tree_method='hist')

# Exhaustive search over the grid with 15-fold cross validation, scored by
# the custom RMSLE scorer and parallelised over all available cores
xgb_reg_cv = GridSearchCV(
    estimator=xgb_reg,
    param_grid=parameters,
    cv=15,
    return_train_score=True,
    n_jobs=-1,
    scoring=score,
)
xgb_reg_cv.fit(X=X_train, y=log_visitors_train)

GridSearchCV(cv=15, error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_c...
                                    validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5],
                         'learning_rate': [0.1, 0.01, 0.001],
                         'max_depth': [2, 4, 8]

In [23]:
# Persist the full cross-validation results table for later inspection
cv_results = pd.DataFrame(xgb_reg_cv.cv_results_)
cv_results.to_csv('xgb_reg_cv.csv', index=False)

In [24]:
# Display the hyperparameter combination selected by the grid search
xgb_reg_cv.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 4,
 'min_child_weight': 0.8,
 'subsample': 0.5}

In [27]:
# Hyperparameters selected by the grid search, plus fixed training settings
parameters = {
    'objective': 'reg:squarederror',  # squared-error regression objective
    'eval_metric': 'rmse',            # root mean squared error, logged each round
    'learning_rate': 0.1,
    'min_child_weight': 0.8,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'max_depth': 4,
    'tree_method': 'hist',            # histogram-based tree construction
}

# Targets are modelled on the log1p scale
log_visitors_train = np.log1p(y_train['visitors'])
log_visitors_val = np.log1p(y_val['visitors'])

# XGBoost's own data containers for the train and validation splits
train_matrix = xgb.DMatrix(data=X_train, label=log_visitors_train)
val_matrix = xgb.DMatrix(data=X_val, label=log_visitors_val)

# Sets evaluated after every boosting round; note that the validation split
# is reported under the name 'test' in the training log
watchlist = [(train_matrix, 'train'), (val_matrix, 'test')]

# Train for a fixed 1000 boosting rounds (no early stopping is configured)
model = xgb.train(
    params=parameters,
    dtrain=train_matrix,
    num_boost_round=1000,
    evals=watchlist,
)

[0]	train-rmse:2.20746	test-rmse:2.22160
[1]	train-rmse:1.99992	test-rmse:2.01122
[2]	train-rmse:1.81469	test-rmse:1.82295
[3]	train-rmse:1.65294	test-rmse:1.65983
[4]	train-rmse:1.50933	test-rmse:1.51624
[5]	train-rmse:1.38606	test-rmse:1.39090
[6]	train-rmse:1.26855	test-rmse:1.27105
[7]	train-rmse:1.16884	test-rmse:1.17151
[8]	train-rmse:1.07703	test-rmse:1.07797
[9]	train-rmse:1.00037	test-rmse:1.00162
[10]	train-rmse:0.93522	test-rmse:0.93648
[11]	train-rmse:0.87240	test-rmse:0.87187
[12]	train-rmse:0.82351	test-rmse:0.82216
[13]	train-rmse:0.77971	test-rmse:0.77863
[14]	train-rmse:0.73809	test-rmse:0.73548
[15]	train-rmse:0.70292	test-rmse:0.69758
[16]	train-rmse:0.67275	test-rmse:0.66629
[17]	train-rmse:0.65073	test-rmse:0.64441
[18]	train-rmse:0.62883	test-rmse:0.62204
[19]	train-rmse:0.61284	test-rmse:0.60700
[20]	train-rmse:0.59978	test-rmse:0.59456
[21]	train-rmse:0.58870	test-rmse:0.58287
[22]	train-rmse:0.57738	test-rmse:0.57165
[23]	train-rmse:0.56729	test-rmse:0.56118
[2

[383]	train-rmse:0.49639	test-rmse:0.51397
[384]	train-rmse:0.49636	test-rmse:0.51398
[385]	train-rmse:0.49635	test-rmse:0.51398
[386]	train-rmse:0.49633	test-rmse:0.51404
[387]	train-rmse:0.49632	test-rmse:0.51402
[388]	train-rmse:0.49628	test-rmse:0.51382
[389]	train-rmse:0.49625	test-rmse:0.51376
[390]	train-rmse:0.49624	test-rmse:0.51395
[391]	train-rmse:0.49622	test-rmse:0.51373
[392]	train-rmse:0.49620	test-rmse:0.51377
[393]	train-rmse:0.49619	test-rmse:0.51372
[394]	train-rmse:0.49618	test-rmse:0.51372
[395]	train-rmse:0.49617	test-rmse:0.51408
[396]	train-rmse:0.49613	test-rmse:0.51404
[397]	train-rmse:0.49612	test-rmse:0.51460
[398]	train-rmse:0.49609	test-rmse:0.51450
[399]	train-rmse:0.49605	test-rmse:0.51449
[400]	train-rmse:0.49603	test-rmse:0.51449
[401]	train-rmse:0.49601	test-rmse:0.51443
[402]	train-rmse:0.49599	test-rmse:0.51439
[403]	train-rmse:0.49597	test-rmse:0.51442
[404]	train-rmse:0.49595	test-rmse:0.51435
[405]	train-rmse:0.49593	test-rmse:0.51442
[406]	train

[763]	train-rmse:0.48957	test-rmse:0.51482
[764]	train-rmse:0.48956	test-rmse:0.51477
[765]	train-rmse:0.48955	test-rmse:0.51453
[766]	train-rmse:0.48955	test-rmse:0.51453
[767]	train-rmse:0.48954	test-rmse:0.51455
[768]	train-rmse:0.48953	test-rmse:0.51459
[769]	train-rmse:0.48950	test-rmse:0.51486
[770]	train-rmse:0.48949	test-rmse:0.51487
[771]	train-rmse:0.48948	test-rmse:0.51499
[772]	train-rmse:0.48945	test-rmse:0.51496
[773]	train-rmse:0.48943	test-rmse:0.51495
[774]	train-rmse:0.48943	test-rmse:0.51491
[775]	train-rmse:0.48942	test-rmse:0.51489
[776]	train-rmse:0.48941	test-rmse:0.51485
[777]	train-rmse:0.48938	test-rmse:0.51480
[778]	train-rmse:0.48937	test-rmse:0.51479
[779]	train-rmse:0.48936	test-rmse:0.51489
[780]	train-rmse:0.48935	test-rmse:0.51494
[781]	train-rmse:0.48934	test-rmse:0.51503
[782]	train-rmse:0.48933	test-rmse:0.51502
[783]	train-rmse:0.48926	test-rmse:0.51496
[784]	train-rmse:0.48926	test-rmse:0.51496
[785]	train-rmse:0.48925	test-rmse:0.51496
[786]	train

[954]	train-rmse:0.48723	test-rmse:0.51069
[955]	train-rmse:0.48722	test-rmse:0.51106
[956]	train-rmse:0.48721	test-rmse:0.51107
[957]	train-rmse:0.48720	test-rmse:0.51111
[958]	train-rmse:0.48718	test-rmse:0.51099
[959]	train-rmse:0.48717	test-rmse:0.51103
[960]	train-rmse:0.48715	test-rmse:0.51102
[961]	train-rmse:0.48713	test-rmse:0.51141
[962]	train-rmse:0.48712	test-rmse:0.51115
[963]	train-rmse:0.48711	test-rmse:0.51121
[964]	train-rmse:0.48710	test-rmse:0.51121
[965]	train-rmse:0.48709	test-rmse:0.51119
[966]	train-rmse:0.48709	test-rmse:0.51122
[967]	train-rmse:0.48707	test-rmse:0.51116
[968]	train-rmse:0.48707	test-rmse:0.51118
[969]	train-rmse:0.48706	test-rmse:0.51126
[970]	train-rmse:0.48706	test-rmse:0.51126
[971]	train-rmse:0.48706	test-rmse:0.51124
[972]	train-rmse:0.48704	test-rmse:0.51137
[973]	train-rmse:0.48704	test-rmse:0.51137
[974]	train-rmse:0.48702	test-rmse:0.51135
[975]	train-rmse:0.48701	test-rmse:0.51130
[976]	train-rmse:0.48700	test-rmse:0.51134
[977]	train

In [57]:
# Wrap the feature tables for prediction.
# NOTE: despite their names, test_matrix holds the TRAINING features and
# test_matrix1 holds the VALIDATION features.
test_matrix = xgb.DMatrix(data=X_train)
test_matrix1 = xgb.DMatrix(data=X_val)

# Model outputs for both splits (apply np.expm1 to get visitor counts)
y_pred = model.predict(test_matrix)
y_pred_val = model.predict(test_matrix1)

In [53]:
def rmsle(y_true, y_pred):
    """
    Compute the Root Mean Squared Log Error for prediction y_pred and targets y_true

    Inputs are converted to NumPy arrays first, so lists, tuples, arrays and
    pandas objects are all accepted and are compared position by position.
    ``np.log1p`` is used instead of ``np.log(x + 1)`` because it stays
    accurate when the values are very close to zero.

    Args:
        y_pred - array-like containing predictions (original scale)
        y_true - array-like containing targets (original scale)

    Returns:
        The RMSLE as a NumPy float (``nan`` for empty input or when any
        value is below -1, where the logarithm is undefined).
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    log_error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_error)))

#### Evaluation metrics for both train and validation data

In [54]:
from prettytable import PrettyTable


def _metrics_row(label, actual, log_pred):
    """Build one table row: label, RMSLE, RMSE, MSE and MAE for a data split.

    Args:
        label - text shown in the first column
        actual - true visitor counts on the original scale
        log_pred - model predictions on the log1p scale
    """
    # Back-transform once and reuse for every metric
    predicted = np.expm1(log_pred)
    squared_error = mse(actual, predicted)
    return [label,
            rmsle(y_true=actual, y_pred=predicted),
            math.sqrt(squared_error),
            squared_error,
            mae(actual, predicted)]


x = PrettyTable()

# The first column header names the model being evaluated (XGBoost)
x.field_names = ["XGBoost", "RMSLE", "RMSE", "MSE", "MAE"]

x.add_row(_metrics_row("Train data", y_train['visitors'], y_pred))
x.add_row(_metrics_row("Validation data", y_val['visitors'], y_pred_val))

print(x)
    

+-----------------------+---------------------+--------------------+--------------------+-------------------+
| RandomForestRegressor |        RMSLE        |        RMSE        |        MSE         |        MAE        |
+-----------------------+---------------------+--------------------+--------------------+-------------------+
|       Train data      | 0.48670822513080425 | 10.522259648449046 | 110.71794810937902 | 6.559753467617188 |
|    Validation data    |  0.5118067047187281 | 11.693365521506191 | 136.73479721954976 | 6.947069188295052 |
+-----------------------+---------------------+--------------------+--------------------+-------------------+


#### Save the model using pickle

In [58]:
filename = 'xgb_model.sav'

# Serialize the trained booster; the context manager guarantees the file is
# flushed and closed before it is read back
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# NOTE: pickle.load can execute arbitrary code, so only load files that were
# produced by a trusted source (here: the file written just above)
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: score the reloaded model on the validation features
result = loaded_model.predict(test_matrix1)
print(rmsle(y_true=(y_val['visitors']),
            y_pred=np.expm1(result)))

0.5118067047187281


#### Predict for the sample submission data and store the result in a .csv file for submission to Kaggle

In [36]:
# Predict on the test features
test_matrix = xgb.DMatrix(data=X_test)
y_test_pred = model.predict(test_matrix)

# Back-transform with expm1 and cast to integers.
# NOTE(review): astype(int) truncates toward zero rather than rounding, and
# the rows of X_test are assumed to be in the same order as the submission
# template - confirm both are intended.
visitors_pred = np.expm1(y_test_pred).astype(int)
sample_submission1['visitors'] = visitors_pred

# Write the Kaggle submission file
sample_submission1.to_csv("Sub_xg.csv", index=False)