#### Import necessary packages

In [42]:
import pandas as pd
import numpy as np
import math
import pickle
from sklearn.metrics import make_scorer
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

#### Load the datasets

In [3]:
# Training split: feature matrix and the frame holding the 'visitors' target.
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

In [4]:
# Validation split, used to evaluate the fitted model on data it was not trained on.
X_val = pd.read_csv("X_val.csv")
y_val = pd.read_csv("y_val.csv")

In [16]:
# Test split: the features here are what the final submission is predicted from.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [17]:
# Submission template; its 'visitors' column is overwritten with predictions later.
sample_submission1 = pd.read_csv(
    "recruit-restaurant-visitor-forecasting (1)/sample_submission.csv/sample_Submission.csv"
)

In [5]:
def rmsle_score(y, y_pred):
    """
    Compute the Root Mean Squared Log Error from log1p-transformed values.

    Both inputs are already in log1p space, so the RMSLE of the underlying
    raw values, sqrt(mean((log1p(pred) - log1p(true)) ** 2)), is exactly the
    RMSE of the inputs. Undoing the transform with expm1 and then taking the
    log again is a mathematical no-op that only loses precision and turns
    large inputs into inf - inf = nan, so the difference is taken directly.

    Args:
        y - array-like containing log1p of targets
        y_pred - array-like containing log1p of predictions

    Returns:
        The RMSLE as a numpy float (lower is better).
    """
    # Convert to plain arrays so the subtraction is positional, never index-aligned.
    log_true = np.asarray(y, dtype=float)
    log_pred = np.asarray(y_pred, dtype=float)

    return np.sqrt(np.mean(np.square(log_pred - log_true)))

# Scorer handed to the hyperparameter search. greater_is_better=False marks
# RMSLE as a loss, so scikit-learn negates it and reported scores are <= 0.
score = make_scorer(
    rmsle_score,
    greater_is_better=False,
)

#### Use GridSearchCV to find the best hyperparameters

In [7]:
# Hyperparameter grid: 3 * 6 * 4 * 3 = 216 candidate combinations.
parameters = {
    'learning_rate': [0.1, 0.01, 0.001],
    'min_child_weight': [0.001, 0.002, 0.05, 0.8, 0.9, 1],
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq`
    # is > 0, and the estimator below leaves it at its default -- confirm
    # that this axis actually changes the fitted models.
    'subsample': [0.5, 0.6, 0.7, 1.0],
    'colsample_bytree': [0.3, 0.4, 0.5],
}

# Base estimator; every setting not in the grid stays at the library default.
lgb_reg = LGBMRegressor()

# Exhaustive search over the grid with 15-fold cross validation, scored with
# the custom RMSLE scorer and run on all available cores.
lgb_reg_cv = GridSearchCV(
    estimator=lgb_reg,
    param_grid=parameters,
    scoring=score,
    cv=15,
    n_jobs=-1,
    return_train_score=True,
)

# Train on log1p(visitors): the scorer expects targets and predictions in log space.
lgb_reg_cv.fit(X=X_train, y=np.log1p(y_train['visitors']))

GridSearchCV(cv=15, error_score=nan,
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda=0.0,
                                     silent='wa...
                                     subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.3, 0.4, 0.5],
                         'learning_rate': [0.1, 0.01, 0.001],
                         'min_chil

In [8]:
# Persist the full cross-validation results table so it can be inspected later.
pd.DataFrame(lgb_reg_cv.cv_results_).to_csv(
    'lgb_reg_cv.csv',
    index=False,
)

In [9]:
# Display the hyperparameter combination selected by the grid search.
lgb_reg_cv.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'min_child_weight': 0.001,
 'subsample': 0.5}

In [12]:
# Fit a fresh model with the best hyperparameters reported by the grid search.
lgb_reg = LGBMRegressor(
    colsample_bytree=0.5,
    learning_rate=0.1,
    min_child_weight=0.001,
    subsample=0.5,
)

# The target is log1p(visitors), so the model's predictions are in log1p space.
lgb_reg.fit(X=X_train, y=np.log1p(y_train['visitors']))

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
              subsample=0.5, subsample_for_bin=200000, subsample_freq=0)

In [28]:
# Predictions are in log1p space; apply np.expm1 to get back to visitor counts.
y_pred = lgb_reg.predict(X_train)
y_pred_val = lgb_reg.predict(X_val)

In [30]:
def rmsle(y_true, y_pred):
    """
    Compute the Root Mean Squared Log Error for prediction y_pred and targets y_true

    Both inputs are on the raw (untransformed) scale and must be greater
    than -1 for the logarithm to be defined. Values are compared position
    by position; any pandas index on the inputs is ignored.

    Args:
        y_true - array-like containing targets
        y_pred - array-like containing predictions

    Returns:
        The RMSLE as a numpy float (lower is better).
    """
    # np.asarray lets plain lists work, not only numpy arrays / pandas Series.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # log1p(x) is accurate for small x, where log(x + 1) rounds x away entirely.
    log_error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_error)))

#### Evaluation metrics for both train and validation data

In [31]:
from prettytable import PrettyTable


def _metric_row(label, actual, log_pred):
    """Build one table row: label, RMSLE, RMSE, MSE and MAE on the visitor scale."""
    predicted = np.expm1(log_pred)  # undo the log1p transform used for training
    squared_error = mse(predicted, actual)
    return [
        label,
        rmsle(y_true=actual, y_pred=predicted),
        math.sqrt(squared_error),
        squared_error,
        mae(predicted, actual),
    ]


# Side-by-side train / validation metrics to spot over-fitting at a glance.
x = PrettyTable()
x.field_names = ["LGBMRegressor", "RMSLE", "RMSE", "MSE", "MAE"]
x.add_row(_metric_row("Train data", y_train['visitors'], y_pred))
x.add_row(_metric_row("Validation data", y_val['visitors'], y_pred_val))

print(x)

+-----------------+---------------------+--------------------+--------------------+-------------------+
|  LGBMRegressor  |        RMSLE        |        RMSE        |        MSE         |        MAE        |
+-----------------+---------------------+--------------------+--------------------+-------------------+
|    Train data   | 0.49692104517030755 | 10.800775045372182 | 116.65674158073446 | 6.737899671655249 |
| Validation data |  0.5119397271359493 | 11.557242969839622 | 133.56986506390737 | 6.877565602335381 |
+-----------------+---------------------+--------------------+--------------------+-------------------+


#### Saving the model with pickle

In [43]:
filename = 'lgb_model.sav'

# Serialize the fitted model. The context manager guarantees the file is
# flushed and closed before it is read back (the bare open() calls used
# previously never closed their handles).
with open(filename, 'wb') as model_file:
    pickle.dump(lgb_reg, model_file)

# Reload the model to verify the round trip.
# NOTE: unpickling can execute arbitrary code -- only load files written by
# this notebook or obtained from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Sanity check: the reloaded model should reproduce the validation RMSLE of lgb_reg.
result = loaded_model.predict(X_val)
print(rmsle(y_true=(y_val['visitors']),
            y_pred=np.expm1(result)))

0.5119397271359493


#### Predict for the sample submission data and store the predictions in a .csv file for submission to Kaggle

In [44]:
# Predict in log1p space, then map back to visitor counts for the submission.
y_test_pred = lgb_reg.predict(X_test)

# NOTE(review): astype(int) truncates toward zero instead of rounding --
# confirm that systematically rounding predictions down is intended.
sample_submission1['visitors'] = np.expm1(y_test_pred).astype(int)
sample_submission1.to_csv("Sub_lgb.csv", index=False)