#### Import necessary packages

In [73]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
import pickle

#### Load the datasets

In [4]:
# Training split: feature matrix and target table (the 'visitors' column is
# the regression target used further down).
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

In [19]:
# Validation split: features and targets, used only for evaluation below.
X_val = pd.read_csv("X_val.csv")
y_val = pd.read_csv("y_val.csv")

In [20]:
# Test split: X_test is what the final submission is predicted from.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [67]:
# Submission template; its 'visitors' column is overwritten with the model's
# predictions at the end of the notebook.
sample_submission1 = pd.read_csv(
    "recruit-restaurant-visitor-forecasting/sample_submission.csv/sample_Submission.csv"
)

In [21]:
def rmsle_score(y, y_pred):
    """
    Compute the Root Mean Squared Log Error from log-space targets and predictions.

    Both inputs are expected to already be ``log1p``-transformed, so the RMSLE
    of the underlying values is simply the RMSE of the two arrays as given.

    Args:
        y - array-like containing log1p of the targets
        y_pred - array-like containing log1p of the predictions

    Returns:
        The RMSLE as a numpy float.
    """
    log_true = np.asarray(y, dtype=float)
    log_pred = np.asarray(y_pred, dtype=float)

    # log(expm1(v) + 1) == v, so the anti-log / re-log round trip is skipped:
    # it only lost precision and overflowed to inf - inf = nan for large v.
    return np.sqrt(np.mean(np.square(log_pred - log_true)))

# Scorer handed to the hyperparameter search. RMSLE is an error (lower is
# better), hence greater_is_better=False so the search ranks candidates
# correctly.
score = make_scorer(score_func=rmsle_score, greater_is_better=False)

#### Use HalvingGridSearchCV to find the best hyperparameters

In [59]:
# Candidate hyperparameters: 4 * 6 * 5 = 120 combinations.
params = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[1, 5, 10, 50, 100, 150],
    min_samples_split=[2, 3, 4, 5, 6],
)
model = RandomForestRegressor()

# Successive-halving grid search, 5-fold CV, scored with the custom RMSLE
# scorer and run on all available cores.
grid_kn = HalvingGridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
    scoring=score,
    verbose=1,
    n_jobs=-1,
)

# The forest is trained on log1p(visitors), so predictions are in log space.
grid_kn.fit(X_train, np.log1p(y_train['visitors']))

# Keep the full per-candidate CV results for later inspection.
pd.DataFrame(grid_kn.cv_results_).to_csv('RandomForestRegressor_cv_result.csv')

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 2908
max_resources_: 235552
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 120
n_resources: 2908
Fitting 5 folds for each of 120 candidates, totalling 600 fits
----------
iter: 1
n_candidates: 40
n_resources: 8724
Fitting 5 folds for each of 40 candidates, totalling 200 fits
----------
iter: 2
n_candidates: 14
n_resources: 26172
Fitting 5 folds for each of 14 candidates, totalling 70 fits
----------
iter: 3
n_candidates: 5
n_resources: 78516
Fitting 5 folds for each of 5 candidates, totalling 25 fits
----------
iter: 4
n_candidates: 2
n_resources: 235548
Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [60]:
# Best hyperparameter combination found by the search; the fitted
# HalvingGridSearchCV object in this notebook is named `grid_kn`.
grid_kn.best_params_

{'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 200}

In [61]:
# Refit a forest with the hyperparameters reported by the search, again on
# the log1p-transformed visitor counts.
model = RandomForestRegressor(n_estimators=200, max_depth=5, min_samples_split=5)
model.fit(X_train, np.log1p(y_train['visitors']))

RandomForestRegressor(max_depth=5, min_samples_split=5, n_estimators=200)

In [62]:
# Training-set predictions; in log1p space because the model was fit on
# log1p(visitors).
y_pred = model.predict(X_train)

In [63]:
# Validation-set predictions; in log1p space because the model was fit on
# log1p(visitors).
y_pred_val = model.predict(X_val)

In [47]:
def rmsle(y_true, y_pred):
    """
    Compute the Root Mean Squared Log Error for prediction y_pred and targets y_true

    Inputs are on the original (non-log) scale and are compared element by
    element in the order given.

    Args:
        y_true - array-like containing targets
        y_pred - array-like containing predictions

    Returns:
        The RMSLE as a numpy float.
    """
    # Coerce to plain arrays so lists are accepted and pandas Series are
    # compared positionally instead of being aligned on their index labels.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # log1p(v) is log(v + 1) without the precision loss for very small v.
    return np.sqrt(np.mean(np.square(np.log1p(predicted) - np.log1p(actual))))

#### Evaluation metrics for both train and validation data

In [64]:
from prettytable import PrettyTable


def _metric_row(label, y_true, y_pred_log):
    """
    Build one table row: [label, RMSLE, RMSE, MSE, MAE].

    Args:
        label - text for the first column
        y_true - targets on the original scale
        y_pred_log - model predictions in log1p space
    """
    # Bring predictions back to the original scale once and reuse them.
    y_hat = np.expm1(y_pred_log)
    # MSE is computed once; RMSE is derived from it.
    squared_error = mse(y_true, y_hat)
    return [
        label,
        rmsle(y_true=y_true, y_pred=y_hat),
        math.sqrt(squared_error),
        squared_error,
        mae(y_true, y_hat),
    ]


x = PrettyTable()

x.field_names = ["RandomForestRegressor", "RMSLE", "RMSE", "MSE", "MAE"]

x.add_row(_metric_row("Train data", y_train['visitors'], y_pred))
x.add_row(_metric_row("Validation data", y_val['visitors'], y_pred_val))

print(x)
    

+-----------------------+--------------------+--------------------+--------------------+-------------------+
| RandomForestRegressor |       RMSLE        |        RMSE        |        MSE         |        MAE        |
+-----------------------+--------------------+--------------------+--------------------+-------------------+
|       Train data      | 0.5193416015185925 | 11.182838291255965 | 125.05587224838064 | 7.065978285033292 |
|    Validation data    | 0.5137078160685651 | 11.837433644103967 | 140.12483527856455 | 7.060547736492576 |
+-----------------------+--------------------+--------------------+--------------------+-------------------+


#### Saving the model using pickle

In [69]:
filename = 'rf_model.sav'

# Context managers make sure the file is flushed and closed before it is
# read back, and that no handle is leaked.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Round-trip check: reload the file that was just written (a trusted,
# locally produced pickle) and confirm the validation RMSLE is unchanged.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

result = loaded_model.predict(X_val)
print(rmsle(y_true=(y_val['visitors']), y_pred=np.expm1(result)))

0.5137078160685651


#### Predict on the test data and store the result in a .csv file for submission to Kaggle

In [24]:
# Predict in log1p space, map back with expm1, and write the counts into the
# submission template.
# NOTE(review): astype(int) truncates toward zero rather than rounding --
# confirm this is intended.
y_test_pred = model.predict(X_test)
sample_submission1['visitors'] = np.expm1(y_test_pred).astype(int)
sample_submission1.to_csv("Sub_rf.csv", index=False)