#### Import necessary packages

In [26]:
import pandas as pd
import numpy as np
import math
import pickle
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

#### Load the datasets

In [2]:
# Training split: features and target table, read from the working directory.
X_train = pd.read_csv(filepath_or_buffer="X_train.csv")
y_train = pd.read_csv(filepath_or_buffer="y_train.csv")

In [3]:
# Validation split: features and target table, read from the working directory.
X_val = pd.read_csv(filepath_or_buffer="X_val.csv")
y_val = pd.read_csv(filepath_or_buffer="y_val.csv")

In [4]:
# Test split: features and target table, read from the working directory.
X_test = pd.read_csv(filepath_or_buffer="X_test.csv")
y_test = pd.read_csv(filepath_or_buffer="y_test.csv")

In [14]:
# Submission template; a 'visitors' column is written into it before it is saved.
sample_submission1 = pd.read_csv(
    "recruit-restaurant-visitor-forecasting (1)/sample_submission.csv/sample_Submission.csv"
)

In [5]:
def rmsle_score(y, y_pred):
    """
    Compute the Root Mean Squared Log Error for targets y and predictions
    y_pred that are both already in log1p space.

    Because log(expm1(v) + 1) == v, the RMSLE of the raw values equals the
    plain RMSE of the log-space inputs. Computing it directly avoids the
    expm1 -> log round trip, which overflowed to inf (and produced nan) for
    large inputs and lost precision for small ones.

    Args:
        y - array-like containing log1p of targets
        y_pred - array-like containing log1p of predictions

    Returns:
        The RMSLE as a float.
    """
    # Coerce to plain float arrays so the subtraction is positional.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.sqrt(np.mean(np.square(y_pred - y)))

# Scorer handed to GridSearchCV; RMSLE is an error, so smaller values are
# better and greater_is_better is set to False.
score = make_scorer(
    rmsle_score,
    greater_is_better=False,
)

#### Use GridSearchCV to find best parameters

In [6]:
# Base estimator. random_state is fixed so that tie-breaking between equally
# good splits, and therefore the cross-validation results, is repeatable.
dt = DecisionTreeRegressor(random_state=42)
# hyperparameters
parameters = {'max_depth': [1, 5, 10, 50],
              'min_samples_split' : [5, 10, 100, 500]}
# cross validation
dt_cv = GridSearchCV(estimator=dt,
                      param_grid=parameters,
                      scoring=score,
                      n_jobs=-1,
                      cv=3,
                      verbose=10,
                      return_train_score=True)
# The target is log1p(visitors), which is the log-space input that the
# rmsle_score scorer expects.
dt_cv.fit(X_train,np.log1p(y_train['visitors']))

# saving cv results

pd.DataFrame(dt_cv.cv_results_).to_csv('DecisionTreeRegressor_cv_result.csv')

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [7]:
# Best hyperparameter combination found by the grid search; as the last
# expression of the cell it is displayed as the cell output.
dt_cv.best_params_

{'max_depth': 10, 'min_samples_split': 500}

In [8]:
# Refit on the full training set using the parameters selected by the grid
# search, rather than values copied by hand from the previous cell's output
# (which go stale if the search is re-run). random_state keeps the fitted
# tree reproducible.
dt = DecisionTreeRegressor(**dt_cv.best_params_, random_state=42)
dt.fit(X_train,np.log1p(y_train['visitors']))

DecisionTreeRegressor(max_depth=10, min_samples_split=500)

In [10]:
# Predictions for the training and validation features. The model was fitted
# on log1p(visitors), so these values are in log space.
y_pred = dt.predict(X=X_train)
y_pred_val = dt.predict(X=X_val)

In [11]:
def rmsle(y_true, y_pred):
    """
    Compute the Root Mean Squared Log Error for prediction y_pred and targets y_true.

    Both inputs are raw (not log-transformed) values. They are converted to
    float arrays first so the comparison is positional; two pandas Series
    with different indexes would otherwise be aligned by label and yield
    NaN or a misleading score.

    Args:
        y_true - array-like containing targets
        y_pred - array-like containing predictions

    Returns:
        The RMSLE as a float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # log1p(x) is the numerically accurate form of log(x + 1) for small x.
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

#### Evaluation metrics for both train and validation data

In [12]:
from prettytable import PrettyTable


def _metrics_row(label, y_true, y_pred_log):
    """
    Build one table row: [label, RMSLE, RMSE, MSE, MAE].

    Args:
        label - text shown in the first column
        y_true - raw target values
        y_pred_log - model predictions in log1p space; converted back with expm1

    Returns:
        A list with the label followed by the four metric values.
    """
    y_hat = np.expm1(y_pred_log)
    # MSE is computed once and reused for the RMSE column.
    mse_value = mse(y_true, y_hat)
    return [label,
            rmsle(y_true=y_true, y_pred=y_hat),
            math.sqrt(mse_value),
            mse_value,
            mae(y_true, y_hat)]


x = PrettyTable()

# The evaluated model is the DecisionTreeRegressor fitted above; the header
# previously said "RandomForestRegressor".
x.field_names = ["DecisionTreeRegressor", "RMSLE", "RMSE","MSE","MAE"]

x.add_row(_metrics_row("Train data", y_train['visitors'], y_pred))
x.add_row(_metrics_row("Validation data", y_val['visitors'], y_pred_val))

print(x)

+-----------------------+--------------------+--------------------+--------------------+-------------------+
| RandomForestRegressor |       RMSLE        |        RMSE        |        MSE         |        MAE        |
+-----------------------+--------------------+--------------------+--------------------+-------------------+
|       Train data      | 0.5088185216033662 | 11.032774420773825 | 121.7221114196812  | 6.901239921532613 |
|    Validation data    | 0.5084237807042076 | 11.760736426492095 | 138.31492129341802 | 6.971809489985008 |
+-----------------------+--------------------+--------------------+--------------------+-------------------+


#### Saving the model with pickle

In [19]:
filename = 'decisiontree_model.sav'
# The context manager closes the file even if pickling raises, so the model
# file is fully flushed to disk before it is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(dt, model_file)

In [25]:
# NOTE: pickle.load can execute arbitrary code; only load model files that
# were produced by this notebook or come from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Sanity check: score the reloaded model on the validation data.
result = loaded_model.predict(X_val)
print(rmsle(y_true=(y_val['visitors']),y_pred=np.expm1(result)))

0.5084237807042076


#### Predict on the test data and save the predictions to a .csv file for submission to Kaggle

In [13]:
# Predict in log space for the test features, then map back with expm1.
y_test_pred = dt.predict(X=X_test)
# astype(int) drops the fractional part, so the file holds whole visitor counts.
sample_submission1['visitors'] = np.expm1(y_test_pred).astype(int)
sample_submission1.to_csv(path_or_buf="Sub_dt.csv", index=False)