In [1]:
from time import time
import datetime
from operator import itemgetter
import csv

import utils
import data_utils

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation as cv
from sklearn.grid_search import RandomizedSearchCV

from scipy.stats import randint as sp_randint

In [2]:
train, test, test_ids = data_utils.load_transformed_data()
X_train, y_train = data_utils.get_raw_values(train)

In [8]:
rfr_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1, 'oob_score': False, 'bootstrap': False, 'min_samples_leaf': 6, 'min_samples_split': 40, 'max_features': 13, 'max_depth': 54}
rfr_params = {'n_estimators': 15, 'random_state': 42, 'n_jobs': -1}

In [7]:
model = RandomForestRegressor(**rfr_params)
folds = cv.KFold(n=len(y_train), n_folds=5, shuffle=True, random_state=42)
scores = cv.cross_val_score(model, X_train, y_train, scoring=utils.rmspe_scorer, cv=folds, n_jobs=-1)
print(scores)
print(scores.mean())

[-0.13866179 -0.25888405 -0.13056529 -0.21951262 -0.14487902]
-0.178500554819


In [10]:
train.dtypes

Store                    int64
DayOfWeek                int64
Sales                  float64
Open                     int64
Promo                    int64
SchoolHoliday            int64
woy                      int64
month                    int64
Seasonal_4_sin         float64
StateHoliday_0         float64
StateHoliday_a         float64
StateHoliday_b         float64
StateHoliday_c         float64
CompetitionDistance    float64
StoreType_a            float64
StoreType_b            float64
StoreType_c            float64
StoreType_d            float64
Assortment_a           float64
Assortment_b           float64
Assortment_c           float64
Sales_mean             float64
dtype: object

In [3]:
# Utility function to report best scores
def report(grid_scores, n_top=20):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print(score.cv_validation_scores)
        print("Mean validation score: {0:.10f} (std: {1:.10f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print(score.parameters)
        print("")

In [4]:
n_features = X_train.shape[1]
model = RandomForestRegressor(n_estimators=15, random_state=42, n_jobs=-1)
# specify parameters and distributions to sample from
param_dist = {"max_depth": sp_randint(1, 3*n_features),
              "max_features": sp_randint(1, n_features),
              "min_samples_split": sp_randint(1, 3*n_features),
              "min_samples_leaf": sp_randint(1, 3*n_features),
              "bootstrap": [True, False],
              "oob_score": [True, False]
             }

# run randomized search
n_iter_search = 120
folds = cv.KFold(n=len(y_train), n_folds=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(model,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=folds,
                                   n_jobs=-1,
                                   scoring=utils.rmspe_scorer,
                                   iid=False,
                                   error_score=-99.99
                                  )
start = time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

RandomizedSearchCV took 6203.30 seconds for 120 candidates parameter settings.


ValueError('Out of bag estimation only available if bootstrap=True',)
ValueError('Out of bag estimation only available if bootstrap=True',)
ValueError('Out of bag estimation only available if bootstrap=True',)
ValueError('Out of bag estimation only available if bootstrap=True',)
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


In [5]:
report(random_search.grid_scores_)

Model with rank: 1
[-0.1444515  -0.15934088 -0.11348138 -0.11372803 -0.12519782]
Mean validation score: -0.1312399203 (std: 0.0180152656)
Parameters: {'oob_score': False, 'bootstrap': False, 'min_samples_leaf': 6, 'min_samples_split': 40, 'max_features': 13, 'max_depth': 54}
{'oob_score': False, 'bootstrap': False, 'min_samples_leaf': 6, 'min_samples_split': 40, 'max_features': 13, 'max_depth': 54}

Model with rank: 2
[-0.15546693 -0.12603342 -0.13571223 -0.24052428 -0.12645563]
Mean validation score: -0.1568384985 (std: 0.0431823627)
Parameters: {'oob_score': True, 'bootstrap': True, 'min_samples_leaf': 11, 'min_samples_split': 44, 'max_features': 13, 'max_depth': 66}
{'oob_score': True, 'bootstrap': True, 'min_samples_leaf': 11, 'min_samples_split': 44, 'max_features': 13, 'max_depth': 66}

Model with rank: 3
[-0.18844583 -0.13339544 -0.13829624 -0.24388025 -0.12644235]
Mean validation score: -0.1660920213 (std: 0.0446418404)
Parameters: {'oob_score': False, 'bootstrap': True, 'min_s