## Load preprocessed train and test datasets

In [7]:
from datetime import date
import pickle
import math
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.metrics import mean_absolute_error


#rng = np.random.RandomState(31337)

train = pickle.load(open("/codilime/data/train_processed_3.0.0_2017-07-07.p", "rb"))
test = pickle.load(open("/codilime/data/test_processed_2.0.0_05062017.p", "rb"))
train["y"][:12]


X = np.array(train["X"])
y = np.array(train["y"])

## Split train dataset into training and testing cross validation.

In [8]:
kf = KFold(n_splits=5, shuffle=True)
kf.get_n_splits(X)
print(kf)

KFold(n_splits=5, random_state=None, shuffle=True)


In [9]:
errors = []

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    xgb_model = xgb.XGBRegressor().fit(X_train, y_train)
    predictions = xgb_model.predict(X_test)
    print(mean_absolute_error(y_test, predictions))
    errors.append(mean_absolute_error(y_test, predictions))
    
print("Mean absolute error: " ,np.mean(errors))

TRAIN: [    1     2     3 ..., 10882 10884 10885] TEST: [    0     4     9 ..., 10873 10876 10883]
0.305189827872
TRAIN: [    0     1     3 ..., 10881 10883 10885] TEST: [    2    14    17 ..., 10879 10882 10884]
0.314064299285
TRAIN: [    0     1     2 ..., 10883 10884 10885] TEST: [    8    16    34 ..., 10857 10866 10875]
0.322290898788
TRAIN: [    0     1     2 ..., 10883 10884 10885] TEST: [    3     6     7 ..., 10868 10878 10880]
0.303604978121
TRAIN: [    0     2     3 ..., 10882 10883 10884] TEST: [    1     5    11 ..., 10872 10881 10885]
0.318672654377
Mean absolute error:  0.312764531689


## Fine tune optimum number of estimators

In [10]:
ind_params = {"max_depth": 7, "gamma": 0, "min_child_weight": 1}

# grid search
model = xgb.XGBRegressor(**ind_params)
n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, cv=kfold, verbose=1)
result = grid_search.fit(X, y)
print("Best: %f using %s" % (result.best_score_, result.best_params_))

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:  1.5min finished


Best: 0.947566 using {'n_estimators': 200}


## Fine tune optimum min_child_weight and max_depth

In [11]:
ind_params = {"gamma": 0}

# grid search
model = xgb.XGBRegressor(**ind_params)
min_child_weight = range(1,6,2)
max_depth = range(3,10,2)
param_grid = dict(min_child_weight=min_child_weight, max_depth=max_depth)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, cv=kfold, verbose=1)
result = grid_search.fit(X, y)
print("Best: %f using %s" % (result.best_score_, result.best_params_))

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   52.7s finished


Best: 0.947290 using {'max_depth': 7, 'min_child_weight': 5}


## Fine tune gamma parameter

In [12]:
ind_params = {"max_depth": 7, "n_estimators": 200, "min_child_weight": 1}

[i/10.0 for i in range(0,5)]

# grid search
model = xgb.XGBRegressor(**ind_params)
gamma = [i/10.0 for i in range(0,5)]
param_grid = dict(gamma=gamma)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, cv=kfold, verbose=1)
result = grid_search.fit(X, y)
print("Best: %f using %s" % (result.best_score_, result.best_params_))

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   54.1s finished


Best: 0.947566 using {'gamma': 0.0}


## Find optimum subsample and colsample_bytree

In [13]:
ind_params = {"max_depth": 7, "n_estimators": 200, "gamma": 0, "min_child_weight": 1}


# grid search
model = xgb.XGBRegressor(**ind_params)
subsample = [i/10.0 for i in range(6,10)]
colsample_bytree = [i/10.0 for i in range(6,10)]

param_grid = dict(subsample=subsample, colsample_bytree=colsample_bytree)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, cv=kfold, verbose=1)
result = grid_search.fit(X, y)
print("Best: %f using %s" % (result.best_score_, result.best_params_))

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  3.2min finished


Best: 0.947059 using {'colsample_bytree': 0.9, 'subsample': 0.8}


## Fine tune reg_alpha

In [14]:
ind_params = {"max_depth": 7, "n_estimators": 200, "gamma": 0, "min_child_weight": 1, 'colsample_bytree': 0.9, 'subsample': 0.8}

reg_alpha = [0, 0.001, 0.005, 0.01, 0.05]

# grid search
model = xgb.XGBRegressor(**ind_params)

param_grid = dict(reg_alpha=reg_alpha)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, cv=kfold, verbose=1)
result = grid_search.fit(X, y)
print("Best: %f using %s" % (result.best_score_, result.best_params_))

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished


Best: 0.947346 using {'reg_alpha': 0.005}


## Lower learning rate

In [16]:
ind_params = {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 5000, "gamma": 0, "min_child_weight": 1, 'colsample_bytree': 0.9, 'subsample': 0.8, 'reg_alpha': 0.005}

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    

    xgb_model = xgb.XGBRegressor(**ind_params).fit(X_train, y_train)
    
    predictions = xgb_model.predict(X_test)
    print(mean_absolute_error(y_test, predictions))
    errors.append(mean_absolute_error(y_test, predictions))
    
print("Mean absolute error: " ,np.mean(errors))

TRAIN: [    0     1     2 ..., 10883 10884 10885] TEST: [    4    15    16 ..., 10877 10878 10880]
0.230089603985
TRAIN: [    0     1     2 ..., 10881 10884 10885] TEST: [    6    10    11 ..., 10879 10882 10883]
0.229837604975
TRAIN: [    0     2     4 ..., 10882 10883 10885] TEST: [    1     3     5 ..., 10874 10881 10884]
0.235128045669
TRAIN: [    1     2     3 ..., 10883 10884 10885] TEST: [    0     8    18 ..., 10869 10873 10876]
0.236164135992
TRAIN: [    0     1     3 ..., 10882 10883 10884] TEST: [    2     7    12 ..., 10868 10875 10885]
0.228958699831
Mean absolute error:  0.265370877722


## Save predictions to a kaggle submission file

In [17]:
# Train xgboost model using the best parameters

ind_params = {"learning_rate": 0.01, "max_depth": 7, "n_estimators": 5000, "gamma": 0, "min_child_weight": 1, 'colsample_bytree': 0.9, 'subsample': 0.8, 'reg_alpha': 0.005}
xgb_model = xgb.XGBRegressor(**ind_params).fit(X,y)
predictions = xgb_model.predict(test["X"])
print(predictions[:10])

[ 2.84964871  1.4128989   1.02369332  0.85560167  0.37875053  1.75532472
  3.59586835  4.59034061  5.50044584  4.93733311]


In [22]:
df_test = pd.read_csv('/codilime/data/test.csv')
df_test[:5]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [23]:
submission_df = df_test[["datetime"]].copy() # copy columns from test dataframe

In [24]:
# Convert log(count) to count required by submission format 

predictions_non_zero = []
for value in predictions:
    value = math.exp(value)
    if value < 0:
        predictions_non_zero.append(0)
    else:
        predictions_non_zero.append(value)

submission_df["count"] = np.array(predictions_non_zero)

submission_df[:10]

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,17.28171
1,2011-01-20 01:00:00,4.107846
2,2011-01-20 02:00:00,2.783456
3,2011-01-20 03:00:00,2.35279
4,2011-01-20 04:00:00,1.460459
5,2011-01-20 05:00:00,5.785326
6,2011-01-20 06:00:00,36.447335
7,2011-01-20 07:00:00,98.527984
8,2011-01-20 08:00:00,244.801051
9,2011-01-20 09:00:00,139.397994


In [25]:
filename = '/codilime/data/submission_4.0.0_{}.csv'.format(date.today())
submission_df.to_csv(filename, index=False) # save results to a submission file

Results: Kaggle score: 0.40737