In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
sys.path.append(module_path)

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

import xgboost as xgb  #GBM algorithm
from xgboost import XGBRegressor
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   # Perforing grid search

from IPython.display import display

from datetime import timedelta
import datetime as dt
from utils import data_utils, dataframe_util

# remove warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the train/test feature tables through the project helper and report their shapes.
train, test = data_utils.load_dataset(op_scope='6')
print('train: {}, test: {}'.format(train.shape, test.shape))

train: (1458644, 152), test: (625134, 151)


In [4]:
# Remove the raw timestamp columns from both frames (mutates them in place,
# so re-running this cell on the same frames raises because the columns are gone).
train.drop(labels=['pickup_datetime', 'dropoff_datetime'], axis='columns', inplace=True)
test.drop(labels=['pickup_datetime', 'dropoff_datetime'], axis='columns', inplace=True)

In [5]:
# Keep every 10th training row to speed up the parameter search.
# NOTE: despite the name, these positions are a fixed stride (0, 10, 20, ...), not a random draw.
# NOTE: `train` is rebound here, so re-running the cell shrinks it by another factor of 10.
random_indexs = np.arange(train.shape[0])[::10]
train = train.iloc[random_indexs]

In [6]:
# The model is fitted on log(trip_duration); pop() removes each column from its
# frame and hands it back, so the feature frames end up without target or id.
y_train_all = np.log(train.pop('trip_duration'))
train.pop('id')
id_test = test.pop('id')

In [14]:
# Alternative names for the same objects (no copies are made).
train_X, train_Y, test_X = train, y_train_all, test

In [9]:
print(' '.join(['train:', str(train.shape), ', test:', str(test.shape)]))

# Bookkeeping lists; nothing in the visible cells appends to them.
train_rmses, val_rmses, num_boost_roundses = [], [], []

# Both matrices share the training frame's column names as feature names.
df_columns = train.columns.values
X_test = test

dtrain = xgb.DMatrix(data=train, label=y_train_all, feature_names=df_columns)
dtest = xgb.DMatrix(data=X_test, feature_names=df_columns)

train: (145865, 148) , test: (625134, 148)


In [10]:
import math

def common_num_range(start, stop, step):
    """Return ``start, start + step, ...`` up to (but excluding) ``stop`` as a list.

    Behaves like ``range`` but also accepts decimal arguments. Decimal inputs are
    scaled to integers by the largest number of decimal places among the three
    arguments, enumerated with ``range`` and scaled back, so the returned values
    do not accumulate floating-point drift (e.g. ``0.7, 0.8, 0.9, 1.0``).

    Parameters
    ----------
    start, stop, step : int or float
        Bounds and increment of the grid; ``stop`` is exclusive.

    Returns
    -------
    list
        Ints when all three arguments have no decimal places, floats otherwise.
    """
    # Local import so this cell does not depend on another cell's imports.
    from decimal import Decimal

    def _decimal_places(value):
        # Digits after the decimal point in the printed form of `value`.
        # Going through Decimal also handles scientific notation such as '1e-05',
        # which contains no '.' character.
        exponent = Decimal(str(value)).as_tuple().exponent
        return max(0, -exponent)

    places = max(_decimal_places(start), _decimal_places(stop), _decimal_places(step))

    if places == 0:
        # list(...) gives the same result type on Python 2 and Python 3.
        return list(range(start, stop, step))

    # Float scale so the final division is a true division on Python 2 as well.
    scale = 10.0 ** places
    # round() rather than truncation: 0.29 * 100 is 28.999999999999996, and int()
    # alone would turn that into 28 and shift the grid by one step.
    lo, hi, stride = [int(round(bound * scale)) for bound in (start, stop, step)]
    return [num / scale for num in range(lo, hi, stride)]

## Parameter Tuning Plan
XGBoost parameters fall into three categories:

- General Parameters: Guide the overall functioning
- Booster Parameters: Guide the individual booster (tree/regression) at each step
- Learning Task Parameters: Guide the optimization performed

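For reference, the scikit-learn wrapper `XGBRegressor` exposes these parameters with the following defaults (the cells below use the native `xgb.cv` API, where `learning_rate` is called `eta`):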
```
class xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
```

### Baseline XGBRegressor

In [69]:
# Baseline booster configuration (shallow trees, max_depth=3).
xgb_params = dict(
    # learning rate and tree shape
    eta=0.01,
    max_depth=3,
    gamma=0,
    min_child_weight=1,
    # row / column sampling
    subsample=0.93,
    colsample_bytree=1,
    # regularisation and weighting
    reg_alpha=1,
    reg_lambda=1,
    scale_pos_weight=1,
    # learning task
    objective='reg:linear',
    eval_metric='rmse',
    # execution settings
    updater='grow_gpu',
    gpu_id=1,
    nthread=-1,
    silent=1,
)

In [56]:
# Cross-validate the baseline configuration; a copy of the dict is passed to xgb.cv.
cv_result = xgb.cv(
    params=dict(xgb_params),
    dtrain=dtrain,
    num_boost_round=300,
    early_stopping_rounds=50,
    verbose_eval=50,
    show_stdv=False,
)

[0]	train-rmse:5.95731	test-rmse:5.95732
[50]	train-rmse:3.62518	test-rmse:3.62525
[100]	train-rmse:2.22451	test-rmse:2.22475
[150]	train-rmse:1.39302	test-rmse:1.39353
[200]	train-rmse:0.912873	test-rmse:0.913843
[250]	train-rmse:0.648812	test-rmse:0.650737
[299]	train-rmse:0.524055	test-rmse:0.526594


In [57]:
# Report the mean train/test RMSE from the last row of the CV result table.
train_rmse_mean = cv_result['train-rmse-mean'].iloc[-1]
test_rmse_mean = cv_result['test-rmse-mean'].iloc[-1]

print("CV RMSE : train_rmse_mean = %.7g | test_rmse_mean = %.7g" % (train_rmse_mean, test_rmse_mean))

CV RMSE : train_rmse_mean = 0.5240547 | test_rmse_mean = 0.5265943


In [70]:
def fine_tune_param_cv(xgb_params, param, values,
                       num_boost_round=300, early_stopping_rounds=50, verbose_eval=50):
    """Search one booster parameter with ``xgb.cv`` and keep the best value.

    Each candidate in ``values`` is written into ``xgb_params[param]`` and
    cross-validated on the module-level ``dtrain``; the candidate whose last
    CV row has the lowest ``test-rmse-mean`` wins.

    Parameters
    ----------
    xgb_params : dict
        Booster parameters. Modified in place: on exit ``xgb_params[param]`` holds
        the best value found (or its original value if no candidate improved on it).
    param : str
        Key in ``xgb_params`` to tune; a missing key raises ``KeyError``.
    values : iterable
        Candidate values, tried in order.
    num_boost_round, early_stopping_rounds, verbose_eval : int, optional
        Forwarded to ``xgb.cv``; the defaults match the previously hard-coded settings.

    Returns
    -------
    tuple
        ``(xgb_params, best_value)`` -- callers must unpack both elements.
    """
    print(xgb_params)
    print('===> fine tuning {}...'.format(param))
    # Start from +inf so any finite RMSE is accepted, even for targets on a large scale.
    min_test_rmse_mean = float('inf')
    best_value = xgb_params[param]
    try:
        for value in values:
            print('fine tuning {} = {}'.format(param, value))
            xgb_params[param] = value
            cv_result = xgb.cv(dict(xgb_params),
                               dtrain,
                               num_boost_round=num_boost_round,
                               early_stopping_rounds=early_stopping_rounds,
                               verbose_eval=verbose_eval,
                               show_stdv=False
                               )
            train_rmse_mean = cv_result['train-rmse-mean'].values[-1]
            test_rmse_mean = cv_result['test-rmse-mean'].values[-1]
            print("CV RMSE : train_rmse_mean = %.7g | test_rmse_mean = %.7g" % (train_rmse_mean, test_rmse_mean))
            if test_rmse_mean < min_test_rmse_mean:
                min_test_rmse_mean = test_rmse_mean
                best_value = value
    finally:
        # Without this the dict would keep the LAST candidate tried (or a
        # half-evaluated one after an interrupt) instead of the best one.
        xgb_params[param] = best_value

    print('best {} = {}, min_test_rmse_mean = {}'.format(param, best_value, min_test_rmse_mean))
    return xgb_params, best_value

### Tune max_depth and min_child_weight
- min_child_weight: Controls over-fitting. Higher values prevent the model from learning relations that are highly specific to the particular sample selected for a tree, but values that are too high lead to under-fitting. Do not start with a large value: first let the model fit (even over-fit) as much as possible, then adjust this parameter.

In [72]:
# fine_tune_param_cv returns a (params, best_value) pair; assigning the whole pair
# to xgb_params would turn it into a tuple, so unpack it and store the winner.
xgb_params, best_max_depth = fine_tune_param_cv(xgb_params, 'max_depth', common_num_range(8, 12, 1))
xgb_params['max_depth'] = best_max_depth

{'reg_alpha': 1, 'eval_metric': 'rmse', 'scale_pos_weight': 1, 'gpu_id': 1, 'updater': 'grow_gpu', 'colsample_bytree': 1, 'silent': 1, 'nthread': -1, 'min_child_weight': 1, 'subsample': 0.93, 'reg_lambda': 1, 'eta': 0.01, 'objective': 'reg:linear', 'max_depth': 14, 'gamma': 0}
===> fine tuning max_depth...
fine tuning max_depth = 8
[0]	train-rmse:5.95724	test-rmse:5.95725
[50]	train-rmse:3.62117	test-rmse:3.62206
[100]	train-rmse:2.21527	test-rmse:2.21771
[150]	train-rmse:1.37623	test-rmse:1.38162
[200]	train-rmse:0.88544	test-rmse:0.896288
[250]	train-rmse:0.610251	test-rmse:0.629517
[299]	train-rmse:0.468319	test-rmse:0.49735
CV RMSE : train_rmse_mean = 0.4683187 | test_rmse_mean = 0.4973503
fine tuning max_depth = 9
[0]	train-rmse:5.95724	test-rmse:5.95725
[50]	train-rmse:3.62093	test-rmse:3.62202
[100]	train-rmse:2.21477	test-rmse:2.2178
[150]	train-rmse:1.37535	test-rmse:1.38213
[200]	train-rmse:0.883157	test-rmse:0.896964
[250]	train-rmse:0.605318	test-rmse:0.630063
[299]	train-r

In [84]:
# Configuration for the subsample search: max_depth fixed at 8.
xgb_params = dict(
    eta=0.01,
    # parameter searched in the following cell
    subsample=0.93,
    # not yet tuned
    min_child_weight=1,
    colsample_bytree=1,
    reg_alpha=1,
    reg_lambda=1,
    scale_pos_weight=1,
    # fixed
    gamma=0,
    max_depth=8,
    objective='reg:linear',
    eval_metric='rmse',
    updater='grow_gpu',
    gpu_id=1,
    nthread=-1,
    silent=1,
)

In [85]:
# Unpack the (params, best_value) pair instead of rebinding xgb_params to a tuple,
# then store the winning value.
xgb_params, best_subsample = fine_tune_param_cv(xgb_params, 'subsample', common_num_range(0.92, 0.95, 0.01))
xgb_params['subsample'] = best_subsample

{'reg_alpha': 1, 'eval_metric': 'rmse', 'scale_pos_weight': 1, 'gpu_id': 1, 'updater': 'grow_gpu', 'colsample_bytree': 1, 'silent': 1, 'nthread': -1, 'min_child_weight': 1, 'subsample': 0.93, 'reg_lambda': 1, 'eta': 0.01, 'objective': 'reg:linear', 'max_depth': 8, 'gamma': 0}
===> fine tuning subsample...
fine tuning subsample = 0.92
[0]	train-rmse:5.95731	test-rmse:5.95735
[50]	train-rmse:3.62119	test-rmse:3.62215
[100]	train-rmse:2.21527	test-rmse:2.21771
[150]	train-rmse:1.37646	test-rmse:1.38191
[200]	train-rmse:0.88574	test-rmse:0.896708
[250]	train-rmse:0.610293	test-rmse:0.629739
[299]	train-rmse:0.468328	test-rmse:0.497591
CV RMSE : train_rmse_mean = 0.468328 | test_rmse_mean = 0.497591
fine tuning subsample = 0.93
[0]	train-rmse:5.95724	test-rmse:5.95725
[50]	train-rmse:3.62117	test-rmse:3.62206
[100]	train-rmse:2.21527	test-rmse:2.21771
[150]	train-rmse:1.37623	test-rmse:1.38162
[200]	train-rmse:0.88544	test-rmse:0.896288
[250]	train-rmse:0.610251	test-rmse:0.629517
[299]	tra

In [89]:
# Configuration for the colsample_bytree search: subsample and max_depth fixed.
xgb_params = dict(
    eta=0.01,
    # not yet tuned
    min_child_weight=1,
    colsample_bytree=1,
    reg_alpha=1,
    reg_lambda=1,
    scale_pos_weight=1,
    # fixed
    subsample=0.93,
    gamma=0,
    max_depth=8,
    objective='reg:linear',
    eval_metric='rmse',
    updater='grow_gpu',
    gpu_id=1,
    nthread=-1,
    silent=1,
)

In [90]:
# Unpack the (params, best_value) pair instead of rebinding xgb_params to a tuple,
# then store the winning value.
xgb_params, best_colsample_bytree = fine_tune_param_cv(xgb_params, 'colsample_bytree', common_num_range(0.7, 1.1, 0.1))
xgb_params['colsample_bytree'] = best_colsample_bytree

{'reg_alpha': 1, 'eval_metric': 'rmse', 'scale_pos_weight': 1, 'gpu_id': 1, 'updater': 'grow_gpu', 'colsample_bytree': 1, 'silent': 1, 'nthread': -1, 'min_child_weight': 1, 'subsample': 0.93, 'reg_lambda': 1, 'eta': 0.01, 'objective': 'reg:linear', 'max_depth': 8, 'gamma': 0}
===> fine tuning colsample_bytree...
fine tuning colsample_bytree = 0.7
[0]	train-rmse:5.95726	test-rmse:5.95727
[50]	train-rmse:3.62102	test-rmse:3.6219
[100]	train-rmse:2.21556	test-rmse:2.21794
[150]	train-rmse:1.37676	test-rmse:1.38204
[200]	train-rmse:0.886347	test-rmse:0.896846
[250]	train-rmse:0.611286	test-rmse:0.62984
[299]	train-rmse:0.469332	test-rmse:0.497482
CV RMSE : train_rmse_mean = 0.469332 | test_rmse_mean = 0.497482
fine tuning colsample_bytree = 0.8
[0]	train-rmse:5.95725	test-rmse:5.95726
[50]	train-rmse:3.62097	test-rmse:3.62185
[100]	train-rmse:2.21527	test-rmse:2.21766
[150]	train-rmse:1.37639	test-rmse:1.38169
[200]	train-rmse:0.886007	test-rmse:0.896599
[250]	train-rmse:0.610806	test-rmse

In [91]:
# Configuration for the scale_pos_weight search.
xgb_params = dict(
    eta=0.01,
    # not yet tuned
    min_child_weight=1,
    reg_alpha=1,
    reg_lambda=1,
    # fixed
    scale_pos_weight=1,
    colsample_bytree=1,
    subsample=0.93,
    gamma=0,
    max_depth=8,
    objective='reg:linear',
    eval_metric='rmse',
    updater='grow_gpu',
    gpu_id=1,
    nthread=-1,
    silent=1,
)

In [92]:
# Unpack the (params, best_value) pair instead of rebinding xgb_params to a tuple,
# then store the winning value.
# NOTE(review): the recorded CV output is identical for every candidate shown, so this
# parameter appears to have no effect with this objective -- confirm before keeping the search.
xgb_params, best_scale_pos_weight = fine_tune_param_cv(xgb_params, 'scale_pos_weight', common_num_range(0.7, 1.1, 0.1))
xgb_params['scale_pos_weight'] = best_scale_pos_weight

{'reg_alpha': 1, 'eval_metric': 'rmse', 'scale_pos_weight': 1, 'gpu_id': 1, 'updater': 'grow_gpu', 'colsample_bytree': 1, 'silent': 1, 'nthread': -1, 'min_child_weight': 1, 'subsample': 0.93, 'reg_lambda': 1, 'eta': 0.01, 'objective': 'reg:linear', 'max_depth': 8, 'gamma': 0}
===> fine tuning scale_pos_weight...
fine tuning scale_pos_weight = 0.7
[0]	train-rmse:5.95724	test-rmse:5.95725
[50]	train-rmse:3.62117	test-rmse:3.62206
[100]	train-rmse:2.21527	test-rmse:2.21771
[150]	train-rmse:1.37623	test-rmse:1.38162
[200]	train-rmse:0.88544	test-rmse:0.896288
[250]	train-rmse:0.610251	test-rmse:0.629517
[299]	train-rmse:0.468319	test-rmse:0.49735
CV RMSE : train_rmse_mean = 0.4683187 | test_rmse_mean = 0.4973503
fine tuning scale_pos_weight = 0.8
[0]	train-rmse:5.95724	test-rmse:5.95725
[50]	train-rmse:3.62117	test-rmse:3.62206
[100]	train-rmse:2.21527	test-rmse:2.21771
[150]	train-rmse:1.37623	test-rmse:1.38162
[200]	train-rmse:0.88544	test-rmse:0.896288
[250]	train-rmse:0.610251	test-rms

In [101]:
# Configuration for the reg_alpha search.
xgb_params = dict(
    eta=0.01,
    # not yet tuned
    min_child_weight=1,
    reg_alpha=1,
    reg_lambda=1,
    # fixed
    scale_pos_weight=1,
    colsample_bytree=1,
    subsample=0.93,
    gamma=0,
    max_depth=8,
    objective='reg:linear',
    eval_metric='rmse',
    updater='grow_gpu',
    gpu_id=1,
    nthread=-1,
    silent=1,
)

In [102]:
# Unpack the (params, best_value) pair instead of rebinding xgb_params to a tuple,
# then store the winning value.
xgb_params, best_reg_alpha = fine_tune_param_cv(xgb_params, 'reg_alpha', common_num_range(0.005, 0.02, 0.0005))
xgb_params['reg_alpha'] = best_reg_alpha

{'reg_alpha': 1, 'eval_metric': 'rmse', 'scale_pos_weight': 1, 'gpu_id': 1, 'updater': 'grow_gpu', 'colsample_bytree': 1, 'silent': 1, 'nthread': -1, 'min_child_weight': 1, 'subsample': 0.93, 'reg_lambda': 1, 'eta': 0.01, 'objective': 'reg:linear', 'max_depth': 8, 'gamma': 0}
===> fine tuning reg_alpha...
fine tuning reg_alpha = 0.005
[0]	train-rmse:5.95724	test-rmse:5.95725
[50]	train-rmse:3.62096	test-rmse:3.62195
[100]	train-rmse:2.21433	test-rmse:2.21708
[150]	train-rmse:1.37452	test-rmse:1.3809
[200]	train-rmse:0.882233	test-rmse:0.895301
[250]	train-rmse:0.605042	test-rmse:0.62813
[299]	train-rmse:0.461715	test-rmse:0.496174
CV RMSE : train_rmse_mean = 0.461715 | test_rmse_mean = 0.496174
fine tuning reg_alpha = 0.0055
[0]	train-rmse:5.95724	test-rmse:5.95725
[50]	train-rmse:3.62097	test-rmse:3.62197
[100]	train-rmse:2.21419	test-rmse:2.21699
[150]	train-rmse:1.37453	test-rmse:1.381
[200]	train-rmse:0.882311	test-rmse:0.895372
[250]	train-rmse:0.605213	test-rmse:0.628252
[299]	tr

KeyboardInterrupt: 

In [108]:
# Configuration for the reg_lambda search: reg_alpha fixed at 0.0095.
xgb_params = dict(
    eta=0.01,
    # not yet tuned
    min_child_weight=1,
    reg_lambda=1,
    # fixed
    reg_alpha=0.0095,
    scale_pos_weight=1,
    colsample_bytree=1,
    subsample=0.93,
    gamma=0,
    max_depth=8,
    objective='reg:linear',
    eval_metric='rmse',
    updater='grow_gpu',
    gpu_id=1,
    nthread=-1,
    silent=1,
)

In [110]:
# Unpack the (params, best_value) pair instead of rebinding xgb_params to a tuple,
# then store the winning value.
xgb_params, best_reg_lambda = fine_tune_param_cv(xgb_params, 'reg_lambda', [0.005, 0.006, 0.007])
xgb_params['reg_lambda'] = best_reg_lambda

{'reg_alpha': 0.0095, 'eval_metric': 'rmse', 'scale_pos_weight': 1, 'gpu_id': 1, 'updater': 'grow_gpu', 'colsample_bytree': 1, 'silent': 1, 'nthread': -1, 'min_child_weight': 1, 'subsample': 0.93, 'reg_lambda': 0.07, 'eta': 0.01, 'objective': 'reg:linear', 'max_depth': 8, 'gamma': 0}
===> fine tuning reg_lambda...
fine tuning reg_lambda = 0.005
[0]	train-rmse:5.95717	test-rmse:5.9572
[50]	train-rmse:3.61812	test-rmse:3.61972
[100]	train-rmse:2.20966	test-rmse:2.21414
[150]	train-rmse:1.36764	test-rmse:1.37762
[200]	train-rmse:0.87304	test-rmse:0.892406
[250]	train-rmse:0.593597	test-rmse:0.626539
[299]	train-rmse:0.447602	test-rmse:0.495331
CV RMSE : train_rmse_mean = 0.447602 | test_rmse_mean = 0.4953307
fine tuning reg_lambda = 0.006
[0]	train-rmse:5.95717	test-rmse:5.95719
[50]	train-rmse:3.61789	test-rmse:3.61942
[100]	train-rmse:2.20969	test-rmse:2.21407
[150]	train-rmse:1.3678	test-rmse:1.37765
[200]	train-rmse:0.873793	test-rmse:0.892989
[250]	train-rmse:0.593534	test-rmse:0.626

In [112]:
# Configuration after the searches above: reg_lambda=0.006, reg_alpha=0.0095, max_depth=8.
xgb_params = dict(
    eta=0.01,
    min_child_weight=1,
    reg_lambda=0.006,
    reg_alpha=0.0095,
    scale_pos_weight=1,
    colsample_bytree=1,
    subsample=0.93,
    gamma=0,
    max_depth=8,
    objective='reg:linear',
    eval_metric='rmse',
    updater='grow_gpu',
    gpu_id=1,
    nthread=-1,
    silent=1,
)

In [113]:
# Cross-validate the current configuration; a copy of the dict is passed to xgb.cv.
cv_result = xgb.cv(
    params=dict(xgb_params),
    dtrain=dtrain,
    num_boost_round=300,
    early_stopping_rounds=50,
    verbose_eval=50,
    show_stdv=False,
)

[0]	train-rmse:5.95717	test-rmse:5.95719
[50]	train-rmse:3.61789	test-rmse:3.61942
[100]	train-rmse:2.20969	test-rmse:2.21407
[150]	train-rmse:1.3678	test-rmse:1.37765
[200]	train-rmse:0.873793	test-rmse:0.892989
[250]	train-rmse:0.593534	test-rmse:0.626298
[299]	train-rmse:0.447701	test-rmse:0.495103
