In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
sys.path.append(module_path)

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

import lightgbm as lgbm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   # Perforing grid search

from IPython.display import display

from datetime import timedelta
import datetime as dt
from utils import data_utils, dataframe_util

# remove warnings
import warnings
warnings.filterwarnings('ignore')



In [34]:
# Load the pre-built feature frames; the argument to load_dataset is the
# "op_scope" reported in the log line (7 here).
print('load dataset from op_scope = {}'.format(7))
train, test = data_utils.load_dataset(7)
train.drop(['pickup_datetime', 'dropoff_datetime'], axis=1, inplace=True)
test.drop(['pickup_datetime', 'dropoff_datetime'], axis=1, inplace=True)

# Keep every 100th training row to make the CV sweeps fast. Despite the name,
# the selection is deterministic (a fixed stride), not random.
random_indexs = np.arange(0, train.shape[0], 100)
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of the full frame (pandas' chained-assignment hazard, which
# the global warnings filter would otherwise hide).
train = train.iloc[random_indexs, :].copy()

# The model is trained on log(trip_duration); split the target off the features.
train['trip_duration'] = np.log(train['trip_duration'])
y_train_all = train['trip_duration']
del train['trip_duration']

# Keep the original test ids before the column is converted below.
id_test = test['id']

# 'id' stays in both frames as a numeric column: the first two characters are
# stripped and the remainder parsed as an int.
# NOTE(review): this means 'id' is fed to the model as a feature -- confirm intended.
train['id'] = train['id'].map(lambda i: int(i[2:]))
test['id'] = test['id'].map(lambda i: int(i[2:]))

print('train: {} , test: {}'.format(train.shape, test.shape))

load dataset from op_scope = 7
train: (14587, 98) , test: (625134, 98)


In [35]:
# Wrap the sampled training features and the log-duration target in a LightGBM
# Dataset; the cross-validation calls further down read this module-level object.
d_train = lgbm.Dataset(train, label=y_train_all)

In [36]:
def lgb_rmsle_score(preds, dtrain):
    """Custom LightGBM evaluation function: RMSLE on the exponentiated scale.

    The labels stored in ``dtrain`` and the predictions are both passed
    through ``exp`` (predictions are floored at 0 first), and the root mean
    squared difference of their ``log1p`` values is reported.

    Returns the tuple ``('rmsle', score, False)``; the trailing ``False``
    marks the metric as lower-is-better.
    """
    actual = np.exp(dtrain.get_label())
    floored = preds.clip(min=0)
    predicted = np.exp(floored)
    log_gap = np.log1p(predicted) - np.log1p(actual)
    score = np.sqrt(np.mean(np.square(log_gap)))
    return 'rmsle', score, False


In [37]:
import math
from decimal import Decimal


def _decimal_places(value):
    """Return how many digits follow the decimal point in ``str(value)``.

    Parsing through ``Decimal`` also covers scientific notation such as
    ``1e-05`` (5 places), whose string form contains no '.' character.
    """
    exponent = Decimal(str(value)).as_tuple().exponent
    return max(0, -exponent)


def common_num_range(start, stop, step):
    """``range`` that also accepts decimal arguments.

    The three arguments are scaled by a common power of ten so that they
    become integers, an integer ``range`` is generated, and the values are
    scaled back. As with ``range``, ``stop`` is exclusive.

    Parameters
    ----------
    start, stop, step : int or float
        Bounds and increment of the sequence.

    Returns
    -------
    The result of ``range(start, stop, step)`` when no argument is a float
    and none has decimal places; otherwise a list of floats.

    Example
    -------
    ``common_num_range(0.6, 0.76, 0.05)`` -> ``[0.6, 0.65, 0.7, 0.75]``
    """
    args = (start, stop, step)
    maxlen = max(_decimal_places(arg) for arg in args)

    # Pure integer input: defer to the built-in range unchanged.
    if maxlen == 0 and not any(isinstance(arg, float) for arg in args):
        return range(start, stop, step)

    power = math.pow(10, maxlen)

    def scaled(number):
        # round() before int(): products such as 0.29 * 100 evaluate to
        # 28.999999999999996, which plain int() would truncate to 28.
        return int(round(number * power))

    return [num / power for num in range(scaled(start), scaled(stop), scaled(step))]

## Baseline LightGBM

In [40]:
# Baseline LightGBM configuration, assembled group by group.
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Tree shape and step size ('max_bin' and 'subsample_for_bin' are not set here).
lgbm_params.update({
    'num_leaves': 2 ** 4,
    'learning_rate': 0.05,
    'max_depth': -1,
})

# Row and column sampling.
lgbm_params.update({
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
})

# Regularisation and split constraints.
lgbm_params.update({
    'reg_alpha': 1,
    'reg_lambda': 0,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
})

# Training control and reporting; 'rmsle' is the name returned by the custom eval function.
lgbm_params.update({
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [41]:
# Baseline: 3-fold cross-validation of the parameters above, scored with the
# custom RMSLE eval function; verbose_eval=50 logs every 50th round.
# NOTE(review): early_stopping_rounds (300) is larger than num_boost_round
# (50), so this argument cannot end training early, and lgbm_params also
# carries 'early_stopping_round': 20 -- confirm which setting is intended.
cv_results = lgbm.cv(lgbm_params,
                     d_train,
                     num_boost_round=50,
                     nfold=3,
                     feval=lgb_rmsle_score,
                     early_stopping_rounds=300,
                     verbose_eval=50)

[50]	cv_agg's rmsle: 0.427718 + 0.00978353


## Parameter Fine-Tuning

In [50]:
def fine_tune_param_cv(lgbm_params, param, values):
    """Grid-search one LightGBM parameter with 3-fold cross-validation.

    Every candidate in ``values`` is evaluated with ``lgbm.cv`` on the
    module-level ``d_train`` using ``lgb_rmsle_score``; the candidate whose
    final 'rmsle-mean' entry is lowest wins.

    Parameters
    ----------
    lgbm_params : dict
        Base parameters. Candidates are tried on a copy, so an interrupted or
        failing sweep leaves this dict untouched. After a completed sweep in
        which at least one candidate was scored, ``lgbm_params[param]`` is set
        to the best value (previously it was left at the last value tried).
    param : str
        Name of the parameter to tune; it does not have to be present in
        ``lgbm_params`` beforehand.
    values : iterable
        Candidate values, tried in order.

    Returns
    -------
    (dict, object)
        The same ``lgbm_params`` object and the best value found. With no
        candidates, the best value is the one already in the dict (``None``
        if the parameter was absent).
    """
    print(lgbm_params)
    print('===> fine tuning {}...'.format(param))
    # inf instead of an arbitrary sentinel, so any finite score can win.
    min_test_rmse_mean = float('inf')
    best_value = lgbm_params.get(param)
    found_better = False
    for value in values:
        print('fine tuning {} = {}'.format(param, value))
        # Work on a shallow copy so the caller's dict never holds a
        # half-evaluated candidate.
        trial_params = dict(lgbm_params)
        trial_params[param] = value
        cv_results = lgbm.cv(trial_params,
                             d_train,
                             num_boost_round=50,
                             nfold=3,
                             feval=lgb_rmsle_score,
                             early_stopping_rounds=300,
                             verbose_eval=50)

        # Score of the last boosting round that was run.
        test_rmse_mean = cv_results['rmsle-mean'][-1]
        print("CV RMSE : test_rmse_mean = %.7g" % (test_rmse_mean))
        if test_rmse_mean < min_test_rmse_mean:
            min_test_rmse_mean = test_rmse_mean
            best_value = value
            found_better = True

    if found_better:
        lgbm_params[param] = best_value

    print('best {} = {}, min_test_rmse_mean = {}'.format(param, best_value, min_test_rmse_mean))
    return lgbm_params, best_value

In [54]:
# Starting point for the sweeps: larger trees than the baseline
# (64 leaves, depth capped at 10); everything else as in the baseline.
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Tree shape and step size ('max_bin' and 'subsample_for_bin' are not set here).
lgbm_params.update({
    'num_leaves': 2 ** 6,
    'learning_rate': 0.05,
    'max_depth': 10,
})

# Row and column sampling.
lgbm_params.update({
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
})

# Regularisation and split constraints.
lgbm_params.update({
    'reg_alpha': 1,
    'reg_lambda': 0,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
})

# Training control and reporting.
lgbm_params.update({
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [55]:
# Sweep 'subsample'; the returned (params, best_value) tuple is discarded and
# the result is read from the printed log instead.
# NOTE(review): the recorded output below lists candidates 0.8 and 0.85, which
# the grid (0.6, 0.76, 0.05) does not produce, and ends in a KeyboardInterrupt
# -- it presumably comes from an earlier run with a wider grid.
_ = fine_tune_param_cv(lgbm_params, 'subsample', common_num_range(0.6, 0.76, 0.05))

{'num_leaves': 64, 'reg_alpha': 1, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'early_stopping_round': 20, 'scale_pos_weight': 1, 'learning_rate': 0.05, 'nthread': -1, 'min_child_weight': 1, 'min_split_gain': 0.5, 'subsample': 0.8, 'reg_lambda': 0, 'objective': 'regression', 'verbose': 0, 'min_child_samples': 10, 'max_depth': 10, 'metric': 'rmsle', 'boosting_type': 'gbdt'}
===> fine tuning subsample...
fine tuning subsample = 0.6
[50]	cv_agg's rmsle: 0.418398 + 0.00926433
CV RMSE : test_rmse_mean = 0.4183985
fine tuning subsample = 0.65
[50]	cv_agg's rmsle: 0.41858 + 0.0107418
CV RMSE : test_rmse_mean = 0.4185804
fine tuning subsample = 0.7
[50]	cv_agg's rmsle: 0.418276 + 0.010378
CV RMSE : test_rmse_mean = 0.4182758
fine tuning subsample = 0.75
[50]	cv_agg's rmsle: 0.416846 + 0.0102225
CV RMSE : test_rmse_mean = 0.4168464
fine tuning subsample = 0.8
[50]	cv_agg's rmsle: 0.419552 + 0.0100412
CV RMSE : test_rmse_mean = 0.419552
fine tuning subsample = 0.85


KeyboardInterrupt: 

In [60]:
# Parameters for the 'colsample_bytree' sweep. 'subsample' is now fixed at
# 0.75, the lowest-scoring candidate in the recorded 'subsample' output.
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Row sampling (already tuned).
lgbm_params.update({
    'subsample': 0.75,
    'subsample_freq': 1,
})

# Tree shape and step size ('max_bin' and 'subsample_for_bin' are not set here).
lgbm_params.update({
    'num_leaves': 2 ** 6,
    'learning_rate': 0.05,
    'max_depth': 10,
})

# Column sampling, regularisation and split constraints.
lgbm_params.update({
    'colsample_bytree': 0.6,
    'reg_alpha': 1,
    'reg_lambda': 0,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
})

# Training control and reporting.
lgbm_params.update({
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [61]:
# Sweep 'colsample_bytree'; with an exclusive stop of 0.7 the grid is 0.6 and
# 0.65, matching the two candidates in the recorded output.
_ = fine_tune_param_cv(lgbm_params, 'colsample_bytree', common_num_range(0.6, 0.7, 0.05))

{'num_leaves': 64, 'reg_alpha': 1, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'early_stopping_round': 20, 'scale_pos_weight': 1, 'learning_rate': 0.05, 'nthread': -1, 'min_child_weight': 1, 'min_split_gain': 0.5, 'subsample': 0.75, 'reg_lambda': 0, 'objective': 'regression', 'verbose': 0, 'min_child_samples': 10, 'max_depth': 10, 'metric': 'rmsle', 'boosting_type': 'gbdt'}
===> fine tuning colsample_bytree...
fine tuning colsample_bytree = 0.6
[50]	cv_agg's rmsle: 0.416846 + 0.0102225
CV RMSE : test_rmse_mean = 0.4168464
fine tuning colsample_bytree = 0.65
[50]	cv_agg's rmsle: 0.417083 + 0.0109581
CV RMSE : test_rmse_mean = 0.4170834
best colsample_bytree = 0.6, min_test_rmse_mean = 0.416846391977


In [62]:
# Parameters for the 'max_depth' sweep; the sampling parameters are fixed at
# the values chosen so far (subsample 0.75, colsample_bytree 0.6).
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Row and column sampling (already tuned).
lgbm_params.update({
    'subsample': 0.75,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
})

# Tree shape and step size ('max_bin' and 'subsample_for_bin' are not set here).
lgbm_params.update({
    'num_leaves': 2 ** 6,
    'learning_rate': 0.05,
    'max_depth': 10,
})

# Regularisation and split constraints.
lgbm_params.update({
    'reg_alpha': 1,
    'reg_lambda': 0,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
})

# Training control and reporting.
lgbm_params.update({
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [63]:
# Sweep 'max_depth' around the current value of 10; the result is read from
# the printed log, so the return value is discarded.
_ = fine_tune_param_cv(lgbm_params, 'max_depth', [8,10,12,14])

{'num_leaves': 64, 'reg_alpha': 1, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'early_stopping_round': 20, 'scale_pos_weight': 1, 'learning_rate': 0.05, 'nthread': -1, 'min_child_weight': 1, 'min_split_gain': 0.5, 'subsample': 0.75, 'reg_lambda': 0, 'objective': 'regression', 'verbose': 0, 'min_child_samples': 10, 'max_depth': 10, 'metric': 'rmsle', 'boosting_type': 'gbdt'}
===> fine tuning max_depth...
fine tuning max_depth = 8
[50]	cv_agg's rmsle: 0.417636 + 0.0107109
CV RMSE : test_rmse_mean = 0.4176359
fine tuning max_depth = 10
[50]	cv_agg's rmsle: 0.416846 + 0.0102225
CV RMSE : test_rmse_mean = 0.4168464
fine tuning max_depth = 12
[50]	cv_agg's rmsle: 0.417239 + 0.0103711
CV RMSE : test_rmse_mean = 0.4172391
fine tuning max_depth = 14
[50]	cv_agg's rmsle: 0.417241 + 0.0100959
CV RMSE : test_rmse_mean = 0.4172415
best max_depth = 10, min_test_rmse_mean = 0.416846391977


In [64]:
# Parameters for the first 'num_leaves' sweep, run with max_depth = 10
# (the best depth in the recorded 'max_depth' output).
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Row and column sampling (already tuned).
lgbm_params.update({
    'subsample': 0.75,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
})

# Tree shape and step size ('max_bin' and 'subsample_for_bin' are not set here).
lgbm_params.update({
    'num_leaves': 2 ** 6,
    'learning_rate': 0.05,
    'max_depth': 10,
})

# Regularisation and split constraints.
lgbm_params.update({
    'reg_alpha': 1,
    'reg_lambda': 0,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
})

# Training control and reporting.
lgbm_params.update({
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [65]:
# Sweep 'num_leaves' over powers of two (64 to 512) at max_depth = 10.
# NOTE(review): the recorded output shows identical scores for 256 and 512 --
# presumably the depth cap limits tree size before the leaf budget does; verify.
_ = fine_tune_param_cv(lgbm_params, 'num_leaves', [2 ** 6, 2 ** 7, 2 ** 8, 2 ** 9])

{'num_leaves': 64, 'reg_alpha': 1, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'early_stopping_round': 20, 'scale_pos_weight': 1, 'learning_rate': 0.05, 'nthread': -1, 'min_child_weight': 1, 'min_split_gain': 0.5, 'subsample': 0.75, 'reg_lambda': 0, 'objective': 'regression', 'verbose': 0, 'min_child_samples': 10, 'max_depth': 10, 'metric': 'rmsle', 'boosting_type': 'gbdt'}
===> fine tuning num_leaves...
fine tuning num_leaves = 64
[50]	cv_agg's rmsle: 0.416846 + 0.0102225
CV RMSE : test_rmse_mean = 0.4168464
fine tuning num_leaves = 128
[50]	cv_agg's rmsle: 0.417186 + 0.0100136
CV RMSE : test_rmse_mean = 0.4171859
fine tuning num_leaves = 256
[50]	cv_agg's rmsle: 0.417506 + 0.00922725
CV RMSE : test_rmse_mean = 0.4175057
fine tuning num_leaves = 512
[50]	cv_agg's rmsle: 0.417506 + 0.00922725
CV RMSE : test_rmse_mean = 0.4175057
best num_leaves = 64, min_test_rmse_mean = 0.416846391977


In [68]:
# Parameters for the second 'num_leaves' sweep: identical to the previous
# configuration except that max_depth is raised to 14.
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Row and column sampling (already tuned).
lgbm_params.update({
    'subsample': 0.75,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
})

# Tree shape and step size ('max_bin' and 'subsample_for_bin' are not set here).
lgbm_params.update({
    'num_leaves': 2 ** 6,
    'learning_rate': 0.05,
    'max_depth': 14,
})

# Regularisation and split constraints.
lgbm_params.update({
    'reg_alpha': 1,
    'reg_lambda': 0,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
})

# Training control and reporting.
lgbm_params.update({
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [69]:
# Repeat the 'num_leaves' sweep (64 to 512) with max_depth = 14 to check
# whether deeper trees change the preferred leaf count.
_ = fine_tune_param_cv(lgbm_params, 'num_leaves', [2 ** 6, 2 ** 7, 2 ** 8, 2 ** 9])

{'num_leaves': 64, 'reg_alpha': 1, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'early_stopping_round': 20, 'scale_pos_weight': 1, 'learning_rate': 0.05, 'nthread': -1, 'min_child_weight': 1, 'min_split_gain': 0.5, 'subsample': 0.75, 'reg_lambda': 0, 'objective': 'regression', 'verbose': 0, 'min_child_samples': 10, 'max_depth': 14, 'metric': 'rmsle', 'boosting_type': 'gbdt'}
===> fine tuning num_leaves...
fine tuning num_leaves = 64
[50]	cv_agg's rmsle: 0.417241 + 0.0100959
CV RMSE : test_rmse_mean = 0.4172415
fine tuning num_leaves = 128
[50]	cv_agg's rmsle: 0.417514 + 0.0102657
CV RMSE : test_rmse_mean = 0.4175136
fine tuning num_leaves = 256
[50]	cv_agg's rmsle: 0.417219 + 0.0111645
CV RMSE : test_rmse_mean = 0.4172192
fine tuning num_leaves = 512
[50]	cv_agg's rmsle: 0.417219 + 0.0111645
CV RMSE : test_rmse_mean = 0.4172192
best num_leaves = 256, min_test_rmse_mean = 0.417219162206


In [79]:
# Parameters for the 'min_split_gain' sweep: back to max_depth = 10 with
# 64 leaves, the best-scoring combination in the recorded outputs above.
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Row and column sampling (already tuned).
lgbm_params.update({
    'subsample': 0.75,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
})

# Tree shape and step size.
lgbm_params.update({
    'num_leaves': 2 ** 6,
    'learning_rate': 0.05,
    'max_depth': 10,
})

# Regularisation and split constraints.
lgbm_params.update({
    'reg_alpha': 1,
    'reg_lambda': 0,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
})

# Training control and reporting.
lgbm_params.update({
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [80]:
# Sweep 'min_split_gain' over two values below the current setting of 0.5;
# the return value is discarded and the result is read from the printed log.
_ = fine_tune_param_cv(lgbm_params, 'min_split_gain', [0.3, 0.4])

{'num_leaves': 64, 'reg_alpha': 1, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'early_stopping_round': 20, 'scale_pos_weight': 1, 'learning_rate': 0.05, 'nthread': -1, 'min_child_weight': 1, 'min_split_gain': 0.5, 'subsample': 0.75, 'reg_lambda': 0, 'objective': 'regression', 'verbose': 0, 'min_child_samples': 10, 'max_depth': 10, 'metric': 'rmsle', 'boosting_type': 'gbdt'}
===> fine tuning min_split_gain...
fine tuning min_split_gain = 0.3
[50]	cv_agg's rmsle: 0.41662 + 0.00985629
CV RMSE : test_rmse_mean = 0.4166198
fine tuning min_split_gain = 0.4
[50]	cv_agg's rmsle: 0.416193 + 0.00968524
CV RMSE : test_rmse_mean = 0.4161932
best min_split_gain = 0.4, min_test_rmse_mean = 0.416193170302


In [88]:
# Parameters for the 'reg_lambda' sweep. 'min_split_gain' is fixed at 0.4
# (best in the recorded sweep output), 'reg_alpha' is lowered to 0.1, and
# 'min_child_weight' / 'min_child_samples' are no longer set.
lgbm_params = {}

# Task definition and threading.
lgbm_params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
})

# Sampling and split threshold (already tuned).
lgbm_params.update({
    'subsample': 0.75,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    'min_split_gain': 0.4,
})

# Tree shape and step size.
lgbm_params.update({
    'num_leaves': 2 ** 6,
    'learning_rate': 0.05,
    'max_depth': 10,
})

# L1 / L2 regularisation.
lgbm_params.update({
    'reg_alpha': 0.1,
    'reg_lambda': 0,
})

# Remaining settings: class weighting, training control and reporting.
lgbm_params.update({
    'scale_pos_weight': 1,
    'early_stopping_round': 20,
    'metric': 'rmsle',
    'verbose': 0,
})

In [90]:
# Sweep 'reg_lambda' over [0, 0.01, 0.1].
# NOTE(review): the recorded output prints 'reg_lambda': 10 and no
# 'early_stopping_round' key, so the dict used for that run was not the one
# shown in the preceding cell -- presumably kernel state left over from an
# earlier execution. Re-run from a fresh kernel before trusting these numbers.
_ = fine_tune_param_cv(lgbm_params, 'reg_lambda', [0, 0.01, 0.1])

{'num_leaves': 64, 'reg_alpha': 0.1, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'scale_pos_weight': 1, 'learning_rate': 0.05, 'nthread': -1, 'min_split_gain': 0.4, 'subsample': 0.75, 'reg_lambda': 10, 'objective': 'regression', 'verbose': 0, 'max_depth': 10, 'metric': 'rmsle', 'boosting_type': 'gbdt'}
===> fine tuning reg_lambda...
fine tuning reg_lambda = 0
[50]	cv_agg's rmsle: 0.414976 + 0.0106041
CV RMSE : test_rmse_mean = 0.4149757
fine tuning reg_lambda = 0.01
[50]	cv_agg's rmsle: 0.415468 + 0.0104646
CV RMSE : test_rmse_mean = 0.4154676
fine tuning reg_lambda = 0.1
[50]	cv_agg's rmsle: 0.414419 + 0.0104
CV RMSE : test_rmse_mean = 0.4144187
best reg_lambda = 0.1, min_test_rmse_mean = 0.414418724084


```
# Final parameter set: the configuration used for the 'reg_lambda' sweep with
# 'reg_lambda' set to 0.1, the best value in that sweep's recorded output.
lgbm_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'nthread': -1,
        'subsample': 0.75,
        'subsample_freq': 1,
        'colsample_bytree': 0.6,
        'min_split_gain': 0.4,
    
        'num_leaves': 2 ** 6,
        'learning_rate': 0.05,
        'max_depth': 10,
        
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        
        'scale_pos_weight': 1,
        'early_stopping_round': 20,
        'metric': 'rmsle',
        'verbose': 0
    }
```