In [7]:
import os
import sys
module_path = os.path.abspath(os.path.join('../../'))
sys.path.append(module_path)

import cPickle
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
color = sns.color_palette()
%matplotlib inline
matplotlib.style.use('ggplot')

import time
import numpy as np
import pandas as pd
from IPython.display import display

# remove warnings
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from itertools import product

# my module
from conf.configure import Configure
from utils import data_utils, dataframe_util
from utils.common_utils import common_num_range

import model.get_datasets as gd
from conf.configure import Configure

# Load Datasets

In [10]:
# Build the model inputs
def pre_train():
    """Load the pickled level-1 train/test tables and split off ids and labels.

    Reads ``./level1_train.pkl`` and ``./level1_test.pkl`` (relative to the
    current working directory). The loaded objects are used as pandas-style
    frames (``drop``, ``columns``, ``shape``) -- presumably DataFrames.

    Returns:
        tuple: ``(train, y_train_all, id_train, test, id_test)`` where
            * ``train`` -- training features, restricted to and ordered by
              the columns of ``test``;
            * ``y_train_all`` -- the ``orderType`` column of the train table;
            * ``id_train`` -- the ``userid`` column of the train table;
            * ``test`` -- test features without ``userid``;
            * ``id_test`` -- the ``userid`` column of the test table.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load
    # files that come from a trusted source.
    with open('./level1_train.pkl', "rb") as train_file:
        train = cPickle.load(train_file)
    with open('./level1_test.pkl', "rb") as test_file:
        test = cPickle.load(test_file)

    # Pull the label and id columns out before they are removed from the features.
    y_train_all = train['orderType']
    id_train = train['userid']
    train = train.drop(['orderType', 'userid'], axis=1)

    id_test = test['userid']
    test = test.drop(['userid'], axis=1)

    # Select train's columns using test's column labels so both tables
    # expose the same features in the same order.
    shared_columns = test.columns.values
    train = train[shared_columns]

    print("train_all: ({}), test: ({})".format(train.shape, test.shape))
    return train, y_train_all, id_train, test, id_test

# Load the level-1 train/test tables once; later cells reuse these names.
print("load level-1 train test datasets")
train, y_train_all, id_train, test, id_test = pre_train()

# Column labels of the training features, passed as feature_names to the DMatrix objects.
df_columns = train.columns

load level-1 train test datasets
train_all: ((40307, 230)), test: ((10076, 230))


In [11]:
# Wrap the data for xgboost. Only the training matrix is given labels (y_train_all).
# NOTE(review): train is passed as a raw array (.values) while test is passed
# as-is -- confirm the asymmetry is intentional.
dtrain = xgb.DMatrix(train.values, y_train_all, feature_names=df_columns)
dtest = xgb.DMatrix(test, feature_names=df_columns)

# Parameter Fine Tuning

In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from itertools import product

def model_cross_validate(xgb_params, cv_param_dict, dtrain, cv_num_boost_round=4000, early_stopping_rounds=100, cv_nfold=5, stratified=True):
    """Grid-search xgboost parameters with ``xgb.cv`` and keep the best combination.

    Every combination in the Cartesian product of the candidate lists in
    ``cv_param_dict`` is written into ``xgb_params`` and evaluated with
    ``xgb.cv``. The score of a combination is the mean of the
    ``'test-auc-mean'`` column over the last rows of the CV result (labels
    ``n - 6`` through ``n - 1`` inclusive, where ``n = len(cv_result)``).
    One progress line is printed per combination and a summary line at the end.

    Args:
        xgb_params (dict): Base xgboost parameters. **Mutated in place**: it
            holds each tried combination during the search and the best
            combination when the function returns. Because the
            ``'test-auc-mean'`` column is read, the parameters must make
            ``xgb.cv`` report the ``auc`` metric.
        cv_param_dict (dict): Maps a parameter name to an iterable of
            candidate values.
        dtrain: Training data handed to ``xgb.cv`` unchanged.
        cv_num_boost_round (int): ``num_boost_round`` passed to ``xgb.cv``.
        early_stopping_rounds (int): ``early_stopping_rounds`` passed to ``xgb.cv``.
        cv_nfold (int): ``nfold`` passed to ``xgb.cv``.
        stratified (bool): ``stratified`` passed to ``xgb.cv``.

    Returns:
        None. The result is communicated through ``xgb_params`` and stdout.

    Raises:
        ValueError: If no combination scored above 0 (for example because a
            candidate list is empty), so there is no best combination to
            report. ``xgb_params`` then keeps the last combination tried.
    """
    # Materialise the names as a list: Python 3 key views cannot be indexed
    # or relied on positionally, and a list works on Python 2 as well.
    params_name = list(cv_param_dict.keys())
    params_value = [cv_param_dict[name] for name in params_name]

    best_param = None  # stays None when nothing beats the initial max_auc
    max_auc = 0

    for param_pair in product(*params_value):
        param_str = ''
        for name, value in zip(params_name, param_pair):
            param_str += name + '=' + str(value) + ' '
            xgb_params[name] = value

        start = time.time()
        cv_result = xgb.cv(xgb_params, dtrain, num_boost_round=cv_num_boost_round, stratified=stratified,
                           nfold=cv_nfold, early_stopping_rounds=early_stopping_rounds)

        # Number of rows in the CV result is reported as the tree count.
        best_num_boost_rounds = len(cv_result)
        # .loc slices by label and includes both ends, so this averages the
        # final rows of the result rather than a single, noisier round.
        mean_test_auc = cv_result.loc[best_num_boost_rounds - 6: best_num_boost_rounds - 1, 'test-auc-mean'].mean()
        if mean_test_auc > max_auc:
            best_param = param_pair
            max_auc = mean_test_auc

        end = time.time()
        print('Tuning paramter: {}, best_ntree_limit:{}, auc = {:.7f}, cost: {}s'.format(param_str, best_num_boost_rounds,
                                                                              mean_test_auc, end-start))

    if best_param is None:
        # Previously this path failed with an UnboundLocalError on best_param.
        raise ValueError('no parameter combination produced a test AUC above 0; '
                         'check that every candidate list in cv_param_dict is non-empty')

    # Leave xgb_params set to the winning combination for the caller.
    param_str = ''
    for name, value in zip(params_name, best_param):
        param_str += name + '=' + str(value) + ' '
        xgb_params[name] = value
    print('===========best paramter: {} auc={:.7f}==========='.format(param_str, max_auc))

### Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

### Baseline model

In [13]:
# Baseline xgboost parameters. The tuning cells below overwrite entries of
# this dict in place via model_cross_validate.
xgb_params = {
    # Held at 0.1 while the other parameters are tuned; set to 0.01 in the last step.
    'eta': 0.1,
    # Starting values for the max_depth / min_child_weight grid search.
    'max_depth': 5,
    'min_child_weight': 1,
    'scale_pos_weight': 1,
    'gamma': 0,
    # Starting values for the subsample / colsample_bytree grid search.
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # The CV code reads the 'train-auc-mean' / 'test-auc-mean' columns, so this must stay 'auc'.
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    # NOTE(review): presumably selects GPU tree construction on device 0 and
    # needs a GPU-enabled xgboost build -- confirm for the target environment.
    'updater': 'grow_gpu',
    'gpu_id':0,
    'nthread': -1,
    'silent': 1,
    'booster': 'gbtree',
}

In [14]:
# Baseline: cross-validate the untuned xgb_params to get a reference AUC.
print('---> calc baseline model')

# Same values as the defaults of model_cross_validate.
cv_num_boost_round=4000
early_stopping_rounds=100
cv_nfold=5
stratified=True

cv_result = xgb.cv(xgb_params,
                   dtrain,
                   nfold=cv_nfold,
                   stratified=stratified,
                   num_boost_round=cv_num_boost_round,
                   early_stopping_rounds=early_stopping_rounds,
                   )
# Number of rows in the CV result, used as the boosting-round count.
best_num_boost_rounds = len(cv_result)
# .loc slices by label and includes both ends: average over the final rows
# (labels n-6 .. n-1) instead of reading a single round.
mean_train_auc = cv_result.loc[best_num_boost_rounds-6 : best_num_boost_rounds-1, 'train-auc-mean'].mean()
mean_test_auc = cv_result.loc[best_num_boost_rounds-6 : best_num_boost_rounds-1, 'test-auc-mean'].mean()

print('mean_train_auc = {:.7f} , mean_test_auc = {:.7f}\n'.format(mean_train_auc, mean_test_auc))

---> calc baseline model
mean_train_auc = 0.9854473 , mean_test_auc = 0.9734381



### Fine tune *max_depth* and *min_child_weight*

In [15]:
# Coarse grid: max_depth in 5,7,9,11,13 crossed with min_child_weight in 1,3,5,7,9.
# model_cross_validate leaves the best combination it finds in xgb_params.
cv_paramters = {'max_depth':range(5,15,2),'min_child_weight':range(1,10,2)}
model_cross_validate(xgb_params, cv_paramters, dtrain)

Tuning paramter: max_depth=5 min_child_weight=1 , best_ntree_limit:73, auc = 0.9734381, cost: 36.0512120724s
Tuning paramter: max_depth=5 min_child_weight=3 , best_ntree_limit:63, auc = 0.9736066, cost: 34.1637780666s
Tuning paramter: max_depth=5 min_child_weight=5 , best_ntree_limit:64, auc = 0.9734786, cost: 34.3443319798s
Tuning paramter: max_depth=5 min_child_weight=7 , best_ntree_limit:68, auc = 0.9733758, cost: 34.8901040554s
Tuning paramter: max_depth=5 min_child_weight=9 , best_ntree_limit:63, auc = 0.9734722, cost: 34.2463109493s
Tuning paramter: max_depth=7 min_child_weight=1 , best_ntree_limit:82, auc = 0.9733780, cost: 49.3304550648s
Tuning paramter: max_depth=7 min_child_weight=3 , best_ntree_limit:73, auc = 0.9732754, cost: 46.6479659081s
Tuning paramter: max_depth=7 min_child_weight=5 , best_ntree_limit:54, auc = 0.9733374, cost: 41.8178870678s
Tuning paramter: max_depth=7 min_child_weight=7 , best_ntree_limit:45, auc = 0.9731912, cost: 39.2651641369s
Tuning paramter: ma

In [12]:
# Unit-step grid: max_depth in 4,5,6 crossed with min_child_weight in 6,7,8.
# NOTE(review): this cell's execution count (12) is lower than the preceding
# cell's (15), so the saved outputs come from different runs -- re-run in
# order before trusting the selected values.
cv_paramters = {'max_depth':range(4,7,1),'min_child_weight':range(6,9,1)}
model_cross_validate(xgb_params, cv_paramters, dtrain)

Tuning paramter: max_depth=4 min_child_weight=6 , best_ntree_limit:54, auc = 0.9732905, cost: 25.8395490646s
Tuning paramter: max_depth=4 min_child_weight=7 , best_ntree_limit:58, auc = 0.9734178, cost: 26.1778271198s
Tuning paramter: max_depth=4 min_child_weight=8 , best_ntree_limit:64, auc = 0.9734460, cost: 27.112226963s
Tuning paramter: max_depth=5 min_child_weight=6 , best_ntree_limit:56, auc = 0.9732834, cost: 29.7443609238s
Tuning paramter: max_depth=5 min_child_weight=7 , best_ntree_limit:59, auc = 0.9733319, cost: 30.3012568951s
Tuning paramter: max_depth=5 min_child_weight=8 , best_ntree_limit:52, auc = 0.9731810, cost: 29.3285639286s
Tuning paramter: max_depth=6 min_child_weight=6 , best_ntree_limit:53, auc = 0.9730169, cost: 33.7446899414s
Tuning paramter: max_depth=6 min_child_weight=7 , best_ntree_limit:54, auc = 0.9731490, cost: 33.6167500019s
Tuning paramter: max_depth=6 min_child_weight=8 , best_ntree_limit:47, auc = 0.9730791, cost: 32.1935799122s


In [14]:
xgb_params

{'booster': 'gbtree',
 'colsample_bytree': 0.8,
 'eta': 0.1,
 'eval_metric': 'auc',
 'gamma': 0,
 'gpu_id': 0,
 'max_depth': 4,
 'min_child_weight': 8,
 'nthread': -1,
 'objective': 'binary:logistic',
 'scale_pos_weight': 1,
 'silent': 1,
 'subsample': 0.8,
 'updater': 'grow_gpu'}

### Tune gamma

In [16]:
# Candidate gamma values, built by common_num_range(0, 10, 1).
# NOTE(review): the search call below is commented out, yet the xgb_params
# displayed later in this notebook has gamma=2 (the baseline value is 0).
# That value must come from a run that is not preserved here -- confirm.
cv_paramters={'gamma':common_num_range(0,10,1)}
# model_cross_validate(xgb_params, cv_paramters, dtrain)

### Tune subsample and colsample_bytree

In [18]:
# Grid over row subsampling and per-tree column subsampling; both candidate
# lists come from common_num_range(0.5, 1.1, 0.1).
# model_cross_validate leaves the best combination it finds in xgb_params.
cv_paramters = {'subsample':common_num_range(0.5, 1.1, 0.1), 'colsample_bytree':common_num_range(0.5, 1.1, 0.1)}
model_cross_validate(xgb_params,cv_paramters,dtrain)

Tuning paramter: subsample=0.5 colsample_bytree=0.5 , best_ntree_limit:70, auc = 0.9731489, cost: 24.0094490051s
Tuning paramter: subsample=0.5 colsample_bytree=0.6 , best_ntree_limit:54, auc = 0.9733420, cost: 22.4456419945s
Tuning paramter: subsample=0.5 colsample_bytree=0.7 , best_ntree_limit:60, auc = 0.9732883, cost: 24.2585279942s
Tuning paramter: subsample=0.5 colsample_bytree=0.8 , best_ntree_limit:63, auc = 0.9731890, cost: 26.9154629707s
Tuning paramter: subsample=0.5 colsample_bytree=0.9 , best_ntree_limit:40, auc = 0.9731698, cost: 24.2228560448s
Tuning paramter: subsample=0.5 colsample_bytree=1.0 , best_ntree_limit:57, auc = 0.9732814, cost: 27.7345571518s
Tuning paramter: subsample=0.6 colsample_bytree=0.5 , best_ntree_limit:49, auc = 0.9731815, cost: 21.2189290524s
Tuning paramter: subsample=0.6 colsample_bytree=0.6 , best_ntree_limit:57, auc = 0.9733749, cost: 22.9462361336s
Tuning paramter: subsample=0.6 colsample_bytree=0.7 , best_ntree_limit:59, auc = 0.9735234, cost

In [19]:
xgb_params

{'booster': 'gbtree',
 'colsample_bytree': 0.7,
 'eta': 0.1,
 'eval_metric': 'auc',
 'gamma': 2,
 'gpu_id': 0,
 'max_depth': 4,
 'min_child_weight': 8,
 'nthread': -1,
 'objective': 'binary:logistic',
 'scale_pos_weight': 1,
 'silent': 1,
 'subsample': 0.6,
 'updater': 'grow_gpu'}

### Tuning Regularization Parameters: alpha, lambda

In [21]:
# Grid over the 'alpha' and 'lambda' regularisation parameters: 5 x 5 = 25
# combinations of the same five candidate values for each.
# model_cross_validate leaves the best combination it finds in xgb_params.
cv_paramters = {'alpha':[1e-5, 0.1, 1, 10, 100],
                'lambda':[1e-5, 0.1, 1, 10, 100]}
model_cross_validate(xgb_params,cv_paramters,dtrain)

Tuning paramter: alpha=1e-05 lambda=1e-05 , best_ntree_limit:72, auc = 0.9734046, cost: 26.0356299877s
Tuning paramter: alpha=1e-05 lambda=0.1 , best_ntree_limit:67, auc = 0.9734315, cost: 25.136633873s
Tuning paramter: alpha=1e-05 lambda=1 , best_ntree_limit:59, auc = 0.9735237, cost: 24.3639791012s
Tuning paramter: alpha=1e-05 lambda=10 , best_ntree_limit:69, auc = 0.9734861, cost: 25.3926680088s
Tuning paramter: alpha=1e-05 lambda=100 , best_ntree_limit:231, auc = 0.9733721, cost: 47.7388989925s
Tuning paramter: alpha=0.1 lambda=1e-05 , best_ntree_limit:58, auc = 0.9733950, cost: 23.9442150593s
Tuning paramter: alpha=0.1 lambda=0.1 , best_ntree_limit:67, auc = 0.9733426, cost: 25.2285971642s
Tuning paramter: alpha=0.1 lambda=1 , best_ntree_limit:67, auc = 0.9735321, cost: 25.1685349941s
Tuning paramter: alpha=0.1 lambda=10 , best_ntree_limit:90, auc = 0.9734943, cost: 28.3662488461s
Tuning paramter: alpha=0.1 lambda=100 , best_ntree_limit:136, auc = 0.9733611, cost: 34.6439578533s
T

### Reducing Learning Rate and Done!

In [22]:
xgb_params

{'alpha': 0.1,
 'booster': 'gbtree',
 'colsample_bytree': 0.7,
 'eta': 0.1,
 'eval_metric': 'auc',
 'gamma': 2,
 'gpu_id': 0,
 'lambda': 1,
 'max_depth': 4,
 'min_child_weight': 8,
 'nthread': -1,
 'objective': 'binary:logistic',
 'scale_pos_weight': 1,
 'silent': 1,
 'subsample': 0.6,
 'updater': 'grow_gpu'}

In [23]:
# Final step: lower the learning rate from the 0.1 used during tuning to 0.01.
xgb_params['eta'] = 0.01

In [24]:
xgb_params

{'alpha': 0.1,
 'booster': 'gbtree',
 'colsample_bytree': 0.7,
 'eta': 0.01,
 'eval_metric': 'auc',
 'gamma': 2,
 'gpu_id': 0,
 'lambda': 1,
 'max_depth': 4,
 'min_child_weight': 8,
 'nthread': -1,
 'objective': 'binary:logistic',
 'scale_pos_weight': 1,
 'silent': 1,
 'subsample': 0.6,
 'updater': 'grow_gpu'}