In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
sys.path.append(module_path)

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
color = sns.color_palette()
%matplotlib inline
matplotlib.style.use('ggplot')

import time
import numpy as np
import pandas as pd
from IPython.display import display

# remove warnings
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from itertools import product

# my module
from conf.configure import Configure
from utils import data_utils, dataframe_util
from utils.common_utils import common_num_range

import model.get_datasets as gd

# Load Datasets

In [2]:
train = pd.read_csv(Configure.base_path + 'huang_lin/train_dataHL.csv')
test = pd.read_csv(Configure.base_path + 'huang_lin/test_dataHL.csv')

y_train = train['orderType']
train.drop(['orderType'], axis=1, inplace=True)

df_columns = train.columns.values
print('train: {}, test: {}, feature count: {}, orderType 1:0 = {:.5f}'.format(
    train.shape[0], test.shape[0], len(df_columns), 1.0*sum(y_train) / len(y_train)))

train: 40307, test: 10076, feature count: 368, orderType 1:0 = 0.16436


In [3]:
dtrain = lgbm.Dataset(train, label=y_train)

# Parameter Fine Tuning

In [4]:
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_curve
from itertools import product

def model_cross_validate(model_params, cv_param_dict, dtrain, cv_num_boost_round=4000, early_stopping_rounds=100, cv_nfold=5, stratified=True):
    params_value = []
    params_name = cv_param_dict.keys()
    max_auc = 0
    for param in params_name:
        params_value.append(cv_param_dict[param])

    for param_pair in product(*params_value):
        param_str = ''
        for i in xrange(len(param_pair)):
            param_str += params_name[i] + '=' + str(param_pair[i]) + ' '
            model_params[params_name[i]] = param_pair[i]
        
        start = time.time()
        cv_result = lgbm.cv(model_params, dtrain, num_boost_round=cv_num_boost_round, stratified=stratified,
                           nfold=cv_nfold, early_stopping_rounds=early_stopping_rounds)
        
        best_num_boost_rounds = len(cv_result['auc-mean'])
        mean_test_auc = np.mean(cv_result['auc-mean'][best_num_boost_rounds-6 : best_num_boost_rounds-1])
        if mean_test_auc > max_auc:
            best_param = param_pair
            max_auc = mean_test_auc
        
        end = time.time()
        print('{}, best_ntree_limit:{}, auc = {:.7f}, cost: {}s'.format(param_str, best_num_boost_rounds,
                                                                              mean_test_auc, end-start))
    param_str = ''
    for i in xrange(len(best_param)):
        param_str += params_name[i] + '=' + str(best_param[i]) + ' '
        model_params[params_name[i]] = best_param[i]
    print('===========best paramter: {} auc={:.7f}==========='.format(param_str, max_auc))

### Step 1: Fix learning rate and number of estimators for tuning tree-based parameters

### Baseline model

In [11]:
lgbm_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'min_split_gain': 0,
        'min_child_weight': 4,
        'learning_rate': 0.1,
        'num_leaves': 64,
        'min_sum_hessian_in_leaf': 0.1,
        'feature_fraction': 0.5,
        'feature_fraction_seed': 10,
        'bagging_fraction': 0.6,
        'bagging_seed': 10,
        'lambda_l1': 0.5,
        'lambda_l2': 0.5,
        'num_thread': -1,
        'verbose': 0
    }

In [12]:
print('---> calc baseline model')

cv_num_boost_round=4000
early_stopping_rounds=100
cv_nfold=5
stratified=True

cv_result = lgbm.cv(lgbm_params,
                    dtrain,
                    nfold=cv_nfold,
                    stratified=stratified,
                    num_boost_round=cv_num_boost_round,
                    early_stopping_rounds=early_stopping_rounds,
                    )
best_num_boost_rounds = len(cv_result['auc-mean'])
mean_test_auc = np.mean(cv_result['auc-mean'][best_num_boost_rounds-6 : best_num_boost_rounds-1])

print('mean_test_auc = {:.7f}\n'.format(mean_test_auc))

---> calc baseline model
mean_test_auc = 0.9717915



### Fine tune *num_leaves* and *min_child_weight*

In [8]:
cv_paramters = {'num_leaves':[2**5, 2**6, 2**7]}
model_cross_validate(lgbm_params, cv_paramters, dtrain)

num_leaves=32 , best_ntree_limit:306, auc = 0.9707847, cost: 28.4157309532s
num_leaves=64 , best_ntree_limit:441, auc = 0.9717915, cost: 60.2170989513s
num_leaves=128 , best_ntree_limit:305, auc = 0.9710133, cost: 68.8745148182s


### Tune bagging_fraction and feature_fraction

In [9]:
cv_paramters = {'bagging_fraction':common_num_range(0.5, 1.1, 0.1), 'feature_fraction':common_num_range(0.5,1.1,0.1)}
model_cross_validate(lgbm_params, cv_paramters,dtrain)

bagging_fraction=0.5 feature_fraction=0.5 , best_ntree_limit:441, auc = 0.9717915, cost: 60.1709551811s
bagging_fraction=0.5 feature_fraction=0.6 , best_ntree_limit:494, auc = 0.9713985, cost: 68.082005024s
bagging_fraction=0.5 feature_fraction=0.7 , best_ntree_limit:436, auc = 0.9716777, cost: 64.8330609798s
bagging_fraction=0.5 feature_fraction=0.8 , best_ntree_limit:400, auc = 0.9715962, cost: 63.5356798172s
bagging_fraction=0.5 feature_fraction=0.9 , best_ntree_limit:342, auc = 0.9711587, cost: 59.5559711456s
bagging_fraction=0.5 feature_fraction=1.0 , best_ntree_limit:476, auc = 0.9714961, cost: 76.5802919865s
bagging_fraction=0.6 feature_fraction=0.5 , best_ntree_limit:441, auc = 0.9717915, cost: 59.8302419186s
bagging_fraction=0.6 feature_fraction=0.6 , best_ntree_limit:494, auc = 0.9713985, cost: 68.1966850758s
bagging_fraction=0.6 feature_fraction=0.7 , best_ntree_limit:436, auc = 0.9716777, cost: 64.6554200649s
bagging_fraction=0.6 feature_fraction=0.8 , best_ntree_limit:400,

### Tuning Regularization Parameters: lambda_l1, lambda_l2

In [10]:
cv_paramters = {'lambda_l1':[0, 1, 10, 50, 100],'lambda_l2':[0, 1, 10, 50, 100]}
model_cross_validate(lgbm_params,cv_paramters,dtrain)

lambda_l1=0 lambda_l2=0 , best_ntree_limit:823, auc = 0.9710004, cost: 93.9876971245s
lambda_l1=0 lambda_l2=1 , best_ntree_limit:681, auc = 0.9717348, cost: 89.2005209923s
lambda_l1=0 lambda_l2=10 , best_ntree_limit:310, auc = 0.9712283, cost: 49.6835539341s
lambda_l1=0 lambda_l2=50 , best_ntree_limit:420, auc = 0.9711120, cost: 61.5471329689s
lambda_l1=0 lambda_l2=100 , best_ntree_limit:416, auc = 0.9711464, cost: 60.1774940491s


KeyboardInterrupt: 

### Reducing Learning Rate and Done!

In [11]:
lgbm_params['feature_fraction'] = 0.9
lgbm_params['bagging_fraction'] = 0.7

In [12]:
lgbm_params

{'bagging_fraction': 0.7,
 'bagging_seed': 10,
 'boosting_type': 'gbdt',
 'feature_fraction': 0.9,
 'feature_fraction_seed': 10,
 'lambda_l1': 0.5,
 'lambda_l2': 0.5,
 'learning_rate': 0.1,
 'max_bin': 255,
 'metric': 'auc',
 'min_child_weight': 1,
 'min_split_gain': 0,
 'min_sum_hessian_in_leaf': 0.1,
 'num_leaves': 64,
 'num_thread': -1,
 'objective': 'binary',
 'verbose': 0}