In [1]:
import os
import sys

import cPickle

module_path = os.path.abspath(os.path.join('../../'))
sys.path.append(module_path)

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
color = sns.color_palette()
%matplotlib inline
matplotlib.style.use('ggplot')

from utils import xgb_utils
from conf.configure import Configure

In [2]:
train = pd.read_csv(Configure.base_path + 'train/orderFuture_train.csv', encoding='utf8')
test = pd.read_csv(Configure.base_path + 'test/orderFuture_test.csv', encoding='utf8')
print train.shape, test.shape

(40307, 2) (10076, 1)


## 加载 lq 数据集 RF 和 ET 预测结果

In [3]:
with open('lq_dataset_et_train.pkl', "rb") as f:
    lq_et_train = cPickle.load(f)
with open('lq_dataset_et_test.pkl', "rb") as f:
    lq_et_test = cPickle.load(f)

with open('lq_dataset_rf_train.pkl', "rb") as f:
    lq_rf_train = cPickle.load(f)
with open('lq_dataset_rf_test.pkl', "rb") as f:
    lq_rf_test = cPickle.load(f)

train = pd.merge(train, lq_et_train, on='userid', how='left')
train = pd.merge(train, lq_rf_train, on='userid', how='left')

test = pd.merge(test, lq_et_test, on='userid', how='left')
test = pd.merge(test, lq_rf_test, on='userid', how='left')

print train.shape, test.shape

(40307, 22) (10076, 21)


## 加载 lq 数据集 Adaboost 预测结果

In [4]:
with open('lq_dataset_ada_train.pkl', "rb") as f:
    lq_train = cPickle.load(f)
with open('lq_dataset_ada_test.pkl', "rb") as f:
    lq_test = cPickle.load(f)

train = pd.merge(train, lq_train, on='userid', how='left')
test = pd.merge(test, lq_test, on='userid', how='left')

print train.shape, test.shape

(40307, 23) (10076, 22)


## 加载 lq 数据集 XGB 和 LGBM 预测结果

In [5]:
train_files = os.listdir('./train')
for train_f in train_files:
    if train_f.startswith('lq'):
        lq_train = pd.read_csv('./train/'+train_f)
        train = pd.merge(train, lq_train, on='userid', how='left')

test_files = os.listdir('./test')
for test_f in test_files:
    if test_f.startswith('lq'):
        lq_test = pd.read_csv('./test/'+test_f)
        test = pd.merge(test, lq_test, on='userid', how='left')

print train.shape, test.shape

(40307, 63) (10076, 62)


## 加载 lq 数据集 Catboost 预测结果

In [6]:
train_files = os.listdir('./train/catboost')
for train_f in train_files:
    if train_f.startswith('lq'):
        lq_train = pd.read_csv('./train/catboost/'+train_f)
        train = pd.merge(train, lq_train, on='userid', how='left')

test_files = os.listdir('./test/catboost')
for test_f in test_files:
    if test_f.startswith('lq'):
        lq_test = pd.read_csv('./test/catboost/'+test_f)
        test = pd.merge(test, lq_test, on='userid', how='left')

print train.shape, test.shape

(40307, 73) (10076, 72)


## 加载 hl 数据集 XGB 和 LGBM 预测结果

In [7]:
train_files = os.listdir('./train')
for train_f in train_files:
    if train_f.startswith('hl'):
        lq_train = pd.read_csv('./train/'+train_f)
        train = pd.merge(train, lq_train, on='userid', how='left')

test_files = os.listdir('./test')
for test_f in test_files:
    if test_f.startswith('hl'):
        lq_test = pd.read_csv('./test/'+test_f)
        test = pd.merge(test, lq_test, on='userid', how='left')
        
print train.shape, test.shape

(40307, 113) (10076, 112)


## 加载 hl 数据集 RF 和 ET 预测结果

In [8]:
with open('hl_dataset_et_train.pkl', "rb") as f:
    hl_et_train = cPickle.load(f)
with open('hl_dataset_et_test.pkl', "rb") as f:
    hl_et_test = cPickle.load(f)

with open('hl_dataset_rf_train.pkl', "rb") as f:
    hl_rf_train = cPickle.load(f)
with open('hl_dataset_rf_test.pkl', "rb") as f:
    hl_rf_test = cPickle.load(f)

train = pd.merge(train, hl_et_train, on='userid', how='left')
train = pd.merge(train, hl_rf_train, on='userid', how='left')

test = pd.merge(test, hl_et_test, on='userid', how='left')
test = pd.merge(test, hl_rf_test, on='userid', how='left')

print train.shape, test.shape

(40307, 133) (10076, 132)


## 加载 hl 数据集 Catboost 预测结果

In [9]:
train_files = os.listdir('./train/catboost')
for train_f in train_files:
    if train_f.startswith('hl'):
        lq_train = pd.read_csv('./train/catboost/'+train_f)
        train = pd.merge(train, lq_train, on='userid', how='left')

test_files = os.listdir('./test/catboost')
for test_f in test_files:
    if test_f.startswith('hl'):
        lq_test = pd.read_csv('./test/catboost/'+test_f)
        test = pd.merge(test, lq_test, on='userid', how='left')

print train.shape, test.shape

(40307, 141) (10076, 140)


## 加载 sqg 数据集 XGB 和 LGBM 预测结果

In [10]:
train_files = os.listdir('./train')
for train_f in train_files:
    if train_f.startswith('sqg'):
        sqg_train = pd.read_csv('./train/'+train_f)
        train = pd.merge(train, sqg_train, on='userid', how='left')

test_files = os.listdir('./test')
for test_f in test_files:
    if test_f.startswith('sqg'):
        sqg_test = pd.read_csv('./test/'+test_f)
        test = pd.merge(test, sqg_test, on='userid', how='left')

print train.shape, test.shape

(40307, 186) (10076, 185)


## 加载 sqg 数据集 RF 和 ET 预测结果

In [11]:
with open('sqg_dataset_et_train.pkl', "rb") as f:
    sqg_et_train = cPickle.load(f)
with open('sqg_dataset_et_test.pkl', "rb") as f:
    sqg_et_test = cPickle.load(f)

with open('sqg_dataset_rf_train.pkl', "rb") as f:
    sqg_rf_train = cPickle.load(f)
with open('sqg_dataset_rf_test.pkl', "rb") as f:
    sqg_rf_test = cPickle.load(f)

train = pd.merge(train, sqg_et_train, on='userid', how='left')
train = pd.merge(train, sqg_rf_train, on='userid', how='left')

test = pd.merge(test, sqg_et_test, on='userid', how='left')
test = pd.merge(test, sqg_rf_test, on='userid', how='left')

print train.shape, test.shape

(40307, 206) (10076, 205)


## 加载 sqg 数据集 Catboost 预测结果

In [12]:
train_files = os.listdir('./train/')
for train_f in train_files:
    if train_f.startswith('sqg_cat'):
        sqg_train = pd.read_csv('./train/'+train_f)
        sqg_train.columns = ['userid', 'sqg_{}'.format(sqg_train.columns.values[1])]
        train = pd.merge(train, sqg_train, on='userid', how='left')

test_files = os.listdir('./test/')
for test_f in test_files:
    if test_f.startswith('sqg_cat'):
        sqg_test = pd.read_csv('./test/'+test_f)
        sqg_test.columns = ['userid', 'sqg_{}'.format(sqg_test.columns.values[1])]
        test = pd.merge(test, sqg_test, on='userid', how='left')

print train.shape, test.shape

(40307, 211) (10076, 210)


## 添加原始主要特征

In [13]:
from get_datasets import load_datasets

with open('../train_0.97329.pkl', "rb") as f:
    lq_train = cPickle.load(f)
with open('../test_0.97329.pkl', "rb") as f:
    lq_test = cPickle.load(f)

used_features = ['userid', 'goodorder_vs_actiontype_1_ratio', 'isOrder', 'total_good_order_ratio',
                'history_order_type_sum_lg0', 'goodorder_vs_actiontype_5_ratio', 'finalAction_4',
                'action_type_511_time_delta_min', 'finalAction_8', 'action_type_511_time_delta_max',
                'goodorder_vs_actiontype_6_ratio', 'type_1to4valuemean', 'histord_sum_cont4',
                'age_lg90', 'three_gram_789_last_time', 'three_gram_789_time_mean',
                'action_type_710_time_delta_min', 'three_gram_456_time_min', 
                 'three_gram_action_456_ratio', 'pay_money_min_delta']

train = pd.merge(train, lq_train[used_features], on='userid', how='left')
test = pd.merge(test, lq_test[used_features], on='userid', how='left')

hl_train = pd.read_csv(Configure.base_path + 'huang_lin/train_dataHL.csv')
hl_test = pd.read_csv(Configure.base_path + 'huang_lin/test_dataHL.csv')
used_features = ['userid', 'endclosest_3_4', 'endclosest_3_3', 'actionType_recent_time_3',
                 'type_3to4valueamin', 'minute_last', 'rangeTime_to_end5','typeend3_4diff',
                 'endclosest_1_3', 'action_end1Browse', 'actionType_recent_time_4',
                 'actionType5_Per', 'type_4to4valuemean', 'hour_last', 'type_3to4valuemean',
                 'actionType_recent_time_1', 'endclosest_1_4', 'rangeTime_to_begin6',
                 'endclosest_4_4']
# train = pd.merge(train, hl_train[used_features], on='userid', how='left')
# test = pd.merge(test, hl_test[used_features], on='userid', how='left')

qg_train = pd.read_csv(Configure.base_path + 'sun_qian_guo/train.csv')
qg_test = pd.read_csv(Configure.base_path + 'sun_qian_guo/test.csv')
used_features = ['userid', 'recentmin5', 'lastBrowse', 'browseLastTwo',
                '1To6Timemin', '5To6Timemin', 'recentmax5', 'typeDismax6',
                'recentmin1', '6To5Timemin']
train = pd.merge(train, qg_train[used_features], on='userid', how='left')
test = pd.merge(test, qg_test[used_features], on='userid', how='left')

print train.shape, test.shape

(40307, 239) (10076, 238)


In [14]:
# 删除一些特征
remove_features = ['histord_sum_cont4', 'action_type_511_time_delta_min', 'finalAction_8',
                  'goodorder_vs_actiontype_6_ratio', 'type_1to4valuemean', 'total_good_order_ratio',
                  'goodorder_vs_actiontype_5_ratio']
train.drop(remove_features, axis=1, inplace=True)
test.drop(remove_features, axis=1, inplace=True)

print train.shape, test.shape

(40307, 232) (10076, 231)


In [15]:
# plt.figure(figsize=(8,6))
# sns.heatmap(train.corr(), xticklabels=False, yticklabels=False)
# plt.show()

# Save level-1 dataset

In [16]:
with open('level1_train.pkl', "wb") as f:
    cPickle.dump(train, f, -1)
    
with open('level1_test.pkl', "wb") as f:
    cPickle.dump(test, f, -1)

# Level 2

In [17]:
print train.shape, test.shape

(40307, 232) (10076, 231)


In [18]:
import xgboost as xgb

y_train_all = train['orderType']

submit_df = pd.DataFrame({'userid': test['userid']})

train.drop(['orderType', 'userid'], axis=1, inplace=True)
test.drop(['userid'], axis=1, inplace=True)

train = train[test.columns.values]

df_columns = train.columns.values
print('train: {}, test: {}, feature count: {}, orderType 1:0 = {:.5f}'.format(
    train.shape, test.shape, len(df_columns), 1.0*sum(y_train_all) / len(y_train_all)))

train: (40307, 230), test: (10076, 230), feature count: 230, orderType 1:0 = 0.16436


In [19]:
1.0*sum(y_train_all) / (len(y_train_all) - sum(y_train_all))

0.1966925954515765

In [21]:
xgb_params = {
    'alpha': 0.1,
    'booster': 'gbtree',
    'colsample_bytree': 0.7,
    'eta': 0.01,
    'eval_metric': 'auc',
    'gamma': 2,
    'gpu_id': 2,
    'lambda': 1,
    'max_depth': 10,
    'min_child_weight': 3,
    'nthread': -1,
    'objective': 'binary:logistic',
    'scale_pos_weight': 1,
    'silent': 1,
    'subsample': 0.6,
    'updater': 'grow_gpu'
}

print('---> cv train to choose best_num_boost_round')
dtrain_all = xgb.DMatrix(train.values, y_train_all, feature_names=df_columns)
dtest = xgb.DMatrix(test, feature_names=df_columns)

# 4-折 valid 为 10077 和 测试集大小一致
nfold = 3
cv_result = xgb.cv(dict(xgb_params),
                   dtrain_all,
                   nfold=nfold,
                   stratified=True,
                   num_boost_round=4000,
                   early_stopping_rounds=100,
                   verbose_eval=100,
                   show_stdv=False,
                   )
best_num_boost_rounds = len(cv_result)
mean_train_logloss = cv_result.loc[best_num_boost_rounds-11 : best_num_boost_rounds-1, 'train-auc-mean'].mean()
mean_test_logloss = cv_result.loc[best_num_boost_rounds-11 : best_num_boost_rounds-1, 'test-auc-mean'].mean()
print('best_num_boost_rounds = {}'.format(best_num_boost_rounds))
print('mean_train_auc = {:.7f} , mean_test_auc = {:.7f}\n'.format(mean_train_logloss, mean_test_logloss))

---> cv train to choose best_num_boost_round
[0]	train-auc:0.96922	test-auc:0.961531
[100]	train-auc:0.986698	test-auc:0.972854
[200]	train-auc:0.990352	test-auc:0.973611
[300]	train-auc:0.992603	test-auc:0.973728
[400]	train-auc:0.994027	test-auc:0.973879
[500]	train-auc:0.994957	test-auc:0.973923
best_num_boost_rounds = 443
mean_train_auc = 0.9943990 , mean_test_auc = 0.9739445



- 调参之前：    mean_train_auc = 0.9907898 , mean_test_auc = 0.9733552
- 调参之后：    mean_train_auc = 0.9810015 , mean_test_auc = 0.9736901
- 删除特征：    mean_train_auc = 0.9799322 , mean_test_auc = 0.9736705
- 10个lq-catboost：mean_train_auc = 0.9802385 , mean_test_auc = 0.9740707
- mean_train_auc = 0.9834573 , mean_test_auc = 0.9741202


In [22]:
print('---> training on total dataset')

model = xgb.train(dict(xgb_params),
                  dtrain_all,
                  num_boost_round=best_num_boost_rounds)

---> training on total dataset


In [23]:
import time

print('---> predict test')
y_pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)

submit_df['orderType'] = y_pred
submission_path = '../../result/{}_scaleposweight_{}.csv'.format('stacking', 1)

submit_df.to_csv(submission_path, index=False, columns=['userid', 'orderType'])
print('-------- predict and valid check  ------')
print('test count mean: {:.6f} , std: {:.6f}'.format(np.mean(submit_df['orderType']), np.std(submit_df['orderType'])))
print('done.')

---> predict test
-------- predict and valid check  ------
test count mean: 0.165553 , std: 0.313583
done.
