In [68]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [94]:
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

# all_data.drop(['user_click_rank_day', 'user_first_click_day', 'user_last_click_day'], axis=1, inplace=True)
# all_data.drop(['user_click_interval_first_day', 'user_click_interval_last_day'], axis=1, inplace=True)
# all_data.drop(['user_click_global','item_id_user_click_global', 'item_id_user_prob_global','item_brand_id_user_click_global', 'item_brand_id_user_prob_global','shop_id_user_click_global', 'shop_id_user_prob_global'], axis=1, inplace=True)

train_data = all_data[(all_data.day >= 18) & (all_data.day <= 23)]
test_data = all_data[all_data.day == 24]

features = list(all_data.columns)
features.remove('is_trade')
features.remove('instance_id')
target = ['is_trade']
len(features)

191

In [97]:
from sklearn.metrics import log_loss
import xgboost as xgb

dtrain = xgb.DMatrix(train_data[features], train_data[target])
dtest = xgb.DMatrix(test_data[features], test_data[target])

watchlist = [(dtrain, 'train'), (dtest, 'val')]
# watchlist = [(dtrain, 'train')]


params = {
    'n_estimators': 100,
    'max_depth': 3,
    'eta': 0.1,
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'subsample': 1.0,
    'colsample_bytree': 0.7,
#     'random_state': 1123,
#     'min_child_weight': 10
    #'scale_pos_weight':0.5
}

xgb_a = xgb.train(params, dtrain,
                  num_boost_round=500,
                  early_stopping_rounds=30,
                  evals=watchlist,
                  verbose_eval=10)

# loss_train = log_loss(train_data[target], lgb_clf.predict_proba(train_data[features]))
# loss_test = log_loss(test_data[target], lgb_clf.predict_proba(test_data[features]))
# loss_train, loss_test


xgb_a_ans = xgb_a.predict(dtest)

[0]	train-logloss:0.605199	val-logloss:0.604895
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 30 rounds.
[10]	train-logloss:0.220631	val-logloss:0.217164
[20]	train-logloss:0.12432	val-logloss:0.118514
[30]	train-logloss:0.097627	val-logloss:0.090598
[40]	train-logloss:0.090596	val-logloss:0.083176
[50]	train-logloss:0.088494	val-logloss:0.081072
[60]	train-logloss:0.087572	val-logloss:0.08029
[70]	train-logloss:0.087	val-logloss:0.079882
[80]	train-logloss:0.086559	val-logloss:0.079576
[90]	train-logloss:0.086222	val-logloss:0.079391
[100]	train-logloss:0.085923	val-logloss:0.079192
[110]	train-logloss:0.08569	val-logloss:0.079086
[120]	train-logloss:0.085456	val-logloss:0.079014
[130]	train-logloss:0.085258	val-logloss:0.078959
[140]	train-logloss:0.085045	val-logloss:0.078904
[150]	train-logloss:0.084879	val-logloss:0.078874
[160]	train-logloss:0.084716	val-logloss:0.07885
[170]	train-logloss:0

In [98]:
# from sklearn.metrics import log_loss
xgb_predict = xgb_a.predict(dtest)

loss_test = log_loss(test_data[target], xgb_predict)
loss_test

0.078819903717876796

In [72]:
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import lightgbm as lgb

lgb_clf = LGBMClassifier(n_estimators=200, max_depth=3, 
#                         num_leaves=50, 
#                        learning_rate=0.1, 
#                        seed=0, nthread=24, subsample=0.8, colsample_bytree=0.9, reg_lambda=0.005, 
                      )

cate_features = ['user_gender_id', 'user_occupation_id']

lgb_clf.fit(train_data[features], train_data['is_trade'],
          #         eval_set=[(train_x.loc[train_x.clickDay>25,feature_group_A],train_x.loc[train_x.clickDay>25,['label']])],
          feature_name=features,
          categorical_feature=cate_features,
          verbose=50,
          )

loss_train = log_loss(train_data[target], lgb_clf.predict_proba(train_data[features]))
loss_test = log_loss(test_data[target], lgb_clf.predict_proba(test_data[features]))

loss_train, loss_test



(0.083928481388601525, 0.078793599286662588)

In [83]:
# from sklearn.metrics import log_loss
lgb_predict_a = lgb_clf.predict_proba(test_data[features])[:, 1]
# lgb_predict_b = lgb_clf_b.predict_proba(test_data[features])[:, 1]
xgb_predict = xgb_a.predict(dtest)

# result = lgb_predict * 0.5 + xgb_predict * 0.5

avg_predict = lgb_predict_a * 0.5 + xgb_predict * 0.5

loss_test = log_loss(test_data[target], avg_predict)
loss_test


0.078717285963032763

In [100]:
train_data = all_data[(all_data.day >= 18) & (all_data.day <= 24)]
test_data = all_data[all_data.day == 25]

dtrain = xgb.DMatrix(train_data[features], train_data[target])
dtest = xgb.DMatrix(test_data[features], test_data[target])

watchlist = [(dtrain, 'train')]

params = {
    'n_estimators': 100,
    'max_depth': 3,
    'eta': 0.1,
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',
    'subsample': 1.0,
    'colsample_bytree': 0.7,
#     'random_state': 1123,
#     'min_child_weight': 10
    #'scale_pos_weight':0.5
}

xgb_a = xgb.train(params, dtrain,
                  num_boost_round=250,
                  early_stopping_rounds=20,
                  evals=watchlist,
                  verbose_eval=10)


lgb_a = LGBMClassifier(n_estimators=200, max_depth=3, 
#                         num_leaves=50, 
#                        learning_rate=0.1, 
#                        seed=0, nthread=24, subsample=0.8, colsample_bytree=0.9, reg_lambda=0.005, 
                      )

cate_features = ['user_gender_id', 'user_occupation_id']

lgb_a.fit(train_data[features], train_data['is_trade'],
          #         eval_set=[(train_x.loc[train_x.clickDay>25,feature_group_A],train_x.loc[train_x.clickDay>25,['label']])],
          feature_name=features,
          categorical_feature=cate_features,
          verbose=50,
          )

lgb_predict_a = lgb_a.predict_proba(test_data[features])[:, 1]
xgb_predict_a = xgb_a.predict(dtest)

avg_predict = lgb_predict_a * 0.5 + xgb_predict_a * 0.5

test_data['predicted_score'] = avg_predict

test_data[['instance_id', 'predicted_score']].to_csv(
    '../submission/20180403.txt', index=False, sep=' ')


lgb_predict_a = lgb_a.predict_proba(train_data[features])[:, 1]
xgb_predict_a = xgb_a.predict(dtrain)

avg_predict = lgb_predict_a * 0.5 + xgb_predict_a * 0.5

loss_train = log_loss(train_data[target], avg_predict)

loss_train


[0]	train-logloss:0.605106
Will train until train-logloss hasn't improved in 20 rounds.
[10]	train-logloss:0.220046
[20]	train-logloss:0.12351
[30]	train-logloss:0.09672
[40]	train-logloss:0.089652
[50]	train-logloss:0.087607
[60]	train-logloss:0.086726
[70]	train-logloss:0.086145
[80]	train-logloss:0.085728
[90]	train-logloss:0.085389
[100]	train-logloss:0.085101
[110]	train-logloss:0.084864
[120]	train-logloss:0.084657
[130]	train-logloss:0.084481
[140]	train-logloss:0.084296
[150]	train-logloss:0.08413
[160]	train-logloss:0.083979
[170]	train-logloss:0.083833
[180]	train-logloss:0.083706
[190]	train-logloss:0.08358
[200]	train-logloss:0.083461
[210]	train-logloss:0.083364
[220]	train-logloss:0.083236
[230]	train-logloss:0.083117
[240]	train-logloss:0.082988
[249]	train-logloss:0.082905


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f6dcafdb898>>
Traceback (most recent call last):
  File "/home/user02/miniconda3/envs/gluon/lib/python3.6/site-packages/xgboost/core.py", line 368, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.082981004116996937

In [102]:
lgb_predict_a = lgb_a.predict_proba(train_data[features])[:, 1]
xgb_predict_a = xgb_a.predict(dtrain)

avg_predict = lgb_predict_a * 0.5 + xgb_predict_a * 0.5

loss_train = log_loss(train_data[target], lgb_predict_a)

loss_train

0.083246709860995124