In [None]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

from sklearn.metrics import log_loss
import lightgbm as lgb

In [None]:
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

In [None]:
target = 'is_trade'

features = load_pickle('all_features_day_7.pkl')
categorical_feature = load_pickle('categorical_feature.pkl')

# 只使用原始特征
# features = load_pickle('original_features.pkl')
# categorical_feature = ['user_gender_id', 'user_occupation_id']

len(features), len(categorical_feature)

### 取7号上午数据进行训练和测试，以11点为分割

In [None]:
data = all_data[(all_data.day == 7) & (all_data.is_trade != -1)]
# train_data = data[(data.hour < 11)]
# test_data = data[data.hour >= 11]

train_data = data[(data.hour < 11) & (data.hour > 1)]
# train_data = data[(data.hour < 11) & (data.hour > 1) | ((data.hour == 1)&(data.minute >=30))]
test_data = data[data.hour >= 11]

# data = all_data[(all_data.is_trade != -1)]
# train_data = data[((data.day == 6) & (data.hour > 20)) | ((data.hour < 11) & (data.day == 7))]
# test_data = data[(data.hour >= 11) & (data.day == 7)]

lgb_train_data = lgb.Dataset(
    train_data[features], label=train_data[target], feature_name=features, categorical_feature=categorical_feature)
lgb_test_data = lgb_train_data.create_valid(
    test_data[features], label=test_data[target])

print(train_data.shape)
print(test_data.shape)

In [None]:
param = {'application': 'binary',
         'metric': 'binary_logloss',

         'learning_rate': 0.05,

         'max_depth': 5,
         'num_leaves': 18,

         'min_data_in_leaf': 100,
         'min_sum_hessian_in_leaf': 0.01,
         'min_gain_to_split': 0.1,

         'feature_fraction': 0.9,
         'bagging_fraction': 0.7,
         'bagging_freq': 1,

         'lambda_l2': 10,
         'max_bin': 63,

         'device': 'gpu',
         'gpu_use_dp': True,
         }


valid_sets = [lgb_train_data, lgb_test_data]

bst = lgb.train(param, lgb_train_data, valid_sets=valid_sets, 
                num_boost_round=2000, 
                early_stopping_rounds=300, 
                verbose_eval=50, 
                categorical_feature=categorical_feature
               )

In [28]:
predict_train = bst.predict(train_data[features])
predict_test = bst.predict(test_data[features])

train_ctr = float(sum(predict_train)/float(len(predict_train)))
test_ctr = float(sum(predict_test)/float(len(predict_test)))
train_ctr, test_ctr

(0.04577492183308594, 0.04448399618933612)

## 特征重要度

In [32]:
importance = pd.DataFrame(bst.feature_importance())
importance.columns = ['importance_20']
features = pd.DataFrame(features)
features.columns = ['features_20']
pd.set_option('max_rows',500)
merge = features.join(importance)
merge = merge.sort_values(by=['importance_20'], ascending=False).reset_index()
merge

Unnamed: 0,index,features_20,importance_20
0,76,user_click_time_gap_after,336
1,46,item_property_topic_k_10,276
2,127,user_age_level_item_sales_level_hour_CTR,269
3,158,item_id_smooth_CTR,217
4,14,shop_id,201
5,20,shop_score_description,198
6,0,item_id,195
7,7,user_id,182
8,244,item_id_click_hour,174
9,19,shop_score_delivery,172


## 输出临时文件，用于组合

In [9]:
test_data['predicted_score'] = bst.predict(test_data[features])

test_data[['instance_id', 'predicted_score']].to_csv(
    'day567-hour-2-depth-5-iter-1355-loss-163204.txt', index=False, sep=' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
