In [1]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

from sklearn.metrics import log_loss
import lightgbm as lgb

In [2]:
# Load the pickled frame from the feature-data directory; later cells filter it
# on `is_trade`, `day` and `hour` and select the model's feature columns from it.
# NOTE(review): load_pickle comes from the project's utils module — presumably a
# thin pickle wrapper, so only point it at trusted files (confirm in utils).
all_data_path = feature_data_path + 'all_data_all_features_new_0512.pkl'
all_data = load_pickle(all_data_path)

In [3]:
# Name of the label column the binary model is trained to predict.
target = "is_trade"

# Feature-name list and the subset to treat as categorical, both read from
# pickles given by relative path.
features = load_pickle("all_features_day_4567.pkl")
categorical_feature = load_pickle("categorical_feature.pkl")

# Quick sanity check: how many names each list holds.
tuple(len(names) for names in (features, categorical_feature))

(253, 1)

In [4]:
# Keep only rows with is_trade >= 0 (presumably the labelled ones — confirm)
# and drop the reference to the full frame immediately so its memory can be
# reclaimed.
# (Earlier variants of this cell, since removed, restricted the data to day 7
# or excluded days 5 and 6.)
data = all_data.loc[all_data['is_trade'] >= 0]
all_data = None
gc.collect()

# Training rows: anything before day 7, plus anything before hour 10.
# Validation rows: day 7 from hour 10 onwards.
train_data = data.loc[(data['day'] < 7) | (data['hour'] < 10)]
test_data_1 = data.loc[(data['day'] == 7) & (data['hour'] >= 10)]

data = None
gc.collect()

# Wrap both splits as LightGBM datasets; the validation set is derived from the
# training set via create_valid.
lgb_train_data = lgb.Dataset(
    train_data[features],
    label=train_data[target],
    feature_name=features,
    categorical_feature=categorical_feature,
)
lgb_test_data_1 = lgb_train_data.create_valid(
    test_data_1[features],
    label=test_data_1[target],
)

# Row/column counts of the two splits.
train_data.shape, test_data_1.shape

((5099741, 573), (269737, 573))

### 取7号之前的全部数据以及10点之前的数据进行训练，以7号10点及之后的数据进行测试

In [6]:
# LightGBM settings: binary objective scored with binary log-loss, trained on
# the GPU device.
param = {
    'application': 'binary',
    'metric': 'binary_logloss',

    'learning_rate': 0.05,

    # tree size
    'max_depth': 5,
    'num_leaves': 20,

    # split constraints
    'min_data_in_leaf': 200,
    'min_sum_hessian_in_leaf': 0.001,
    'min_gain_to_split': 0.1,

    # feature / row subsampling
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,

    'lambda_l2': 10,
    'max_bin': 63,

    'device': 'gpu',
    'gpu_use_dp': True,
}

# Score both the training set and the hold-out set during training.
valid_sets = [lgb_train_data, lgb_test_data_1]

# Up to 4000 boosting rounds, logging every 20 rounds and stopping once the
# validation score has gone 300 rounds without improving.
# NOTE(review): the `early_stopping_rounds` and `verbose_eval` keyword arguments
# were removed from lgb.train in LightGBM 4.0; on that version the equivalent
# callbacks have to be passed instead.
bst = lgb.train(
    param,
    lgb_train_data,
    num_boost_round=4000,
    valid_sets=valid_sets,
    categorical_feature=categorical_feature,
    early_stopping_rounds=300,
    verbose_eval=20,
)

Training until validation scores don't improve for 300 rounds.
[20]	training's binary_logloss: 0.230852	valid_1's binary_logloss: 0.291315
[40]	training's binary_logloss: 0.113643	valid_1's binary_logloss: 0.199055
[60]	training's binary_logloss: 0.0782813	valid_1's binary_logloss: 0.17545
[80]	training's binary_logloss: 0.0675685	valid_1's binary_logloss: 0.16931
[100]	training's binary_logloss: 0.0643111	valid_1's binary_logloss: 0.167356
[120]	training's binary_logloss: 0.0631356	valid_1's binary_logloss: 0.166227
[140]	training's binary_logloss: 0.0625712	valid_1's binary_logloss: 0.16558
[160]	training's binary_logloss: 0.0622344	valid_1's binary_logloss: 0.165076
[180]	training's binary_logloss: 0.0619756	valid_1's binary_logloss: 0.164686
[200]	training's binary_logloss: 0.0617979	valid_1's binary_logloss: 0.164396
[220]	training's binary_logloss: 0.0616491	valid_1's binary_logloss: 0.16415
[240]	training's binary_logloss: 0.0615266	valid_1's binary_logloss: 0.163977
[260]	train

In [6]:
# Score both splits with the trained booster.
predict_train = bst.predict(train_data[features])
predict_test = bst.predict(test_data_1[features])

# Mean predicted probability per split. np.mean is vectorised; the built-in
# sum() previously used here walks the multi-million-element prediction array
# one Python float at a time.
train_ctr = float(np.mean(predict_train))
test_ctr = float(np.mean(predict_test))
train_ctr, test_ctr

(0.015202567101391316, 0.04391718330513906)

## 特征重要度

In [7]:
# Rank the model's features by the booster's importance scores, highest first.

# Use the full option name: the abbreviated 'max_rows' pattern becomes ambiguous
# on pandas versions that define more than one option ending in "max_rows".
pd.set_option('display.max_rows', 500)

importance = pd.DataFrame(bst.feature_importance())
importance.columns = ['importance_20']

# Put the names in a separate frame instead of rebinding `features`: the cells
# below still index with `features` as a plain list of column names, and
# overwriting it with a DataFrame breaks them on a top-to-bottom run.
feature_names = pd.DataFrame(list(features))
feature_names.columns = ['features_20']

# Both frames share the default 0..n-1 index, so the join pairs each name with
# its score by position; reset_index keeps that position as the `index` column.
merge = feature_names.join(importance)
merge = merge.sort_values(by=['importance_20'], ascending=False).reset_index()
merge

Unnamed: 0,index,features_20,importance_20
0,4,item_sales_level,1319
1,126,hour_smooth_CTR,1208
2,53,user_click_time_gap_after,1143
3,0,item_id,949
4,12,shop_id,942
5,134,user_gender_id_shop_id_smooth_CTR,929
6,135,user_gender_id_item_id_smooth_CTR,929
7,41,user_id_click_day_mean,871
8,140,user_occupation_id_shop_id_smooth_CTR,831
9,2,item_city_id,803


## 输出临时文件，用于组合

In [13]:
# Attach the booster's scores to the hold-out frame, then export the
# (instance_id, predicted_score) pairs as a space-delimited text file that is
# later combined with other outputs.
test_data_1['predicted_score'] = bst.predict(test_data_1[features])

test_data_1.loc[:, ['instance_id', 'predicted_score']].to_csv(
    'yym-0514-hour-1-step-003-loss-160395.txt',
    sep=' ',
    index=False,
)

In [7]:
# Same export as the previous cell, written under a different file name:
# score the hold-out frame and dump (instance_id, predicted_score) pairs as a
# space-delimited text file.
test_data_1['predicted_score'] = bst.predict(test_data_1[features])

test_data_1.loc[:, ['instance_id', 'predicted_score']].to_csv(
    'yym-0514-hour-2-step-005-loss-162316.txt',
    sep=' ',
    index=False,
)


In [None]:
# Pasted training-log line kept for reference (not executable code):
# [1497]	training's binary_logloss: 0.0589682	valid_1's binary_logloss: 0.162316   1780
    