In [1]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [2]:
# Load the pickled frame that holds every row (train and test) together with
# all engineered feature columns.
all_data_path = f'{feature_data_path}all_data_all_features_new_0512.pkl'
data = load_pickle(all_data_path)

In [3]:
# Label column the model is trained to predict.
target = 'is_trade'

# Model inputs: the list of feature columns and the subset of them that should
# be treated as categorical. Both are read from pickles addressed by a relative
# path (i.e. relative to the notebook's working directory).
categorical_feature = load_pickle('categorical_feature.pkl')
features = load_pickle('all_features_day_4567.pkl')

# Sanity check: how many features / categorical features were loaded.
(len(features), len(categorical_feature))

(253, 1)

In [4]:

# Split the combined frame into training rows and test rows using the
# is_trade column: rows with is_trade >= 0 are used for training, rows with
# is_trade == -2 are the ones to be scored.
#
# Earlier experiments (not used in this run) restricted the training rows to
# hour >= 1 (or to 01:30 onwards, blanking hour-level features for the first
# hour) or dropped days 5 and 6; the current run trains on every labelled row.
train_data = data.loc[data['is_trade'] >= 0]
test_data = data.loc[data['is_trade'] == -2]

# Drop the reference to the combined frame so its memory can be reclaimed.
# NOTE: after this, re-running this cell requires reloading `data` first.
data = None
gc.collect()

print(train_data.shape, test_data.shape, sep='\n')

(5369478, 573)
(1209768, 573)


In [5]:
from sklearn.metrics import log_loss
import lightgbm as lgb

# Fit a LightGBM binary classifier on every training row, then score the test
# rows and report the mean predicted probability over the test set.
lgb_train_data = lgb.Dataset(
    train_data[features], label=train_data[target], feature_name=features,  categorical_feature=categorical_feature)

gc.collect()

# Training configuration.
# A previous configuration differed only in these values:
#   learning_rate=0.03, min_data_in_leaf=100, min_sum_hessian_in_leaf=0.001,
#   min_gain_to_split=0.1, gpu_use_dp=True
param = {'application': 'binary',
         'metric': 'binary_logloss',

         'learning_rate': 0.1,

         # Tree shape.
         'max_depth': 5,
         'num_leaves': 20,

         # Split constraints.
         'min_data_in_leaf': 200,
         'min_sum_hessian_in_leaf': 0.01,
         'min_gain_to_split': 0.0,

         # Column / row subsampling (rows are re-sampled every iteration).
         'feature_fraction': 0.8,
         'bagging_fraction': 0.7,
         'bagging_freq': 1,

         'lambda_l2': 10,
         'max_bin': 63,

         # Train on the GPU using single-precision arithmetic.
         'device': 'gpu',
         'gpu_use_dp': False,

#          'num_threads': 16,
         }

# NOTE: the only evaluation set is the training data itself, so the
# binary_logloss values logged below are training loss, not a held-out estimate.
valid_sets = [lgb_train_data,]

bst = lgb.train(param, lgb_train_data, valid_sets=valid_sets, 
                categorical_feature=categorical_feature,
                num_boost_round=500, 
                verbose_eval=50,)


predict_test = bst.predict(test_data[features])

# Mean predicted probability over the test rows. np.mean works on the array
# directly instead of iterating it element by element with the builtin sum().
# (The same figure for the training rows can be obtained from
# bst.predict(train_data[features]) if needed.)
test_ctr = float(np.mean(predict_test))

test_ctr

[50]	training's binary_logloss: 0.069434
[100]	training's binary_logloss: 0.0669751
[150]	training's binary_logloss: 0.0664071
[200]	training's binary_logloss: 0.0660748
[250]	training's binary_logloss: 0.0658134
[300]	training's binary_logloss: 0.0656018
[350]	training's binary_logloss: 0.065412
[400]	training's binary_logloss: 0.0652294
[450]	training's binary_logloss: 0.0650547
[500]	training's binary_logloss: 0.0648963


0.037756352252505024

In [6]:
# Attach the model scores to the test rows and show the mean predicted score
# for each value of `hour`.
# `predict_test` already holds bst.predict(test_data[features]) from the
# training cell, so it is reused here rather than scoring the test set again.
test_data['predicted_score'] = predict_test

# The aggregated column keeps the name 'predicted_score'. The former
# .rename(columns={0: 'ctr'}) matched no column (there is no column labelled 0)
# and was a silent no-op, so it has been removed.
ctr = test_data.groupby(['hour'])['predicted_score'].mean().reset_index()
ctr

Unnamed: 0,hour,predicted_score
0,12,0.043579
1,13,0.04294
2,14,0.042111
3,15,0.042799
4,16,0.04329
5,17,0.044002
6,18,0.043936
7,19,0.043686
8,20,0.037848
9,21,0.031661


In [7]:
# Sum of the predicted scores for each value of `hour`.
# The aggregated column keeps the name 'predicted_score'. The former
# .rename(columns={0: 'ctr'}) matched no column and was a silent no-op (and
# 'ctr' would have been a misleading label for a sum), so it has been removed.
cnt = test_data.groupby(['hour'])['predicted_score'].sum().reset_index()
cnt

Unnamed: 0,hour,predicted_score
0,12,3637.002171
1,13,3924.773135
2,14,3936.142016
3,15,3904.629794
4,16,3713.780702
5,17,3265.279252
6,18,3335.432108
7,19,4426.603065
8,20,4867.479316
9,21,4410.533529


In [8]:
# Rank the features by the importance reported by the trained booster.
importance = pd.DataFrame({'importance_20': bst.feature_importance()})

# Keep `features` bound to the original list of column names: earlier cells
# index frames with it (train_data[features]), so rebinding it to a DataFrame
# here would break them on re-run. Use a separate frame for the names instead.
feature_names = pd.DataFrame({'features_20': list(features)})

# Use the full option key; the abbreviated pattern 'max_rows' is ambiguous on
# pandas versions that also define 'styler.render.max_rows'.
pd.set_option('display.max_rows', 500)

# Both frames share the default positional index, so the join pairs each
# feature name with its importance. After sorting, reset_index() keeps the
# feature's original position in an 'index' column.
merge = feature_names.join(importance)
merge = merge.sort_values(by=['importance_20'], ascending=False).reset_index()
merge

Unnamed: 0,index,features_20,importance_20
0,4,item_sales_level,294
1,142,hour_smooth_CTR,189
2,58,user_click_time_gap_after_global,188
3,151,user_gender_id_item_id_smooth_CTR,152
4,150,user_gender_id_shop_id_smooth_CTR,146
5,53,user_click_time_gap_after,139
6,156,user_occupation_id_shop_id_smooth_CTR,131
7,91,user_category2_label_click_time_gap_after_global,120
8,157,user_occupation_id_item_id_smooth_CTR,113
9,10,user_star_level,111
