In [38]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import gc

import os

import lightgbm as lgb

In [2]:
data_path = './datasets/'

In [3]:
train = pd.read_csv(os.path.join(data_path,'train.csv'))

In [4]:
test = pd.read_csv(os.path.join(data_path,'test.csv'))

特征字段	字段描述
id	用户行为id，唯一表示，无重复
date	行为时间，精确到秒
user_id	用户id
product	产品
campaign_id	活动id
webpage_id	网页id
product_category_id	产品类型id
user_group_id	用户所属群组id
gender	性别
age_level	年龄等级
user_depth	用户价值深度
var_1	匿名特征
isClick	是否点击，1为点击，0为未点击

In [5]:
train

Unnamed: 0,id,date,user_id,product,campaign_id,webpage_id,product_category_id,user_group_id,gender,age_level,user_depth,var_1,isClick
0,0,07-02 00:00,0,0,0,0,0,10.0,Female,4.0,3.0,0,0
1,1,07-02 00:00,1,0,1,1,1,8.0,Female,2.0,2.0,0,0
2,2,07-02 00:00,1,0,0,0,0,8.0,Female,2.0,2.0,0,0
3,3,07-02 00:00,2,1,0,0,2,3.0,Male,3.0,3.0,1,0
4,4,07-02 00:01,3,0,2,2,2,2.0,Male,2.0,3.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
391820,391820,07-06 23:59,44038,1,9,8,0,3.0,Male,3.0,2.0,1,0
391821,391821,07-06 23:59,135658,3,9,8,1,5.0,Male,5.0,3.0,1,0
391822,391822,07-06 23:59,135658,3,9,8,1,5.0,Male,5.0,3.0,1,0
391823,391823,07-06 23:59,39562,0,3,0,1,1.0,Male,1.0,3.0,0,0


In [6]:
test

Unnamed: 0,id,date,user_id,product,campaign_id,webpage_id,product_category_id,user_group_id,gender,age_level,user_depth,var_1
0,391825,07-07 00:00,94025,0,2,2,2,2.0,Male,2.0,3.0,1
1,391826,07-07 00:00,135659,0,2,2,2,10.0,Female,4.0,3.0,1
2,391827,07-07 00:00,135659,4,9,8,0,10.0,Female,4.0,3.0,1
3,391828,07-07 00:00,2569,3,9,8,1,3.0,Male,3.0,3.0,0
4,391829,07-07 00:00,53895,7,4,3,4,3.0,Male,3.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
71461,463286,07-07 23:59,88462,3,9,8,1,4.0,Male,4.0,3.0,0
71462,463287,07-07 23:59,126268,1,9,8,0,10.0,Female,4.0,3.0,1
71463,463288,07-07 23:59,108640,5,9,8,0,2.0,Male,2.0,3.0,0
71464,463289,07-07 23:59,108640,5,9,8,1,2.0,Male,2.0,3.0,0


In [7]:
data = pd.concat([train,test],ignore_index=True)

In [8]:
data

Unnamed: 0,id,date,user_id,product,campaign_id,webpage_id,product_category_id,user_group_id,gender,age_level,user_depth,var_1,isClick
0,0,07-02 00:00,0,0,0,0,0,10.0,Female,4.0,3.0,0,0.0
1,1,07-02 00:00,1,0,1,1,1,8.0,Female,2.0,2.0,0,0.0
2,2,07-02 00:00,1,0,0,0,0,8.0,Female,2.0,2.0,0,0.0
3,3,07-02 00:00,2,1,0,0,2,3.0,Male,3.0,3.0,1,0.0
4,4,07-02 00:01,3,0,2,2,2,2.0,Male,2.0,3.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
463286,463286,07-07 23:59,88462,3,9,8,1,4.0,Male,4.0,3.0,0,
463287,463287,07-07 23:59,126268,1,9,8,0,10.0,Female,4.0,3.0,1,
463288,463288,07-07 23:59,108640,5,9,8,0,2.0,Male,2.0,3.0,0,
463289,463289,07-07 23:59,108640,5,9,8,1,2.0,Male,2.0,3.0,0,


In [9]:
data['day_id'] = data['date'].apply(lambda x:int(x[3:5]))

In [10]:
data['minute_id']=data['date'].apply(lambda x:int(x[-5:-3])*60 + int(x[-2:]))

In [11]:
data

Unnamed: 0,id,date,user_id,product,campaign_id,webpage_id,product_category_id,user_group_id,gender,age_level,user_depth,var_1,isClick,day_id,minute_id
0,0,07-02 00:00,0,0,0,0,0,10.0,Female,4.0,3.0,0,0.0,2,0
1,1,07-02 00:00,1,0,1,1,1,8.0,Female,2.0,2.0,0,0.0,2,0
2,2,07-02 00:00,1,0,0,0,0,8.0,Female,2.0,2.0,0,0.0,2,0
3,3,07-02 00:00,2,1,0,0,2,3.0,Male,3.0,3.0,1,0.0,2,0
4,4,07-02 00:01,3,0,2,2,2,2.0,Male,2.0,3.0,1,0.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463286,463286,07-07 23:59,88462,3,9,8,1,4.0,Male,4.0,3.0,0,,7,1439
463287,463287,07-07 23:59,126268,1,9,8,0,10.0,Female,4.0,3.0,1,,7,1439
463288,463288,07-07 23:59,108640,5,9,8,0,2.0,Male,2.0,3.0,0,,7,1439
463289,463289,07-07 23:59,108640,5,9,8,1,2.0,Male,2.0,3.0,0,,7,1439


### 特征工程

#### 构建用户每天前后两次浏览行为之间的时间间隔及其衍生均值特征, 因为用户浏览时间往往和其是否点击具有相关性

In [12]:
data['minute_id'].shift(-1)

0            0.0
1            0.0
2            0.0
3            1.0
4            1.0
           ...  
463286    1439.0
463287    1439.0
463288    1439.0
463289    1439.0
463290       NaN
Name: minute_id, Length: 463291, dtype: float64

In [13]:
data.groupby(['user_id','day_id'])['minute_id'].apply(lambda x :x.shift(-1) -x)

0         NaN
1         0.0
2         2.0
3         NaN
4         NaN
         ... 
463286    NaN
463287    NaN
463288    0.0
463289    NaN
463290    NaN
Name: minute_id, Length: 463291, dtype: float64

In [14]:
data.groupby(['user_id','day_id'])['minute_id'].agg(lambda x :x.shift(-1) -x)

user_id  day_id
0        2                                              NaN
         6                                              NaN
1        2                                  [0.0, 2.0, nan]
2        2                                              NaN
         3         [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 6.0, nan]
                                     ...                   
150342   7                                       [0.0, nan]
150343   7                                              NaN
150344   7                                              NaN
150345   7                                              NaN
150346   7                                       [0.0, nan]
Name: minute_id, Length: 249625, dtype: object

In [15]:
data.groupby(['user_id','day_id'])['minute_id'].transform(lambda x :x.shift(-1) -x)

0         NaN
1         0.0
2         2.0
3         NaN
4         NaN
         ... 
463286    NaN
463287    NaN
463288    0.0
463289    NaN
463290    NaN
Name: minute_id, Length: 463291, dtype: float64

In [16]:
data['minute_id_diff'] = data.groupby(['user_id','day_id'])['minute_id'].transform(lambda x :x.shift(-1) -x)

In [17]:
data.groupby(['user_id','day_id'])['minute_id_diff'].transform('mean')

0           NaN
1           1.0
2           1.0
3           NaN
4           NaN
          ...  
463286      NaN
463287     34.0
463288    154.8
463289    154.8
463290     26.0
Name: minute_id_diff, Length: 463291, dtype: float64

In [18]:
data.groupby(['user_id','day_id'])['minute_id_diff'].agg('mean')

user_id  day_id
0        2              NaN
         6              NaN
1        2         1.000000
2        2              NaN
         3         1.142857
                     ...   
150342   7         0.000000
150343   7              NaN
150344   7              NaN
150345   7              NaN
150346   7         0.000000
Name: minute_id_diff, Length: 249625, dtype: float64

In [19]:
data.groupby(['user_id','day_id'])['minute_id_diff'].mean()

user_id  day_id
0        2              NaN
         6              NaN
1        2         1.000000
2        2              NaN
         3         1.142857
                     ...   
150342   7         0.000000
150343   7              NaN
150344   7              NaN
150345   7              NaN
150346   7         0.000000
Name: minute_id_diff, Length: 249625, dtype: float64

In [20]:
data['minute_id_diff_mean'] = data.groupby(['user_id','day_id'])['minute_id_diff'].transform('mean')

In [21]:
for col in ['user_id','product','campaign_id','webpage_id','product_category_id','user_group_id']:
    data['{}_count'.format(col)] = data.groupby(col)['minute_id'].transform('count')
    

In [22]:
data.columns

Index(['id', 'date', 'user_id', 'product', 'campaign_id', 'webpage_id',
       'product_category_id', 'user_group_id', 'gender', 'age_level',
       'user_depth', 'var_1', 'isClick', 'day_id', 'minute_id',
       'minute_id_diff', 'minute_id_diff_mean', 'user_id_count',
       'product_count', 'campaign_id_count', 'webpage_id_count',
       'product_category_id_count', 'user_group_id_count'],
      dtype='object')

In [23]:
ycol = 'isClick'
drop_list = [
    ycol,
    'id',
    'date'
]

features = [x for x in data.columns if x not in drop_list]

In [24]:
print("使用{} 个特征:{}".format(len(features),features))

使用20 个特征:['user_id', 'product', 'campaign_id', 'webpage_id', 'product_category_id', 'user_group_id', 'gender', 'age_level', 'user_depth', 'var_1', 'day_id', 'minute_id', 'minute_id_diff', 'minute_id_diff_mean', 'user_id_count', 'product_count', 'campaign_id_count', 'webpage_id_count', 'product_category_id_count', 'user_group_id_count']


In [25]:
# 下列特征转化为类别特征
categorical_feature = [
    'user_id',
    'product',
    'campaign_id',
    'webpage_id',
    'product_category_id',
    'user_group_id',
    'gender',
    'age_level',
    'user_depth',
]

for col in categorical_feature:
    data[col] = data[col].astype('category')

In [26]:
train = data[~data[ycol].isnull()]

In [27]:
test = data[data[ycol].isnull()]

In [28]:
train.shape

(391825, 23)

In [29]:
test.shape

(71466, 23)

In [30]:
del(data)

In [31]:
gc.collect()

0

#### 五折交叉验证训练模型

In [33]:
NFLOD =5
random_state = 2021
KF = StratifiedKFold(n_splits = NFLOD,shuffle=True,random_state=random_state)

params_lgb = {
    'boosting':'gbdt',
    'objective':'binary',
    'metric':'auc',
    'force_row_size':True,
    'random_state':random_state,
    'learning_rate':0.03,
    'max_depth':8,
    'num_leaves':40,
    'subsamples':0.8,
    'subsample_freq':3,
    'colsample_bytree':0.8,
    'n_jobs':-1,
    'verbose':-1
}

In [34]:
oof_lgb = np.zeros(len(train))

In [35]:
predictions_lgb = np.zeros(len(test))

In [36]:
df_importance_list = []

In [40]:
# 五折交叉验证
for fold_,(trn_idx,val_idx) in enumerate(KF.split(train[features],train[ycol])):
    print('------------fold{}-----------'.format(fold_ + 1))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features] , label=train.iloc[trn_idx][ycol])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=train.iloc[val_idx][ycol],reference=trn_data)
    
    clf_lgb = lgb.train(
        params = params_lgb,
        train_set = trn_data,
        valid_sets = [trn_data,val_data],
        valid_names = ('train','val'),
        num_boost_round = 50000,
        early_stopping_rounds = 200,
        verbose_eval = 100,
    )
    
    oof_lgb[val_idx] = clf_lgb.predict(train.iloc[val_idx][features],num_iteration=clf_lgb.best_iteration)
    predictions_lgb[:] += (clf_lgb.predict(test[features],num_iteration = clf_lgb.best_iteration)/ NFLOD)
    
    df_importance = pd.DataFrame({
        'column':features,
        'importance_split':clf_lgb.feature_importance(importance_type = 'split'),
        'importance_gain':clf_lgb.feature_importance(importance_type = 'gain')
    })
    
    df_importance_list.append(df_importance)

------------fold1-----------




Training until validation scores don't improve for 200 rounds
[100]	train's auc: 0.677174	val's auc: 0.628599
[200]	train's auc: 0.690672	val's auc: 0.63003
[300]	train's auc: 0.702212	val's auc: 0.631905
[400]	train's auc: 0.711116	val's auc: 0.632443
[500]	train's auc: 0.7182	val's auc: 0.632929
[600]	train's auc: 0.725516	val's auc: 0.632789
Early stopping, best iteration is:
[499]	train's auc: 0.718131	val's auc: 0.632934
------------fold2-----------
Training until validation scores don't improve for 200 rounds
[100]	train's auc: 0.679197	val's auc: 0.620851
[200]	train's auc: 0.693258	val's auc: 0.622213
[300]	train's auc: 0.703885	val's auc: 0.623473
[400]	train's auc: 0.714255	val's auc: 0.62404
[500]	train's auc: 0.720288	val's auc: 0.624115
[600]	train's auc: 0.726659	val's auc: 0.624428
[700]	train's auc: 0.732689	val's auc: 0.624295
[800]	train's auc: 0.73929	val's auc: 0.624619
[900]	train's auc: 0.744832	val's auc: 0.624663
[1000]	train's auc: 0.754675	val's auc: 0.62499
[

In [42]:
valid_auc_score = roc_auc_score(train[ycol],oof_lgb)

In [43]:
valid_auc_score

0.629614298701527

#### 特征重要性

In [49]:
df_features_importances= pd.concat(df_importance_list)

In [53]:
df_features_importance= df_features_importances.groupby('column').mean().reset_index()

In [54]:
df_features_importance

Unnamed: 0,column,importance_split,importance_gain
0,age_level,40.2,407.363945
1,campaign_id,565.6,17419.711442
2,campaign_id_count,1049.4,5384.477741
3,day_id,1194.6,9639.523684
4,gender,54.6,253.445407
5,minute_id,3935.8,22181.02733
6,minute_id_diff,3431.4,23767.593485
7,minute_id_diff_mean,3201.4,21643.519522
8,product,489.6,6321.783218
9,product_category_id,176.8,2965.427346


In [55]:
df_features_importance.sort_values('importance_gain',ascending=False)

Unnamed: 0,column,importance_split,importance_gain
15,user_id,3435.2,67579.315769
16,user_id_count,2032.4,31719.678195
6,minute_id_diff,3431.4,23767.593485
5,minute_id,3935.8,22181.02733
7,minute_id_diff_mean,3201.4,21643.519522
1,campaign_id,565.6,17419.711442
3,day_id,1194.6,9639.523684
11,product_count,1215.4,7685.000637
10,product_category_id_count,970.4,6571.430858
8,product,489.6,6321.783218


#### 预测

In [58]:
test.head()

Unnamed: 0,id,date,user_id,product,campaign_id,webpage_id,product_category_id,user_group_id,gender,age_level,...,day_id,minute_id,minute_id_diff,minute_id_diff_mean,user_id_count,product_count,campaign_id_count,webpage_id_count,product_category_id_count,user_group_id_count
391825,391825,07-07 00:00,94025,0,2,2,2,2.0,Male,2.0,...,7,0,,,4,163501,95973,95973,113812,137278.0
391826,391826,07-07 00:00,135659,0,2,2,2,10.0,Female,4.0,...,7,0,0.0,0.0,2,163501,95973,95973,113812,13779.0
391827,391827,07-07 00:00,135659,4,9,8,0,10.0,Female,4.0,...,7,0,,0.0,2,22479,35531,35531,133290,13779.0
391828,391828,07-07 00:00,2569,3,9,8,1,3.0,Male,3.0,...,7,0,,,9,109574,35531,35531,65865,140317.0
391829,391829,07-07 00:00,53895,7,4,3,4,3.0,Male,3.0,...,7,0,53.0,53.0,3,21452,28826,28826,81141,140317.0


In [59]:
test.columns

Index(['id', 'date', 'user_id', 'product', 'campaign_id', 'webpage_id',
       'product_category_id', 'user_group_id', 'gender', 'age_level',
       'user_depth', 'var_1', 'isClick', 'day_id', 'minute_id',
       'minute_id_diff', 'minute_id_diff_mean', 'user_id_count',
       'product_count', 'campaign_id_count', 'webpage_id_count',
       'product_category_id_count', 'user_group_id_count'],
      dtype='object')

In [60]:
test.loc[:,ycol] = predictions_lgb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [61]:
test.loc[:,ycol]

391825    0.082708
391826    0.084275
391827    0.046645
391828    0.050657
391829    0.126505
            ...   
463286    0.079529
463287    0.041204
463288    0.046967
463289    0.048984
463290    0.078428
Name: isClick, Length: 71466, dtype: float64

In [62]:
test[['id',ycol]]

Unnamed: 0,id,isClick
391825,391825,0.082708
391826,391826,0.084275
391827,391827,0.046645
391828,391828,0.050657
391829,391829,0.126505
...,...,...
463286,463286,0.079529
463287,463287,0.041204
463288,463288,0.046967
463289,463289,0.048984


In [64]:
test[['user_id','product','campaign_id',ycol]].to_csv('res.csv',index=False)