# 排序模型

In [21]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

## 读取排序特征

In [22]:
# NOTE(review): these are hard-coded local Windows paths; adjust them for your environment.
data_path = 'D:/Desktop/competition/news_article_rs/data/'  # data directory (not used in the cells shown here)
save_path = 'D:/Desktop/competition/news_article_rs/res/'  # feature files are read from / results written to this directory
# makedirs also creates missing parent directories, and exist_ok avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(save_path, exist_ok=True)

In [23]:
# After the round trip through CSV, click_article_id is read back as a float,
# so it is cast to int right after loading.
trn_user_item_feats_df = (
    pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
    .astype({'click_article_id': int})
)
val_user_item_feats_df = None

# The test frame is loaded the same way; its 'label' column is dropped.
tst_user_item_feats_df = (
    pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
    .astype({'click_article_id': int})
    .drop(columns=['label'])
)

In [24]:
def submit(recall_df, topk=5, model_name=None):
    """Write the top-k articles per user to a submission CSV.

    Parameters
    ----------
    recall_df : pandas.DataFrame
        Must contain the columns 'user_id', 'click_article_id' and
        'pred_score'. The frame passed in is not modified.
    topk : int
        Number of articles to keep per user. The output has the columns
        user_id, article_1, ..., article_<topk>.
    model_name : str
        Prefix of the output file name, which is
        ``<model_name>_<MM-DD>.csv`` inside the module-level ``save_path``.

    Raises
    ------
    ValueError
        If ``model_name`` is not given.
    AssertionError
        If any user has fewer than ``topk`` candidate articles.
    """
    if model_name is None:
        raise ValueError("model_name is required: it is used to build the output file name")

    # sort_values returns a new frame, so the caller's DataFrame stays untouched.
    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    ranked['rank'] = ranked.groupby('user_id')['pred_score'].rank(ascending=False, method='first')

    # Every user must have at least topk candidates, otherwise the wide table would contain gaps.
    fewest_candidates = ranked.groupby('user_id')['rank'].max().min()
    assert fewest_candidates >= topk, "at least one user has fewer than topk candidate articles"

    # Keep only the columns that are needed so extra feature columns cannot leak into the pivot.
    top = ranked.loc[ranked['rank'] <= topk, ['user_id', 'click_article_id', 'rank']].copy()
    # rank() yields floats (1.0, 2.0, ...); integer ranks give clean column names for any topk.
    top['rank'] = top['rank'].astype(int)

    wide = top.set_index(['user_id', 'rank'])['click_article_id'].unstack('rank')
    # Column names follow the submission format: article_1 ... article_<topk>.
    wide.columns = ['article_{}'.format(position) for position in wide.columns]
    result = wide.reset_index()

    # os.path.join works whether or not save_path ends with a path separator.
    file_name = model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    result.to_csv(os.path.join(save_path, file_name), index=False, header=True)

In [25]:
# Min-max normalisation of ranking scores
def norm_sim(sim_df, weight=0.0):
    """Min-max scale a Series of scores to [0, 1] and add a constant offset.

    Parameters
    ----------
    sim_df : pandas.Series
        Scores to normalise (despite the name, a Series is expected).
    weight : float
        Constant added to every normalised score.

    Returns
    -------
    pandas.Series
        Same index and name as the input. If all scores are equal, every
        entry is ``1.0 + weight``.
    """
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        # Degenerate range: avoid dividing by zero and map every score to 1.0.
        normed = pd.Series(1.0, index=sim_df.index, name=sim_df.name)
    else:
        # Vectorised arithmetic gives the same values as a per-element apply without the Python-level loop.
        normed = 1.0 * (sim_df - min_sim) / (max_sim - min_sim)

    # Shift all scores by the requested offset.
    return normed + weight

## LGB排序模型

In [26]:
# Work on deep copies so the loaded frames do not have to be re-read from disk
# if a later cell fails half-way through.
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy(deep=True)
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy(deep=True)

In [27]:
# Inspect the available columns before choosing the model features.
trn_user_item_feats_df_rank_model.columns

Index(['user_id', 'click_article_id', 'sim0', 'time_diff0', 'word_diff0',
       'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank', 'label',
       'click_size', 'time_diff_mean_x', 'active_level', 'user_time_hob1',
       'user_time_hob2', 'words_hbo', 'category_id', 'created_at_ts',
       'words_count', 'user_num', 'time_diff_mean_y', 'hot_level',
       'is_cat_hab'],
      dtype='object')

In [28]:
# Feature columns fed to both LightGBM models.
lgb_cols = [
    'sim0',
    'time_diff0',
    'word_diff0',
    'sim_max',
    'sim_min',
    'sim_sum',
    'sim_mean',
    'score',
]

In [29]:
# Grouping for the ranking model: order the rows by user, then record how many
# labelled rows each user contributes.
trn_user_item_feats_df_rank_model = trn_user_item_feats_df_rank_model.sort_values(by=['user_id'])
g_train = trn_user_item_feats_df_rank_model.groupby('user_id')['label'].count().values

In [30]:
# K-fold cross validation where the folds are formed over users, not rows.
# This part is independent of the separate train/validation run used earlier.
def get_kfold_users(trn_df, n=5):
    """Split the distinct user ids of ``trn_df`` into ``n`` interleaved folds.

    Fold ``i`` holds every n-th distinct user id starting at position ``i``
    (in order of first appearance). Returns a list of ``n`` arrays.
    """
    distinct_users = trn_df['user_id'].unique()
    folds = []
    for offset in range(n):
        folds.append(distinct_users[offset::n])
    return folds

offline = False  # when False, every fold also scores the test set (online submission mode)
k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# K-fold cross validation over users; the out-of-fold predictions are collected for stacking.
for n_fold, valid_user in enumerate(user_set):
    in_valid_fold = trn_df['user_id'].isin(valid_user)
    # sort_values returns new frames, so the prediction columns added below are
    # written to independent objects rather than to slices of trn_df.
    train_idx = trn_df[~in_valid_fold].sort_values(by=['user_id'])
    valid_idx = trn_df[in_valid_fold].sort_values(by=['user_id'])

    # Query-group sizes: number of candidate rows per user, in the same
    # (sorted) user order as the rows handed to LightGBM.
    g_train = train_idx.groupby('user_id').size().values
    g_val = valid_idx.groupby('user_id').size().values

    # Model definition
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)
    # Training
    # NOTE(review): `early_stopping_rounds` as a fit() argument is, as far as I know, only
    # accepted by LightGBM < 4.0 (newer releases use callbacks) — confirm the installed version.
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val],
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=100, )

    # Out-of-fold predictions for the validation users
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

    # Min-max normalise the raw ranker output (over the whole fold, not per user)
    valid_idx['pred_score'] = norm_sim(valid_idx['pred_score'])

    # Per-user rank of each candidate; rank() does not need the rows to be pre-sorted by score.
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Collect this fold's predictions; they are concatenated after the loop
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # Online mode: accumulate the test-set predictions of every fold, averaged after the loop
    if not offline:
        # num_iteration must be passed by keyword: the second positional
        # parameter of predict() is raw_score, not num_iteration.
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols],
                                        num_iteration=lgb_ranker.best_iteration_)

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)

# Test-set prediction: average over the folds, then store the score and its per-user rank
# as features for later stacking (more features could be derived here).
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = norm_sim(tst_user_item_feats_df_rank_model['pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the new test-set features
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)

[1]	valid_0's ndcg@1: 0.99748	valid_0's ndcg@2: 0.99907	valid_0's ndcg@3: 0.99907	valid_0's ndcg@4: 0.99907	valid_0's ndcg@5: 0.99907
Training until validation scores don't improve for 100 rounds
[2]	valid_0's ndcg@1: 0.9979	valid_0's ndcg@2: 0.999225	valid_0's ndcg@3: 0.999225	valid_0's ndcg@4: 0.999225	valid_0's ndcg@5: 0.999225
[3]	valid_0's ndcg@1: 0.99884	valid_0's ndcg@2: 0.999572	valid_0's ndcg@3: 0.999572	valid_0's ndcg@4: 0.999572	valid_0's ndcg@5: 0.999572
[4]	valid_0's ndcg@1: 0.999	valid_0's ndcg@2: 0.999631	valid_0's ndcg@3: 0.999631	valid_0's ndcg@4: 0.999631	valid_0's ndcg@5: 0.999631
[5]	valid_0's ndcg@1: 0.99908	valid_0's ndcg@2: 0.99966	valid_0's ndcg@3: 0.99966	valid_0's ndcg@4: 0.99966	valid_0's ndcg@5: 0.99966
[6]	valid_0's ndcg@1: 0.9991	valid_0's ndcg@2: 0.999668	valid_0's ndcg@3: 0.999668	valid_0's ndcg@4: 0.999668	valid_0's ndcg@5: 0.999668
[7]	valid_0's ndcg@1: 0.9991	valid_0's ndcg@2: 0.999668	valid_0's ndcg@3: 0.999668	valid_0's ndcg@4: 0.999668	valid_0's nd

[16]	valid_0's ndcg@1: 0.99874	valid_0's ndcg@2: 0.999535	valid_0's ndcg@3: 0.999535	valid_0's ndcg@4: 0.999535	valid_0's ndcg@5: 0.999535
[17]	valid_0's ndcg@1: 0.99878	valid_0's ndcg@2: 0.99955	valid_0's ndcg@3: 0.99955	valid_0's ndcg@4: 0.99955	valid_0's ndcg@5: 0.99955
[18]	valid_0's ndcg@1: 0.99876	valid_0's ndcg@2: 0.999542	valid_0's ndcg@3: 0.999542	valid_0's ndcg@4: 0.999542	valid_0's ndcg@5: 0.999542
[19]	valid_0's ndcg@1: 0.9988	valid_0's ndcg@2: 0.999557	valid_0's ndcg@3: 0.999557	valid_0's ndcg@4: 0.999557	valid_0's ndcg@5: 0.999557
[20]	valid_0's ndcg@1: 0.9988	valid_0's ndcg@2: 0.999557	valid_0's ndcg@3: 0.999557	valid_0's ndcg@4: 0.999557	valid_0's ndcg@5: 0.999557
[21]	valid_0's ndcg@1: 0.99878	valid_0's ndcg@2: 0.99955	valid_0's ndcg@3: 0.99955	valid_0's ndcg@4: 0.99955	valid_0's ndcg@5: 0.99955
[22]	valid_0's ndcg@1: 0.99876	valid_0's ndcg@2: 0.999542	valid_0's ndcg@3: 0.999542	valid_0's ndcg@4: 0.999542	valid_0's ndcg@5: 0.999542
[23]	valid_0's ndcg@1: 0.99874	valid_

[32]	valid_0's ndcg@1: 0.99918	valid_0's ndcg@2: 0.999697	valid_0's ndcg@3: 0.999697	valid_0's ndcg@4: 0.999697	valid_0's ndcg@5: 0.999697
[33]	valid_0's ndcg@1: 0.99914	valid_0's ndcg@2: 0.999683	valid_0's ndcg@3: 0.999683	valid_0's ndcg@4: 0.999683	valid_0's ndcg@5: 0.999683
[34]	valid_0's ndcg@1: 0.99916	valid_0's ndcg@2: 0.99969	valid_0's ndcg@3: 0.99969	valid_0's ndcg@4: 0.99969	valid_0's ndcg@5: 0.99969
[35]	valid_0's ndcg@1: 0.99914	valid_0's ndcg@2: 0.999683	valid_0's ndcg@3: 0.999683	valid_0's ndcg@4: 0.999683	valid_0's ndcg@5: 0.999683
[36]	valid_0's ndcg@1: 0.99916	valid_0's ndcg@2: 0.99969	valid_0's ndcg@3: 0.99969	valid_0's ndcg@4: 0.99969	valid_0's ndcg@5: 0.99969
[37]	valid_0's ndcg@1: 0.99914	valid_0's ndcg@2: 0.999683	valid_0's ndcg@3: 0.999683	valid_0's ndcg@4: 0.999683	valid_0's ndcg@5: 0.999683
[38]	valid_0's ndcg@1: 0.99916	valid_0's ndcg@2: 0.99969	valid_0's ndcg@3: 0.99969	valid_0's ndcg@4: 0.99969	valid_0's ndcg@5: 0.99969
[39]	valid_0's ndcg@1: 0.99916	valid_0'

[48]	valid_0's ndcg@1: 0.99926	valid_0's ndcg@2: 0.999727	valid_0's ndcg@3: 0.999727	valid_0's ndcg@4: 0.999727	valid_0's ndcg@5: 0.999727
[49]	valid_0's ndcg@1: 0.99926	valid_0's ndcg@2: 0.999727	valid_0's ndcg@3: 0.999727	valid_0's ndcg@4: 0.999727	valid_0's ndcg@5: 0.999727
[50]	valid_0's ndcg@1: 0.99924	valid_0's ndcg@2: 0.99972	valid_0's ndcg@3: 0.99972	valid_0's ndcg@4: 0.99972	valid_0's ndcg@5: 0.99972
[51]	valid_0's ndcg@1: 0.99922	valid_0's ndcg@2: 0.999712	valid_0's ndcg@3: 0.999712	valid_0's ndcg@4: 0.999712	valid_0's ndcg@5: 0.999712
[52]	valid_0's ndcg@1: 0.99924	valid_0's ndcg@2: 0.99972	valid_0's ndcg@3: 0.99972	valid_0's ndcg@4: 0.99972	valid_0's ndcg@5: 0.99972
[53]	valid_0's ndcg@1: 0.99922	valid_0's ndcg@2: 0.999712	valid_0's ndcg@3: 0.999712	valid_0's ndcg@4: 0.999712	valid_0's ndcg@5: 0.999712
[54]	valid_0's ndcg@1: 0.99922	valid_0's ndcg@2: 0.999712	valid_0's ndcg@3: 0.999712	valid_0's ndcg@4: 0.999712	valid_0's ndcg@5: 0.999712
[55]	valid_0's ndcg@1: 0.99926	vali

[64]	valid_0's ndcg@1: 0.99908	valid_0's ndcg@2: 0.99966	valid_0's ndcg@3: 0.99966	valid_0's ndcg@4: 0.99966	valid_0's ndcg@5: 0.99966
[65]	valid_0's ndcg@1: 0.99908	valid_0's ndcg@2: 0.99966	valid_0's ndcg@3: 0.99966	valid_0's ndcg@4: 0.99966	valid_0's ndcg@5: 0.99966
[66]	valid_0's ndcg@1: 0.99906	valid_0's ndcg@2: 0.999653	valid_0's ndcg@3: 0.999653	valid_0's ndcg@4: 0.999653	valid_0's ndcg@5: 0.999653
[67]	valid_0's ndcg@1: 0.99908	valid_0's ndcg@2: 0.99966	valid_0's ndcg@3: 0.99966	valid_0's ndcg@4: 0.99966	valid_0's ndcg@5: 0.99966
[68]	valid_0's ndcg@1: 0.99906	valid_0's ndcg@2: 0.999653	valid_0's ndcg@3: 0.999653	valid_0's ndcg@4: 0.999653	valid_0's ndcg@5: 0.999653
[69]	valid_0's ndcg@1: 0.99906	valid_0's ndcg@2: 0.999653	valid_0's ndcg@3: 0.999653	valid_0's ndcg@4: 0.999653	valid_0's ndcg@5: 0.999653
[70]	valid_0's ndcg@1: 0.99908	valid_0's ndcg@2: 0.99966	valid_0's ndcg@3: 0.99966	valid_0's ndcg@4: 0.99966	valid_0's ndcg@5: 0.99966
[71]	valid_0's ndcg@1: 0.99908	valid_0's nd

In [31]:
# Re-rank the test predictions and write the single-model submission file.
rank_results = (
    tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
    .astype({'click_article_id': int})
)
submit(rank_results, topk=5, model_name='lgb_ranker')

## LGB分类模型

In [32]:
# K-fold cross validation where the folds are formed over users, not rows.
# This part is independent of the separate train/validation run used earlier.
# NOTE(review): this repeats the get_kfold_users definition from the ranking section.
def get_kfold_users(trn_df, n=5):
    """Split the distinct user ids of ``trn_df`` into ``n`` interleaved folds.

    Fold ``i`` holds every n-th distinct user id starting at position ``i``
    (in order of first appearance). Returns a list of ``n`` arrays.
    """
    distinct_users = trn_df['user_id'].unique()
    folds = []
    for offset in range(n):
        folds.append(distinct_users[offset::n])
    return folds

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# K-fold cross validation over users; the out-of-fold predictions are collected for stacking.
for n_fold, valid_user in enumerate(user_set):
    in_valid_fold = trn_df['user_id'].isin(valid_user)
    train_idx = trn_df[~in_valid_fold]
    # Explicit copy: prediction columns are added to this frame below, and they
    # should go to an independent object rather than to a slice of trn_df.
    valid_idx = trn_df[in_valid_fold].copy()

    # Model and hyper-parameters
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)
    # Training
    # NOTE(review): `early_stopping_rounds` as a fit() argument is, as far as I know, only
    # accepted by LightGBM < 4.0 (newer releases use callbacks) — confirm the installed version.
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])],
                          eval_metric=['auc', ],early_stopping_rounds=50, )

    # Out-of-fold predictions: probability of the positive class
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols],
                                                              num_iteration=lgb_Classfication.best_iteration_)[:,1]

    # The classifier already outputs probabilities, so the out-of-fold scores are not normalised.

    # Per-user rank of each candidate; rank() does not need the rows to be pre-sorted by score.
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Collect this fold's predictions; they are concatenated after the loop
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # Online mode: accumulate the test-set predictions of every fold, averaged after the loop
    # (`offline` is set in the LGB ranker cell)
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols],
                                                     num_iteration=lgb_Classfication.best_iteration_)[:,1]

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)

# Test-set prediction: average over the folds, then store the score and its per-user rank
# as features for later stacking (more features could be derived here).
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
# NOTE(review): the out-of-fold scores saved above are raw probabilities, while the test
# scores are min-max normalised here, so the two saved feature files are on different
# scales — confirm this is intended before using them together for stacking.
tst_user_item_feats_df_rank_model['pred_score'] = norm_sim(tst_user_item_feats_df_rank_model['pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the new test-set features
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)

[LightGBM] [Info] Number of positive: 15305, number of negative: 226124
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.010450
[LightGBM] [Debug] init for col-wise cost 0.000018 seconds, init for row-wise cost 0.007106 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 4854
[LightGBM] [Info] Number of data points in the train set: 241429, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.063393 -> initscore=-2.692904
[LightGBM] [Info] Start training from score -2.692904
[LightGBM] [Debug] Re-bagging, using 169126 data to train
[LightGBM] [Debug] Trained a tree with leaves = 21 and max_depth = 10
[1]	valid_0's auc: 0.993437	valid_0's binary_logloss: 0.227376
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Re-bagging, using 168785 data to train
[LightGBM] [

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[75]	valid_0's auc: 0.996924	valid_0's binary_logloss: 0.0917692
[LightGBM] [Debug] Re-bagging, using 169024 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[76]	valid_0's auc: 0.996935	valid_0's binary_logloss: 0.0910232
[LightGBM] [Debug] Re-bagging, using 168955 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 15
[77]	valid_0's auc: 0.996944	valid_0's binary_logloss: 0.0903215
[LightGBM] [Debug] Re-bagging, using 168953 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[78]	valid_0's auc: 0.996956	valid_0's binary_logloss: 0.0895853
[LightGBM] [Debug] Re-bagging, using 168792 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[79]	valid_0's auc: 0.996959	valid_0's binary_logloss: 0.0889027
[LightGBM] [Debug] Re-bagging, using 169009 data to train
[LightGBM] [Debug] Trained a tree w

[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[49]	valid_0's auc: 0.99616	valid_0's binary_logloss: 0.117164
[LightGBM] [Debug] Re-bagging, using 168871 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[50]	valid_0's auc: 0.996175	valid_0's binary_logloss: 0.11605
[LightGBM] [Debug] Re-bagging, using 168711 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[51]	valid_0's auc: 0.996199	valid_0's binary_logloss: 0.11493
[LightGBM] [Debug] Re-bagging, using 168577 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[52]	valid_0's auc: 0.996201	valid_0's binary_logloss: 0.113886
[LightGBM] [Debug] Re-bagging, using 169165 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[53]	valid_0's auc: 0.996213	valid_0's binary_logloss: 0.112798
[LightGBM] [Debug] Re-bagging, using 168701 data to train
[LightGBM] [Debug] Trained a tree with leav

[LightGBM] [Debug] Re-bagging, using 169043 data to train
[LightGBM] [Debug] Trained a tree with leaves = 26 and max_depth = 10
[25]	valid_0's auc: 0.995734	valid_0's binary_logloss: 0.151022
[LightGBM] [Debug] Re-bagging, using 169368 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[26]	valid_0's auc: 0.995754	valid_0's binary_logloss: 0.149054
[LightGBM] [Debug] Re-bagging, using 169056 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[27]	valid_0's auc: 0.995765	valid_0's binary_logloss: 0.147195
[LightGBM] [Debug] Re-bagging, using 168586 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[28]	valid_0's auc: 0.995831	valid_0's binary_logloss: 0.145372
[LightGBM] [Debug] Re-bagging, using 168712 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[29]	valid_0's auc: 0.995872	valid_0's binary_logloss: 0.143523
[LightGBM] [Debug] Re-bagging, using 169

[LightGBM] [Debug] Re-bagging, using 168765 data to train
[LightGBM] [Debug] Trained a tree with leaves = 21 and max_depth = 9
[4]	valid_0's auc: 0.994495	valid_0's binary_logloss: 0.209835
[LightGBM] [Debug] Re-bagging, using 168819 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[5]	valid_0's auc: 0.994449	valid_0's binary_logloss: 0.205194
[LightGBM] [Debug] Re-bagging, using 169450 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[6]	valid_0's auc: 0.994355	valid_0's binary_logloss: 0.201004
[LightGBM] [Debug] Re-bagging, using 168850 data to train
[LightGBM] [Debug] Trained a tree with leaves = 29 and max_depth = 12
[7]	valid_0's auc: 0.994817	valid_0's binary_logloss: 0.196786
[LightGBM] [Debug] Re-bagging, using 169208 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[8]	valid_0's auc: 0.99516	valid_0's binary_logloss: 0.193045
[LightGBM] [Debug] Re-bagging, using 169152 data 

[84]	valid_0's auc: 0.997115	valid_0's binary_logloss: 0.0850059
[LightGBM] [Debug] Re-bagging, using 169032 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[85]	valid_0's auc: 0.997144	valid_0's binary_logloss: 0.0843376
[LightGBM] [Debug] Re-bagging, using 168680 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 13
[86]	valid_0's auc: 0.997146	valid_0's binary_logloss: 0.083736
[LightGBM] [Debug] Re-bagging, using 169170 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 19
[87]	valid_0's auc: 0.997169	valid_0's binary_logloss: 0.0830731
[LightGBM] [Debug] Re-bagging, using 168914 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 20
[88]	valid_0's auc: 0.997189	valid_0's binary_logloss: 0.0824237
[LightGBM] [Debug] Re-bagging, using 168351 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 14
[89]	valid_0's auc: 0.9972	valid_0's

[LightGBM] [Debug] Re-bagging, using 169021 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[19]	valid_0's auc: 0.995467	valid_0's binary_logloss: 0.164074
[LightGBM] [Debug] Re-bagging, using 169107 data to train
[LightGBM] [Debug] Trained a tree with leaves = 28 and max_depth = 9
[20]	valid_0's auc: 0.995475	valid_0's binary_logloss: 0.161755
[LightGBM] [Debug] Re-bagging, using 168642 data to train
[LightGBM] [Debug] Trained a tree with leaves = 23 and max_depth = 10
[21]	valid_0's auc: 0.995469	valid_0's binary_logloss: 0.159535
[LightGBM] [Debug] Re-bagging, using 168845 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 15
[22]	valid_0's auc: 0.995592	valid_0's binary_logloss: 0.157445
[LightGBM] [Debug] Re-bagging, using 168800 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[23]	valid_0's auc: 0.995623	valid_0's binary_logloss: 0.155473
[LightGBM] [Debug] Re-bagging, using 1696

In [33]:
# Re-rank the test predictions and write the classifier's submission file.
rank_results = (
    tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
    .astype({'click_article_id': int})
)
submit(rank_results, topk=5, model_name='lgb_cls')

## 模型融合

In [34]:
# Load the per-model test-set ranking files written by the two cross-validation cells.
# NOTE(review): `lgb_ranker` was bound to the fitted LGBMRanker in the training cell;
# from here on the name refers to a DataFrame instead.
lgb_ranker = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')
lgb_cls = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')
# din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')

In [35]:
# Test-set score tables keyed by model name; the DIN ranker entry is currently disabled.
rank_model = {'lgb_ranker': lgb_ranker, 
              'lgb_cls': lgb_cls}
#               'din_ranker': din_ranker}

In [39]:
# Display the classifier's test-set scores as a quick sanity check.
lgb_cls

Unnamed: 0,user_id,click_article_id,pred_score,pred_rank
0,250000,160417,0.006458,5.0
1,250003,160417,0.001824,8.0
2,250004,160417,0.001229,4.0
3,250009,160417,0.001938,5.0
4,250010,160417,0.085297,7.0
...,...,...,...,...
1999995,299943,75295,0.000016,39.0
1999996,299943,272640,0.000016,40.0
1999997,299944,106009,0.000394,20.0
1999998,299959,139,0.000084,17.0


In [37]:
def get_ensumble_predict_topk(rank_model, topk=5):
    """Fuse the LightGBM classifier and ranker scores and write a submission.

    Parameters
    ----------
    rank_model : dict
        Must contain the keys 'lgb_cls' and 'lgb_ranker', each mapping to a
        DataFrame with the columns 'user_id', 'click_article_id' and
        'pred_score'.
    topk : int
        Number of articles per user passed on to ``submit``.

    The scores of the two models are summed per (user_id, click_article_id)
    pair and the result is handed to ``submit`` under the model name
    'ensemble_fuse'. Nothing is returned.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    final_recall = pd.concat([rank_model['lgb_cls'], rank_model['lgb_ranker']])
    final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()

    submit(final_recall, topk=topk, model_name='ensemble_fuse')

In [38]:
# Fuse the LightGBM ranker and classifier scores and write the ensemble submission file.
get_ensumble_predict_topk(rank_model)