# 排序模型

通过召回的操作，我们已经进行了问题规模的缩减，对于每个用户，选择出了 N 篇文章作为了候选集，并基于召回的候选集构建了与用户历史相关的特征，以及用户本身的属性特征，文章本身的属性特征，以及用户与文章之间的特征。

下面就是使用机器学习模型来对构造好的特征进行学习，然后对测试集进行预测，得到测试集中的每个候选集用户点击的概率，返回点击概率最大的 topk 个文章，作为最终的结果。

排序阶段选择了三个比较有代表性的排序模型，它们分别是：

1. LGB 的排序模型
2. LGB 的分类模型
3. 深度学习的分类模型 DIN

得到了最终的排序模型输出的结果之后，还选择了两种比较经典的模型集成的方法：

1. 输出结果加权融合
2. Staking（将模型的输出结果再使用一个简单模型进行预测）

In [1]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

## 1 读取排序特征

In [2]:
data_path = '../data/'
save_path = '../tmp_results/'
offline = False

In [3]:
# 重新读取数据的时候，发现 click_article_id 是一个浮点数，所以将其转换成int类型
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)

if offline:
    val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
    val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)
else:
    val_user_item_feats_df = None
    
tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)

# 做特征的时候为了方便，给测试集也打上了一个无效的标签，这里直接删掉就行
del tst_user_item_feats_df['label']

In [4]:
trn_user_item_feats_df

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_country,click_region,click_referrer_type,user_time_hob1,user_time_hob2,words_hbo,category_id,created_at_ts,words_count,is_cat_hab
0,0,36162,0.031281,7920000,43,0.031281,0.031281,0.031281,0.031281,0.838929,...,1,25,2,0.343715,0.992865,266.000000,43,1508177171000,205,0
1,0,3244,0.163423,34968000,9,0.163423,0.163423,0.163423,0.163423,1.288687,...,1,25,2,0.343715,0.992865,266.000000,1,1508220059000,153,0
2,1,63746,0.060468,37324000,14,0.060468,0.060468,0.060468,0.060468,0.659610,...,1,25,6,0.343618,0.992721,169.000000,133,1508142585000,162,0
3,1,63795,0.105641,897477000,52,0.105641,0.105641,0.105641,0.105641,0.439456,...,1,25,6,0.343618,0.992721,169.000000,133,1507282432000,228,0
4,2,168564,0.619400,169421000,71,0.619400,0.619400,0.619400,0.619400,0.617176,...,1,25,2,0.343651,0.992020,210.000000,297,1508007750000,276,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291030,199997,225499,0.825229,377348000,25,0.825229,0.825229,0.825229,0.825229,2.027832,...,1,16,1,0.019385,0.989068,174.500000,354,1506636615000,181,0
291031,199998,236648,0.137503,31230000,11,0.137503,0.137503,0.137503,0.137503,0.946671,...,1,25,5,0.152570,0.990487,189.475000,375,1508104674000,184,0
291032,199998,260604,-0.009700,411590000,53,-0.009700,-0.009700,-0.009700,-0.009700,0.957341,...,1,25,5,0.152570,0.990487,189.475000,395,1507661854000,226,0
291033,199999,218355,0.155957,259000,111,0.155957,0.155957,0.155957,0.155957,0.742256,...,1,13,1,0.152674,0.990746,200.272727,352,1508155745000,202,0


---

## 2 返回排序后的结果

In [27]:
def submit(recall_df: pd.DataFrame, topk: int = 5, model_name: str = None) -> None:
    """
    提交推荐结果，生成提交格式的 DataFrame 并保存为 CSV 文件。

    Args:
        recall_df (`pd.DataFrame`): 包含用户推荐结果的 DataFrame，必须包含 'user_id' 和 'pred_score' 列。
        topk (`int`, optional): 每个用户推荐的文章数量，默认为 5。
        model_name (`str`, optional): 模型名称，用于保存文件时的命名。

    Returns:
        `None`: 该函数没有返回值，但会将结果保存为 CSV 文件。
    """
    # 根据用户 ID 和预测分数排序
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'], ascending=[True, False])
    
    # 为每个用户的推荐结果打排名
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 检查每个用户至少有 topk 篇文章
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk, f"Some users do not have at least {topk} articles."
    
    # 删除预测分数列
    del recall_df['pred_score']
    
    # 选择排名在 topk 以内的文章并调整 DataFrame 格式
    submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
    
    # 转换列名，去掉多层索引
    submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
    
    # 按照提交格式定义列名
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', 
                                     3: 'article_3', 4: 'article_4', 5: 'article_5'})
    
    # 保存结果为 CSV 文件
    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)  # 保存不包含索引，包含标题

In [28]:
def norm_sim(sim_df: pd.Series, weight: float = 0.0) -> pd.Series:
    """
    对相似度得分进行归一化处理。

    Args:
        sim_df (`pd.Series`): 输入的相似度得分 Series。
        weight (`float`, optional): 归一化后加上的权重，默认为 0.0。

    Returns:
        `pd.Series`: 归一化后的相似度得分 Series。
    """
    
    # 获取相似度得分的最小值和最大值
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    
    # 如果最大值等于最小值，则所有得分都设置为 1.0
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        # 否则进行线性归一化处理
        sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))

    # 将指定的权重加到归一化后的相似度得分上
    sim_df = sim_df.apply(lambda sim: sim + weight)  # 加上权重

    return sim_df  # 返回归一化后的相似度得分

---

## 3 LGB 排序模型

In [29]:
# 防止中间出错之后重新读取数据
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()
    
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()

In [30]:
# 定义特征列
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', 
            'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
            'click_environment','click_deviceGroup', 'click_os', 'click_country', 
            'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'words_hbo', 'category_id', 'created_at_ts','words_count']

In [31]:
# 排序模型分组
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

if offline:
    val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values

In [32]:
# 排序模型定义
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  

In [33]:
trn_user_item_feats_df

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_country,click_region,click_referrer_type,user_time_hob1,user_time_hob2,words_hbo,category_id,created_at_ts,words_count,is_cat_hab
0,0,36162,0.031281,7920000,43,0.031281,0.031281,0.031281,0.031281,0.838929,...,1,25,2,0.343715,0.992865,266.000000,43,1508177171000,205,0
1,0,3244,0.163423,34968000,9,0.163423,0.163423,0.163423,0.163423,1.288687,...,1,25,2,0.343715,0.992865,266.000000,1,1508220059000,153,0
2,1,63746,0.060468,37324000,14,0.060468,0.060468,0.060468,0.060468,0.659610,...,1,25,6,0.343618,0.992721,169.000000,133,1508142585000,162,0
3,1,63795,0.105641,897477000,52,0.105641,0.105641,0.105641,0.105641,0.439456,...,1,25,6,0.343618,0.992721,169.000000,133,1507282432000,228,0
4,2,168564,0.619400,169421000,71,0.619400,0.619400,0.619400,0.619400,0.617176,...,1,25,2,0.343651,0.992020,210.000000,297,1508007750000,276,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291030,199997,225499,0.825229,377348000,25,0.825229,0.825229,0.825229,0.825229,2.027832,...,1,16,1,0.019385,0.989068,174.500000,354,1506636615000,181,0
291031,199998,236648,0.137503,31230000,11,0.137503,0.137503,0.137503,0.137503,0.946671,...,1,25,5,0.152570,0.990487,189.475000,375,1508104674000,184,0
291032,199998,260604,-0.009700,411590000,53,-0.009700,-0.009700,-0.009700,-0.009700,0.957341,...,1,25,5,0.152570,0.990487,189.475000,395,1507661854000,226,0
291033,199999,218355,0.155957,259000,111,0.155957,0.155957,0.155957,0.155957,0.742256,...,1,13,1,0.152674,0.990746,200.272727,352,1508155745000,202,0


In [12]:
# 排序模型训练
if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
                eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
    lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4124
[LightGBM] [Info] Number of data points in the train set: 291035, number of used features: 23


In [13]:
# 模型预测
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

# 将这里的排序结果保存一份，用户后面的模型融合
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)

In [14]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')

In [15]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id','label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 训练集与验证集的用户分组
    train_idx.sort_values(by=['user_id'], inplace=True)
    g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    valid_idx.sort_values(by=['user_id'], inplace=True)
    g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values
    
    # 定义模型
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)  
    # 训练模型
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], 
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ])
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    
    # 对输出结果进行归一化
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])

# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均，将预测的 score 和对应的 rank 特征保存，可以用于后面的 staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4133
[LightGBM] [Info] Number of data points in the train set: 232954, number of used features: 23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009632 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4131
[LightGBM] [Info] Number of data points in the train set: 232735, number of used features: 23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4133
[LightGBM] [Info] Number of data points in the train set: 232931, number of used features: 23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009379 seconds.
You can set `force_col_wis

In [None]:
# 预测结果重新排序, 及生成提交结果
# 单模型生成提交结果
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')

---

## 4 LGB 分类模型

In [16]:
# 模型及参数的定义
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  

In [17]:
# 模型训练
if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
                    eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], 
                    eval_metric=['auc', ],early_stopping_rounds=50, )
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])

[LightGBM] [Info] Number of positive: 64190, number of negative: 226845
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.046175
[LightGBM] [Debug] init for col-wise cost 0.000006 seconds, init for row-wise cost 0.009883 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4125
[LightGBM] [Info] Number of data points in the train set: 291035, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.220558 -> initscore=-1.262420
[LightGBM] [Info] Start training from score -1.262420
[LightGBM] [Debug] Re-bagging, using 203724 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[LightGBM] [Debug] Re-bagging, using 203329 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Re-bagging, using 203611 data to train
[LightGBM] [Debug] Tr

In [18]:
# 模型预测
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]

# 将这里的排序结果保存一份，用户后面的模型融合
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)

In [19]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')

In [20]:
# 五折交叉验证，这里的五折交叉是以用户为目标进行五折划分
#  这一部分与前面的单独训练和验证是分开的
def get_kfold_users(trn_df, n=5):
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# 五折交叉验证，并将中间结果保存用于staking
for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
    
    # 模型及参数的定义
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)  
    # 训练模型
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], 
                          eval_metric=['auc', ])
    
    # 预测验证集结果
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], 
                                                              num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
    # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化
    # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
    
    valid_idx.sort_values(by=['user_id', 'pred_score'])
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
    
    # 将验证集的预测结果放到一个列表中，后面进行拼接
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
    
    # 如果是线上测试，需要计算每次交叉验证的结果相加，最后求平均
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], 
                                                     num_iteration=lgb_Classfication.best_iteration_)[:,1]
    
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# 保存训练集交叉验证产生的新特征
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)
    
# 测试集的预测结果，多次交叉验证求平均，将预测的 score 和对应的 rank 特征保存，可以用于后面的 staking，这里还可以构造其他更多的特征
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# 保存测试集交叉验证的新特征
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)

[LightGBM] [Info] Number of positive: 51564, number of negative: 181390
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.046228
[LightGBM] [Debug] init for col-wise cost 0.000005 seconds, init for row-wise cost 0.007199 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4132
[LightGBM] [Info] Number of data points in the train set: 232954, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221348 -> initscore=-1.257826
[LightGBM] [Info] Start training from score -1.257826
[LightGBM] [Debug] Re-bagging, using 163213 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Re-bagging, using 162840 data to train
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Re-bagging, using 162810 data to train
[LightGBM] [Debug] Trai

In [21]:
# 预测结果重新排序, 及生成提交结果
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')

---

## 5 DIN 模型

### 5.1 用户的历史点击行为列表

这个是为后面的DIN模型服务的。

In [34]:
if offline:
    all_data = pd.read_csv('../data/train_click_log.csv')
else:
    trn_data = pd.read_csv('../data/train_click_log.csv')
    tst_data = pd.read_csv('../data/testA_click_log.csv')
    all_data = pd.concat([trn_data, tst_data], ignore_index=True)

In [35]:
hist_click = all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_click['user_id']
his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']

In [36]:
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else: 
    val_user_item_feats_df_din_model = None
    
tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()

In [37]:
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

In [21]:
trn_user_item_feats_df_din_model

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_country,click_region,click_referrer_type,user_time_hob1,user_time_hob2,words_hbo,category_id,created_at_ts,words_count,is_cat_hab
0,0,36162,0.031281,7920000,43,0.031281,0.031281,0.031281,0.031281,0.838929,...,1,25,2,0.343715,0.992865,266.000000,43,1508177171000,205,0
1,0,3244,0.163423,34968000,9,0.163423,0.163423,0.163423,0.163423,1.288687,...,1,25,2,0.343715,0.992865,266.000000,1,1508220059000,153,0
2,1,63746,0.060468,37324000,14,0.060468,0.060468,0.060468,0.060468,0.659610,...,1,25,6,0.343618,0.992721,169.000000,133,1508142585000,162,0
3,1,63795,0.105641,897477000,52,0.105641,0.105641,0.105641,0.105641,0.439456,...,1,25,6,0.343618,0.992721,169.000000,133,1507282432000,228,0
4,2,168564,0.619400,169421000,71,0.619400,0.619400,0.619400,0.619400,0.617176,...,1,25,2,0.343651,0.992020,210.000000,297,1508007750000,276,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291030,199997,225499,0.825229,377348000,25,0.825229,0.825229,0.825229,0.825229,2.027832,...,1,16,1,0.019385,0.989068,174.500000,354,1506636615000,181,0
291031,199998,236648,0.137503,31230000,11,0.137503,0.137503,0.137503,0.137503,0.946671,...,1,25,5,0.152570,0.990487,189.475000,375,1508104674000,184,0
291032,199998,260604,-0.009700,411590000,53,-0.009700,-0.009700,-0.009700,-0.009700,0.957341,...,1,25,5,0.152570,0.990487,189.475000,395,1507661854000,226,0
291033,199999,218355,0.155957,259000,111,0.155957,0.155957,0.155957,0.155957,0.742256,...,1,13,1,0.152674,0.990746,200.272727,352,1508155745000,202,0


### 5.2 DIN 模型介绍

我们下面尝试使用 DIN 模型，DIN 的全称是 Deep Interest Network，这是阿里 2018 年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型，它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性，来计算用户兴趣的表示向量。

具体来说就是通过<font color=red>引入局部激活单元，通过软搜索历史行为的相关部分来关注相关的用户兴趣，并采用加权和来获得有关候选广告的用户兴趣的表示</font>。与候选广告相关性较高的行为会获得较高的激活权重，并支配着用户兴趣。

该表示向量在不同广告上有所不同，大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合，我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。

该模型的结构如下：

<img src="../image/din.png">

我们这里直接调包来使用这个模型，关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用。

deepctr 的函数原型如下：

> def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False, dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation="dice", att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024, task='binary'):

- `dnn_feature_columns`：特征列，包含数据所有特征的列表
- `history_feature_list`：用户历史行为列，反应用户历史行为的特征的列表
- `dnn_use_bn`：是否使用 BatchNormalization
- `dnn_hidden_units`：全连接层网络的层数和每一层神经元的个数， 一个列表或者元组
- `dnn_activation_relu`：全连接网络的激活单元类型
- `att_hidden_size`：注意力层的全连接网络的层数和每一层神经元的个数
- `att_activation`：注意力层的激活单元类型
- `att_weight_normalization`：是否归一化注意力得分
- `l2_reg_dnn`：全连接网络的正则化系数
- `l2_reg_embedding`：embedding向量的正则化稀疏
- `dnn_dropout`：全连接网络的神经元的失活概率
- `task`：任务，可以是分类，也可是是回归

在具体使用的时候，我们必须要传入特征列和历史行为列，但是再传入之前，我们需要进行一下特征列的预处理。具体如下：

首先，我们要处理数据集得到数据，由于我们是基于用户过去的行为去预测用户是否点击当前文章，所以我们需要把数据的特征列划分成数值型特征、离散型特征和历史行为特征列三部分，对于每一部分，DIN 模型的处理会有不同。

#### 5.2.1 离散型特征

在我们的数据集中就是那些类别型的特征，比如 user_id 这种，这种类别型特征，我们首先要经过 Embedding 处理得到每个特征的低维稠密型表示，既然要经过 Embedding，那么我们就需要为每一列的类别特征的取值建立一个字典，并指明 Embedding 维度，所以在使用 deepctr 的 DIN 模型准备数据的时候，我们需要通过 SparseFeat 函数指明这些类别型特征，这个函数的传入参数就是列名，列的唯一取值（建立字典用）和 Embedding 维度。

#### 5.2.2 用户历史行为特征

比如文章 id，文章的类别等这种，同样的我们需要先经过 Embedding 处理。只不过和上面不一样的地方是，对于这种特征，我们在得到每个特征的 Embedding 表示之后，还需要通过一个 Attention_layer 计算用户的历史行为和当前候选文章的相关性以此得到当前用户的 Embedding 向量。

这个向量就可以基于当前的候选文章与用户过去点击过得历史文章的相似性的程度来反应用户的兴趣，并且随着用户的不同的历史点击来变化，去动态的模拟用户兴趣的变化过程。

这类特征对于每个用户都是一个历史行为序列，对于每个用户，历史行为序列长度会不一样，可能有的用户点击的历史文章多，有的点击的历史文章少，所以我们还需要把这个长度统一起来。

在为 DIN 模型准备数据的时候，我们首先要通过 SparseFeat 函数指明这些类别型特征，然后还需要通过 VarLenSparseFeat 函数再进行序列填充，使得每个用户的历史序列一样长，所以这个函数参数中会有个 maxlen，来指明序列的最大长度是多少。

#### 5.2.3 连续型特征列

对于连续型特征列，只需要用 DenseFeat 函数来指明列名和维度即可。

处理完特征列之后，我们把相应的数据与列进行对应，就得到了最后的数据。

下面根据具体的代码感受一下，逻辑是这样，首先我们需要写一个数据准备函数，在这里面就是根据上面的具体步骤准备数据，得到数据和特征列，然后就是建立 DIN 模型并训练，最后基于模型进行测试。

---

## 5.3 导包

In [2]:
# 导入deepctr
from deepctr.models import DIN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import * 
from tensorflow.keras import backend as K

from sklearn.preprocessing import MinMaxScaler

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [38]:
def get_din_feats_columns(df: pd.DataFrame, 
                          dense_fea: list, 
                          sparse_fea: list, 
                          behavior_fea: list, 
                          his_behavior_fea: list, 
                          emb_dim: int = 32, 
                          max_len: int = 100) -> tuple:
    """
    数据准备函数，用于构建 Deep Interest Network (DIN) 的输入特征。

    Args:
        df (`pd.DataFrame`): 输入的数据集。
        dense_fea (`list`): 数值型特征列的名称列表。
        sparse_fea (`list`): 离散型特征列的名称列表。
        behavior_fea (`list`): 用户的候选行为特征列名称列表。
        his_behavior_fea (`list`): 用户的历史行为特征列名称列表。
        emb_dim (`int`, optional): embedding 的维度，默认为 32。
        max_len (`int`, optional): 用户序列的最大长度，默认为 100。

    Returns:
        `tuple`: 包含两个元素的元组：
            - `x`: 特征字典，包含模型输入特征。
            - `dnn_feature_columns`: DNN 特征列的列表。
    """
    # 构建离散特征列，添加 vocabulary_size 和 embedding_dim
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]
    
    # 构建密集特征列
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_fea]
    
    # 构建变长离散特征列
    var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,
                                    embedding_dim=emb_dim, embedding_name='click_article_id'), maxlen=max_len) for feat in his_behavior_fea]
    
    # 合并所有特征列
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    
    # 初始化特征字典
    x = {}
    for name in get_feature_names(dnn_feature_columns):
        if name in his_behavior_fea:
            # 处理历史行为序列，进行填充以满足最大长度
            his_list = [l for l in df[name]]
            x[name] = pad_sequences(his_list, maxlen=max_len, padding='post')  # 生成二维数组
        else:
            x[name] = df[name].values  # 直接提取特征列的值
    
    return x, dnn_feature_columns  # 返回特征字典和特征列列表

In [39]:
# 把特征分开
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', 
              'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']

behavior_fea = ['click_article_id']

hist_behavior_fea = ['hist_click_article_id']

dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
             'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',
             'words_hbo','words_count']

In [40]:
# dense特征进行归一化, 神经网络训练都需要将数值进行归一化处理
mm = MinMaxScaler()

# 下面是做一些特殊处理，当在其他的地方出现无效值的时候，不处理无法进行归一化，刚开始可以先把他注释掉
# 在运行了下面的代码之后如果发现报错，应该先去想办法处理如何不出现inf之类的值
# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)

for feat in dense_fea:
    trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])
    
    if val_user_item_feats_df_din_model is not None:
        val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])
    
    tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])

In [41]:
# 准备训练数据
x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, 
                                               sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_trn = trn_user_item_feats_df_din_model['label'].values

if offline:
    # 准备验证数据
    x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, 
                                                   sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
    y_val = val_user_item_feats_df_din_model['label'].values
    
dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, 
                                               sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)

In [26]:
def check_for_none(feature_columns: list) -> bool:
    """
    检查特征列中是否存在 None 值。

    Args:
        feature_columns (`list`): 特征列的列表，可以包含不同类型的特征对象。

    Returns:
        `bool`: 如果发现 None 值，返回 True；否则返回 False。
    """
    # 遍历所有特征
    for feature in feature_columns:
        if feature is None:
            print("Found None in feature columns.")
            return True  # 发现 None 值，返回 True
        
        # 进一步检查特征的属性是否为 None
        if isinstance(feature, SparseFeat):
            if feature.name is None or feature.vocabulary_size is None or feature.embedding_dim is None:
                print(f"Found None in SparseFeat properties: {feature}")
                return True  # SparseFeat 属性中发现 None 值
        elif isinstance(feature, DenseFeat):
            if feature.name is None or feature.dimension is None:
                print(f"Found None in DenseFeat properties: {feature}")
                return True  # DenseFeat 属性中发现 None 值
        elif isinstance(feature, VarLenSparseFeat):
            if feature.sparsefeat is None or feature.maxlen is None:
                print(f"Found None in VarLenSparseFeat properties: {feature}")
                return True  # VarLenSparseFeat 属性中发现 None 值

    print("No None values found in feature columns.")
    return False  # 没有发现 None 值，返回 False

In [None]:
# 调用检查函数
contains_none = check_for_none(dnn_feature_columns)  # 检查特征列中是否有 None 值

No None values found in feature columns.


In [None]:
dnn_feature_columns

[SparseFeat(name='user_id', vocabulary_size=50001, embedding_dim=32, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002CE901AB160>, embedding_name='user_id', group_name='default_group', trainable=True),
 SparseFeat(name='click_article_id', vocabulary_size=16427, embedding_dim=32, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002CE901AAC50>, embedding_name='click_article_id', group_name='default_group', trainable=True),
 SparseFeat(name='category_id', vocabulary_size=243, embedding_dim=32, use_hash=False, vocabulary_path=None, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.initializers_v1.RandomNormal object at 0x000002CE901AAE30>, embedding_name='category_id', group_name='default_group', trainable=True),
 SparseFeat(name='click_environmen

In [None]:
# 建立模型
model = DIN(dnn_feature_columns, behavior_fea)

# 查看模型结构
print(model.summary())

# 模型编译
model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])

In [36]:
print("Max index in x_trn:", x_trn.max())
print("Max index in y_trn:", y_trn.max())

AttributeError: 'dict' object has no attribute 'max'

In [None]:
# 模型训练
if offline:
    history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)
else:
    # 也可以使用上面的语句用自己采样出来的验证集
    # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)
    history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)

Epoch 1/2


InvalidArgumentError: Graph execution error:

Detected at node 'model/sparse_seq_emb_hist_click_article_id/embedding_lookup_2' defined at (most recent call last):
    File "f:\anaconda\anaconda3\envs\tf\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "f:\anaconda\anaconda3\envs\tf\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
      app.launch_new_instance()
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
      app.start()
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelapp.py", line 739, in start
      self.io_loop.start()
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tornado\platform\asyncio.py", line 205, in start
      self.asyncio_loop.run_forever()
    File "f:\anaconda\anaconda3\envs\tf\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "f:\anaconda\anaconda3\envs\tf\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "f:\anaconda\anaconda3\envs\tf\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue
      await self.process_one()
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one
      await dispatch(*args)
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell
      await result
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request
      await super().execute_request(stream, ident, parent)
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request
      reply_content = await reply_content
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute
      res = shell.run_cell(
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell
      return super().run_cell(*args, **kwargs)
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell
      result = self._run_cell(
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell
      result = runner(coro)
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner
      coro.send(None)
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "f:\anaconda\anaconda3\envs\tf\lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\dell\AppData\Local\Temp\ipykernel_15764\3012139636.py", line 7, in <module>
      history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\training.py", line 1187, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\training.py", line 857, in train_function
      return step_function(self, iterator)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\training.py", line 847, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\training.py", line 840, in run_step
      outputs = model.train_step(data)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\training.py", line 797, in train_step
      y_pred = self(x, training=True)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\functional.py", line 419, in call
      return self._run_internal_graph(
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\functional.py", line 555, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\dell\AppData\Roaming\Python\Python310\site-packages\tensorflow\python\keras\layers\embeddings.py", line 191, in call
      out = embedding_ops.embedding_lookup_v2(self.embeddings, inputs)
Node: 'model/sparse_seq_emb_hist_click_article_id/embedding_lookup_2'
indices[0,0] = 183530 is not in [0, 16427)
	 [[{{node model/sparse_seq_emb_hist_click_article_id/embedding_lookup_2}}]] [Op:__inference_train_function_4676]

: 

---

## 6 模型融合

### 6.1 加权融合

In [18]:
# 读取多个模型的排序结果文件
lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')
lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')
# din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')

# 这里也可以换成交叉验证输出的测试结果进行加权融合

In [19]:
rank_model = {'lgb_ranker': lgb_ranker, 
              'lgb_cls': lgb_cls}

In [34]:
def get_ensumble_predict_topk(rank_model: dict, topk: int = 5) -> None:
    """
    生成集成模型的前 K 个预测结果。

    Args:
        rank_model (`dict`): 包含多个模型的字典，必须包含 'lgb_cls'、'din_ranker' 和 'lgb_ranker'。
        topk (`int`, optional): 每个用户推荐的文章数量，默认为 5。

    Returns:
        `None`: 该函数没有返回值，但会调用 `submit` 函数保存结果。
    """
    # 合并 LightGBM 分类模型和 DIN 排序器的预测结果
    final_recall = rank_model['lgb_cls']
    # final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])
    
    # 对 LightGBM 排序器的预测分数进行归一化处理
    rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))
    
    # 合并所有模型的预测结果
    final_recall = pd.concat([final_recall, rank_model['lgb_ranker']])
    # final_recall = final_recall.append(rank_model['lgb_ranker'])
    
    # 按用户和文章 ID 分组，计算预测分数的总和
    final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()
    
    # 调用提交函数，将结果保存为 CSV 文件
    submit(final_recall, topk=topk, model_name='ensemble_fuse')

In [30]:
get_ensumble_predict_topk(rank_model)

ValueError: Transform function failed

### 6.2 Stacking

- 读取多个模型的交叉验证生成的结果文件

In [22]:
# 训练集
trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')
trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')
# trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')

# 测试集
tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')
tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')
# tst_din_cls_feats = pd.read_csv(save_path + 'tst_din_cls_feats.csv')

In [24]:
# 将多个模型输出的特征进行拼接
finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]
finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]

for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats]):
    for feat in [ 'pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_trn_ranker_feats[col_name] = trn_model[feat]

for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats]):
    for feat in [ 'pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_tst_ranker_feats[col_name] = tst_model[feat]

In [25]:
# 定义一个逻辑回归模型再次拟合交叉验证产生的特征对测试集进行预测
# 这里需要注意的是，在做交叉验证的时候可以构造多一些与输出预测值相关的特征，来丰富这里简单模型的特征
from sklearn.linear_model import LogisticRegression

feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1']

trn_x = finall_trn_ranker_feats[feat_cols]
trn_y = finall_trn_ranker_feats['label']

tst_x = finall_tst_ranker_feats[feat_cols]

# 定义模型
lr = LogisticRegression()

# 模型训练
lr.fit(trn_x, trn_y)

# 模型预测
finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]

In [28]:
# 预测结果重新排序, 及生成提交结果
rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='ensumble_staking')