# 排序模型

通过召回的操作，我们已经进行了问题规模的缩减，对于每个用户，选择出了 N 篇文章作为了候选集，并基于召回的候选集构建了与用户历史相关的特征，以及用户本身的属性特征，文章本身的属性特征，以及用户与文章之间的特征。

下面就是使用机器学习模型来对构造好的特征进行学习，然后对测试集进行预测，得到测试集中的每个候选集用户点击的概率，返回点击概率最大的 topk 个文章，作为最终的结果。

排序阶段选择了三个比较有代表性的排序模型，它们分别是：

1. LGB 的排序模型
2. LGB 的分类模型
3. 深度学习的分类模型 DIN

得到了最终的排序模型输出的结果之后，还选择了两种比较经典的模型集成的方法：

1. 输出结果加权融合
2. Stacking（将模型的输出结果再使用一个简单模型进行预测）

In [1]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

## 1 读取排序特征

In [2]:
# Directory layout and run mode for this notebook.
data_path = './data/'  # raw data directory
feature_path = './tmp_results/feature/'  # user-item feature CSVs are read from here
rank_path = './tmp_results/rank/'  # model scores and submission files are written here
# True: also load the validation features and use them for evaluation / early stopping.
# False: only the train and test features are used.
offline = False

In [3]:
# click_article_id comes back from the CSV as a float column, so it is cast to int on load.
trn_user_item_feats_df = pd.read_csv(feature_path + 'trn_user_item_feats_df.csv').astype({'click_article_id': int})

# The validation features only exist in offline mode.
val_user_item_feats_df = None
if offline:
    val_user_item_feats_df = pd.read_csv(feature_path + 'val_user_item_feats_df.csv').astype({'click_article_id': int})

tst_user_item_feats_df = pd.read_csv(feature_path + 'tst_user_item_feats_df.csv').astype({'click_article_id': int})

# A dummy label was attached to the test set during feature engineering; drop it here.
tst_user_item_feats_df = tst_user_item_feats_df.drop(columns=['label'])

---

## 2 返回排序后的结果

In [59]:
def submit(recall_df: pd.DataFrame, topk: int = 5, model_name: str = None) -> None:
    """
    Build the submission table from scored candidates and save it as a CSV file.

    For every user the candidates are ranked by ``pred_score`` (descending, ties
    broken by row order) and the best ``topk`` are pivoted into the columns
    ``article_1`` ... ``article_<topk>``. The file is written to
    ``rank_path + model_name + '_' + <MM-DD> + '.csv'``.

    Args:
        recall_df (`pd.DataFrame`): Scored candidates. Must contain 'user_id' and
            'pred_score'; every remaining column is pivoted (the callers in this
            notebook pass 'user_id', 'click_article_id', 'pred_score').
            The caller's frame is not modified.
        topk (`int`, optional): Number of articles kept per user. Defaults to 5.
        model_name (`str`, optional): Prefix of the output file name. Falls back
            to 'submit' when omitted.

    Returns:
        `None`: The result is only written to disk.
    """
    if model_name is None:
        # Concatenating None into the file name would raise a TypeError.
        model_name = 'submit'

    # sort_values returns a new frame, so the columns added/removed below stay local.
    ranked = recall_df.sort_values(by=['user_id', 'pred_score'], ascending=[True, False])

    # Per-user rank: 1 is the highest score.
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Best-effort check only: users with fewer than topk candidates end up with
    # empty cells in the pivoted table instead of aborting the export.
    min_candidates = ranked.groupby('user_id').size().min()
    if min_candidates < topk:
        print(f"Warning: some users have fewer than {topk} candidates; their missing slots are left empty.")

    del ranked['pred_score']

    # Keep the top-k rows and pivot the rank into columns (one row per user).
    submit_df = ranked[ranked['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()

    # Flatten the two-level column index; the rank labels are floats (1.0, 2.0, ...).
    submit_df.columns = [int(col) if isinstance(col, (int, float)) else col
                         for col in submit_df.columns.droplevel(0)]

    # Name the columns for the submission format, for any topk (not only 5).
    column_names = {'': 'user_id'}
    column_names.update({k: f'article_{k}' for k in range(1, topk + 1)})
    submit_df = submit_df.rename(columns=column_names)

    save_name = rank_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    submit_df.to_csv(save_name, index=False, header=True)  # header row, no index column

In [66]:
def norm_sim(sim_df: pd.Series, weight: float = 0.0) -> pd.Series:
    """
    Min-max normalize a score Series and add a constant offset.

    Args:
        sim_df (`pd.Series`): Scores to normalize. Not modified.
        weight (`float`, optional): Constant added to every normalized score.
            Defaults to 0.0.

    Returns:
        `pd.Series`: Scores scaled to [0, 1] plus ``weight``, with the index and
        name of the input. When all scores are equal, every entry is
        ``1.0 + weight``.
    """
    min_sim = sim_df.min()
    max_sim = sim_df.max()

    if max_sim == min_sim:
        # Degenerate range: avoid dividing by zero and give every score 1.0.
        normed = pd.Series(1.0, index=sim_df.index, name=sim_df.name)
    else:
        # Vectorized linear scaling; same arithmetic as the per-element version.
        normed = 1.0 * (sim_df - min_sim) / (max_sim - min_sim)

    return normed + weight

---

## 3 LGB 排序模型

In [8]:
# Work on copies so that a failure further down does not force re-reading the CSV files.
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()

# The validation copy is only created in offline mode.
if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()
    
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()

In [9]:
# Print the column list, non-null counts and dtypes of the training features.
trn_user_item_feats_df_rank_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48383 entries, 0 to 48382
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user_id              48383 non-null  int64  
 1   click_article_id     48383 non-null  int64  
 2   sim0                 5291 non-null   float64
 3   time_diff0           5291 non-null   float64
 4   word_diff0           5291 non-null   float64
 5   sim_max              5291 non-null   float64
 6   sim_min              5291 non-null   float64
 7   sim_sum              48383 non-null  float64
 8   sim_mean             5291 non-null   float64
 9   score                35983 non-null  float64
 10  rank                 48383 non-null  int64  
 11  label                48383 non-null  float64
 12  click_size           48383 non-null  float64
 13  time_diff_mean       48383 non-null  float64
 14  active_level         48383 non-null  float64
 15  click_environment    48383 non-null 

In [10]:
# Feature columns fed to the LightGBM ranker and classifier.
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', 
            'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
            'click_environment','click_deviceGroup', 'click_os', 'click_country', 
            'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'words_hbo', 'category_id', 'created_at_ts','words_count']

In [11]:
# Query groups for the ranking model: rows are ordered by user_id and the group
# sizes are the number of candidate rows per user, in that same order.
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby('user_id')['label'].count().values

if offline:
    val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby('user_id')['label'].count().values

In [12]:
# Ranking model definition: gradient-boosted trees, 100 rounds at learning rate 0.01,
# with row (subsample) and column (colsample_bytree) sampling of 0.7 each.
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)

In [13]:
# Display the training feature frame.
trn_user_item_feats_df

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_country,click_region,click_referrer_type,user_time_hob1,user_time_hob2,words_hbo,category_id,created_at_ts,words_count,is_cat_hab
0,0,3244,,,,,,0.000000,,0.375211,...,1,25,2,0.343715,0.992865,266.000000,1,1508220059000,153,0
1,5,8603,,,,,,0.000000,,,...,1,25,2,0.343598,0.992789,226.000000,6,1491484758000,164,0
2,6,8017,,,,,,0.000000,,0.078042,...,1,2,1,0.343844,0.991332,188.500000,6,1515513946000,240,0
3,6,10243,,,,,,0.000000,,0.173251,...,1,2,1,0.343844,0.991332,188.500000,6,1507990586000,233,0
4,18,8609,,,,,,0.000000,,0.000000,...,1,13,2,0.343523,0.992784,197.500000,6,1506106322000,174,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48378,199958,8609,,,,,,0.000000,,0.000000,...,1,10,1,0.228723,0.990938,220.562500,6,1506106322000,174,0
48379,199961,1330,,,,,,0.000000,,0.000000,...,1,20,1,0.191252,0.991104,209.250000,1,1471301725000,187,0
48380,199975,598,0.449289,8.110598e+10,55.0,0.449289,0.449289,0.449289,0.449289,0.000000,...,1,25,2,0.178436,0.990914,197.736842,1,1426443309000,143,0
48381,199988,8710,,,,,,0.000000,,,...,6,28,2,0.019358,0.989220,251.500000,6,1519057619000,164,0


In [17]:
# Train the ranking model.
# The rows passed to fit() must be in the order that produced the group sizes
# (g_train / g_val), i.e. the user_id-sorted *_rank_model frames in both branches.
if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
                   eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])],
                   eval_group=[g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg'],
                   # Early stopping goes through the callback API; the
                   # `early_stopping_rounds` fit() argument is gone in LightGBM 4.x.
                   callbacks=[lgb.early_stopping(stopping_rounds=50)])
else:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3839
[LightGBM] [Info] Number of data points in the train set: 48383, number of used features: 23


In [24]:
# Score the test candidates with the trained ranker.
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

# Keep a copy of the scores on disk; they are used later for model fusion.
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(rank_path + 'lgb_ranker_score.csv', index=False)

In [27]:
# Re-rank the predictions per user and write the submission file.
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].astype({'click_article_id': int})
submit(rank_results, topk=5, model_name='lgb_ranker')

In [31]:
# Five-fold cross-validation, with the folds split by user.
# This part is independent of the single train/validation run above.
def get_kfold_users(trn_df, n=5):
    """Split the distinct user ids of `trn_df` into `n` interleaved folds.

    Users are taken in order of first appearance; fold `i` receives every
    n-th user starting at position `i`.
    """
    unique_users = trn_df['user_id'].unique()
    folds = []
    for offset in range(n):
        folds.append(unique_users[offset::n])
    return folds

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# Five-fold cross-validation by user; the out-of-fold predictions are kept for stacking.
for n_fold, valid_user in enumerate(user_set):
    is_valid_row = trn_df['user_id'].isin(valid_user)
    # sort_values returns new frames, so the columns added below never touch trn_df.
    train_idx = trn_df[~is_valid_row].sort_values(by=['user_id'])
    valid_idx = trn_df[is_valid_row].sort_values(by=['user_id'])

    # Query group sizes (rows per user) for the training and validation parts.
    g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values
    g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values

    # Model definition
    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                                max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                                learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs=16)
    # Model training
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group=[g_val],
                   eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg'])

    # Out-of-fold predictions, min-max normalized across the fold.
    valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    valid_idx['pred_score'] = norm_sim(valid_idx['pred_score'])

    # Per-user rank of each candidate (1 = highest score).
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Collect the fold results; they are concatenated after the loop.
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # Online mode: accumulate the test predictions of every fold and average them
    # afterwards. Without this step sub_preds stays all-zero and every test
    # candidate receives the same score.
    if not offline:
        sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols],
                                        num_iteration=lgb_ranker.best_iteration_)

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])

# Save the new features produced by cross-validation on the training set.
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(rank_path + 'trn_lgb_ranker_feats.csv', index=False)

# Test-set predictions: average over the folds, normalize, and derive the per-user rank.
# Score and rank are saved as stacking features; more features could be built here.
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = norm_sim(tst_user_item_feats_df_rank_model['pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the new cross-validation features of the test set.
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(rank_path + 'tst_lgb_ranker_feats.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3822
[LightGBM] [Info] Number of data points in the train set: 38772, number of used features: 23
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3828
[LightGBM] [Info] Number of data points in the train set: 38738, number of used features: 23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3823
[LightGBM] [Info] Number of data points in the train set: 38646, number of used features: 23
[Ligh

In [33]:
# Re-rank the cross-validated predictions per user and write the submission file
# (single-model submission).
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']].astype({'click_article_id': int})
submit(rank_results, topk=5, model_name='lgb_ranker')

---

## 4 LGB 分类模型

In [34]:
# Classification model and its parameters: same tree settings as the ranker,
# but 500 boosting rounds and verbose=10 (debug-level LightGBM logging).
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                            max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)

In [35]:
# Train the classification model. In offline mode the validation set is used
# for AUC monitoring and early stopping.
if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
                          eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])],
                          eval_metric=['auc'],
                          # Early stopping goes through the callback API; the
                          # `early_stopping_rounds` fit() argument is gone in LightGBM 4.x.
                          callbacks=[lgb.early_stopping(stopping_rounds=50)])
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])

[LightGBM] [Info] Number of positive: 835, number of negative: 47548
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.877411
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.381483
[LightGBM] [Debug] init for col-wise cost 0.002857 seconds, init for row-wise cost 0.006472 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 3839
[LightGBM] [Info] Number of data points in the train set: 48383, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017258 -> initscore=-4.042063
[LightGBM] [Info] Start training from score -4.042063
[LightGBM] [Debug] Re-bagging, using 33877 data to train
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 3
[LightGBM] [Debug] R

[LightGBM] [Debug] Trained a tree with leaves = 6 and depth = 4
[LightGBM] [Debug] Re-bagging, using 33979 data to train
[LightGBM] [Debug] Trained a tree with leaves = 6 and depth = 4
[LightGBM] [Debug] Re-bagging, using 33880 data to train
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 4
[LightGBM] [Debug] Re-bagging, using 33921 data to train
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 4
[LightGBM] [Debug] Re-bagging, using 33707 data to train
[LightGBM] [Debug] Trained a tree with leaves = 6 and depth = 5
[LightGBM] [Debug] Re-bagging, using 33842 data to train
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 5
[LightGBM] [Debug] Re-bagging, using 33987 data to train
[LightGBM] [Debug] Trained a tree with leaves = 7 and depth = 4
[LightGBM] [Debug] Re-bagging, using 33781 data to train
[LightGBM] [Debug] Trained a tree with leaves = 6 and depth = 4
[LightGBM] [Debug] Re-bagging, using 33703 data to train
[LightGBM] [Debug] Trained a tre

In [36]:
# Score the test candidates: column 1 of predict_proba is the probability of label 1.
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]

# Keep a copy of the scores on disk; they are used later for model fusion.
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(rank_path + 'lgb_cls_score.csv', index=False)

In [None]:
# Diagnostics: inspect the label distribution and the model input arrays.
# NOTE(review): `x` and `y` are not defined in the cells shown before this one —
# presumably the DIN input dict and label array built further down; confirm and
# run this cell only after they exist.
print("=" * 50)
print("数据诊断")
print("=" * 50)

# Label distribution: counts and percentages of positives/negatives, plus the value range.
print(f"\n标签分布:")
print(f"  正样本 (label=1): {(y == 1).sum()} ({(y == 1).mean() * 100:.2f}%)")
print(f"  负样本 (label=0): {(y == 0).sum()} ({(y == 0).mean() * 100:.2f}%)")
print(f"  标签范围: [{y.min()}, {y.max()}]")

# Input features: for every numpy array in `x`, report NaN/Inf counts for float
# arrays (or basic statistics when clean) and shape/dtype/range for other dtypes.
print(f"\n输入特征检查:")
for key, val in x.items():
    if isinstance(val, np.ndarray):
        if val.dtype in [np.float32, np.float64]:
            nan_count = np.isnan(val).sum()
            inf_count = np.isinf(val).sum()
            if nan_count > 0 or inf_count > 0:
                print(f"  {key}: ⚠️  NaN={nan_count}, Inf={inf_count}")
            else:
                print(f"  {key}: ✓ shape={val.shape}, min={val.min():.4f}, max={val.max():.4f}, mean={val.mean():.4f}")
        else:
            print(f"  {key}: ✓ shape={val.shape}, dtype={val.dtype}, min={val.min()}, max={val.max()}")

print("\n" + "=" * 50)


In [38]:
# Re-rank the classifier predictions per user and write the submission file.
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].astype({'click_article_id': int})
submit(rank_results, topk=5, model_name='lgb_cls')

In [39]:
# Five-fold cross-validation, with the folds split by user.
# This part is independent of the single train/validation run above.
def get_kfold_users(trn_df, n=5):
    """Split the distinct user ids of `trn_df` into `n` interleaved folds.

    Users are taken in order of first appearance; fold `i` receives every
    n-th user starting at position `i`.
    """
    unique_users = trn_df['user_id'].unique()
    folds = []
    for offset in range(n):
        folds.append(unique_users[offset::n])
    return folds

k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

# Five-fold cross-validation by user; the out-of-fold predictions are kept for stacking.
for n_fold, valid_user in enumerate(user_set):
    is_valid_row = trn_df['user_id'].isin(valid_user)
    # Explicit copies, so the columns added below never touch trn_df.
    train_idx = trn_df[~is_valid_row].copy()
    valid_idx = trn_df[is_valid_row].copy()

    # Model and parameters
    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                                           max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                                           learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs=16, verbose=10)
    # Model training
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],
                          eval_set=[(valid_idx[lgb_cols], valid_idx['label'])],
                          eval_metric=['auc'])

    # Out-of-fold predictions. The classifier already outputs probabilities,
    # so no normalization is applied.
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols],
                                                              num_iteration=lgb_Classfication.best_iteration_)[:, 1]

    # Per-user rank of each candidate (1 = highest probability).
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Collect the fold results; they are concatenated after the loop.
    score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # Online mode: accumulate the test predictions of every fold; averaged after the loop.
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols],
                                                     num_iteration=lgb_Classfication.best_iteration_)[:, 1]

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features produced by cross-validation on the training set.
# (rank_path is the output directory used by every other cell; `save_path` is undefined.)
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(rank_path + 'trn_lgb_cls_feats.csv', index=False)

# Test-set predictions: the fold average of the probabilities. They are left
# un-normalized so that they stay on the same scale as the out-of-fold training
# scores saved above; the per-user ranking is unaffected either way.
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

# Save the new cross-validation features of the test set.
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(rank_path + 'tst_lgb_cls_feats.csv', index=False)

[LightGBM] [Info] Number of positive: 677, number of negative: 38095
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.876921
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.381270
[LightGBM] [Debug] init for col-wise cost 0.001713 seconds, init for row-wise cost 0.003705 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002368 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 3822
[LightGBM] [Info] Number of data points in the train set: 38772, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017461 -> initscore=-4.030167
[LightGBM] [Info] Start training from score -4.030167
[LightGBM] [Debug] Re-bagging, using 27170 data to train
[LightGBM] [Debug] Trained a tree with leaves = 4 and depth = 3
[LightGBM] [Debug] R

In [40]:
# Re-rank the cross-validated classifier predictions per user and write the submission file.
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']].astype({'click_article_id': int})
submit(rank_results, topk=5, model_name='lgb_cls')

---

## 5 DIN 模型

### 5.1 用户的历史点击行为列表

这个是为后面的DIN模型服务的。

In [4]:
# Load the click logs that the user histories are built from. Offline mode uses
# the training log only; online mode appends the test log as well.
# Paths are derived from data_path instead of repeating the directory literal.
if offline:
    all_data = pd.read_csv(data_path + 'train_click_log.csv')
else:
    trn_data = pd.read_csv(data_path + 'train_click_log.csv')
    tst_data = pd.read_csv(data_path + 'testA_click_log.csv')
    all_data = pd.concat([trn_data, tst_data], ignore_index=True)

In [5]:
# Collect every user's clicked article ids into one list, in log row order.
# Aggregating the selected column with `list` gives a flat two-column frame;
# `.agg({list})` passed a set of functions and produced two-level column labels.
hist_click = all_data.groupby('user_id')['click_article_id'].agg(list).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_click['user_id']
his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']

In [6]:
# Separate copies of the feature frames for the DIN model, so that the edits
# below do not touch the frames used by the LightGBM models.
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()

# The validation frame only exists in offline mode.
if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else: 
    val_user_item_feats_df_din_model = None
    
tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()

In [7]:
# Attach each user's click-history list ('hist_click_article_id') to every candidate row.
# merge() uses its default inner join, so rows whose user_id is missing from
# his_behavior_df are dropped.
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

if offline:
    val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
else:
    val_user_item_feats_df_din_model = None

tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')

In [8]:
# Display the DIN training frame, now including the 'hist_click_article_id' column.
trn_user_item_feats_df_din_model

Unnamed: 0,user_id,click_article_id,sim0,time_diff0,word_diff0,sim_max,sim_min,sim_sum,sim_mean,score,...,click_region,click_referrer_type,user_time_hob1,user_time_hob2,words_hbo,category_id,created_at_ts,words_count,is_cat_hab,hist_click_article_id
0,0,3244,,,,,,0.000000,,0.375211,...,25,2,0.343715,0.992865,266.000000,1,1508220059000,153,0,"[30760, 157507]"
1,5,8603,,,,,,0.000000,,,...,25,2,0.343598,0.992789,226.000000,6,1491484758000,164,0,"[211442, 234481]"
2,6,8017,,,,,,0.000000,,0.078042,...,2,1,0.343844,0.991332,188.500000,6,1515513946000,240,0,"[62464, 10023]"
3,6,10243,,,,,,0.000000,,0.173251,...,2,1,0.343844,0.991332,188.500000,6,1507990586000,233,0,"[62464, 10023]"
4,18,8609,,,,,,0.000000,,0.000000,...,13,2,0.343523,0.992784,197.500000,6,1506106322000,174,0,"[70986, 224730]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48378,199958,8609,,,,,,0.000000,,0.000000,...,10,1,0.228723,0.990938,220.562500,6,1506106322000,174,0,"[70755, 111809, 158915, 293432, 36399, 70214, ..."
48379,199961,1330,,,,,,0.000000,,0.000000,...,20,1,0.191252,0.991104,209.250000,1,1471301725000,187,0,"[64329, 199198, 265993, 313431, 199197, 214129..."
48380,199975,598,0.449289,8.110598e+10,55.0,0.449289,0.449289,0.449289,0.449289,0.000000,...,25,2,0.178436,0.990914,197.736842,1,1426443309000,143,0,"[336476, 272143, 79454, 175213, 119193, 119046..."
48381,199988,8710,,,,,,0.000000,,,...,28,2,0.019358,0.989220,251.500000,6,1519057619000,164,0,"[156624, 160974]"


### 5.2 DIN 模型介绍

我们下面尝试使用 DIN 模型，DIN 的全称是 Deep Interest Network，这是阿里 2018 年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型，它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性，来计算用户兴趣的表示向量。

具体来说就是通过<font color=red>引入局部激活单元，通过软搜索历史行为的相关部分来关注相关的用户兴趣，并采用加权和来获得有关候选广告的用户兴趣的表示</font>。与候选广告相关性较高的行为会获得较高的激活权重，并支配着用户兴趣。

该表示向量在不同广告上有所不同，大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合，我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。

该模型的结构如下：

<img src="../image/din.png">

我们这里直接调包来使用这个模型，关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用。

deepctr 的函数原型如下：

> def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False, dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation="dice", att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024, task='binary'):

- `dnn_feature_columns`：特征列，包含数据所有特征的列表
- `history_feature_list`：用户历史行为列，反映用户历史行为的特征的列表
- `dnn_use_bn`：是否使用 BatchNormalization
- `dnn_hidden_units`：全连接层网络的层数和每一层神经元的个数， 一个列表或者元组
- `dnn_activation`：全连接网络的激活单元类型
- `att_hidden_size`：注意力层的全连接网络的层数和每一层神经元的个数
- `att_activation`：注意力层的激活单元类型
- `att_weight_normalization`：是否归一化注意力得分
- `l2_reg_dnn`：全连接网络的正则化系数
- `l2_reg_embedding`：embedding 向量的正则化系数
- `dnn_dropout`：全连接网络的神经元的失活概率
- `task`：任务，可以是分类，也可以是回归

在具体使用的时候，我们必须要传入特征列和历史行为列，但是再传入之前，我们需要进行一下特征列的预处理。具体如下：

首先，我们要处理数据集得到数据，由于我们是基于用户过去的行为去预测用户是否点击当前文章，所以我们需要把数据的特征列划分成数值型特征、离散型特征和历史行为特征列三部分，对于每一部分，DIN 模型的处理会有不同。

#### 5.2.1 离散型特征

在我们的数据集中就是那些类别型的特征，比如 user_id 这种，这种类别型特征，我们首先要经过 Embedding 处理得到每个特征的低维稠密型表示，既然要经过 Embedding，那么我们就需要为每一列的类别特征的取值建立一个字典，并指明 Embedding 维度，所以在使用 deepctr 的 DIN 模型准备数据的时候，我们需要通过 SparseFeat 函数指明这些类别型特征，这个函数的传入参数就是列名，列的唯一取值（建立字典用）和 Embedding 维度。

#### 5.2.2 用户历史行为特征

比如文章 id，文章的类别等这种，同样的我们需要先经过 Embedding 处理。只不过和上面不一样的地方是，对于这种特征，我们在得到每个特征的 Embedding 表示之后，还需要通过一个 Attention_layer 计算用户的历史行为和当前候选文章的相关性以此得到当前用户的 Embedding 向量。

这个向量就可以基于当前的候选文章与用户过去点击过的历史文章的相似性的程度来反映用户的兴趣，并且随着用户的不同的历史点击来变化，去动态地模拟用户兴趣的变化过程。

这类特征对于每个用户都是一个历史行为序列，对于每个用户，历史行为序列长度会不一样，可能有的用户点击的历史文章多，有的点击的历史文章少，所以我们还需要把这个长度统一起来。

在为 DIN 模型准备数据的时候，我们首先要通过 SparseFeat 函数指明这些类别型特征，然后还需要通过 VarLenSparseFeat 函数再进行序列填充，使得每个用户的历史序列一样长，所以这个函数参数中会有个 maxlen，来指明序列的最大长度是多少。

#### 5.2.3 连续型特征列

对于连续型特征列，只需要用 DenseFeat 函数来指明列名和维度即可。

处理完特征列之后，我们把相应的数据与列进行对应，就得到了最后的数据。

下面根据具体的代码感受一下，逻辑是这样，首先我们需要写一个数据准备函数，在这里面就是根据上面的具体步骤准备数据，得到数据和特征列，然后就是建立 DIN 模型并训练，最后基于模型进行测试。

---

### 5.3 导包

In [9]:
# 导入deepctr
from deepctr.models import DIN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import * 
from tensorflow.keras import backend as K

from sklearn.preprocessing import MinMaxScaler

# Compatibility shim: make sure `input_lib.DistributedDatasetInterface` exists.
# If the attribute is missing it is aliased to `DistributedDataset` when that
# exists, otherwise to an empty placeholder class.
# NOTE(review): presumably needed because code imported above still references
# the old name on newer TensorFlow releases — confirm against the installed versions.
try:
    from tensorflow.python.distribute import input_lib
    if not hasattr(input_lib, 'DistributedDatasetInterface'):
        if hasattr(input_lib, 'DistributedDataset'):
            input_lib.DistributedDatasetInterface = input_lib.DistributedDataset
        else:
            class DistributedDatasetInterface:
                pass
            input_lib.DistributedDatasetInterface = DistributedDatasetInterface
except Exception as e:
    # Best effort: report the failure and continue without the patch.
    print(f"Warning: Could not fix DistributedDatasetInterface: {e}")

import os
# GPU selection through environment variables: devices are ordered by PCI bus id
# and only device "2" is made visible.
# NOTE(review): these are set after TensorFlow has been imported; verify that
# they still take effect in this environment.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

2025-12-24 14:47:21.609129: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-24 14:47:21.659900: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-24 14:47:21.659947: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-24 14:47:21.661767: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-24 14:47:21.670609: I tensorflow/core/platform/cpu_feature_guar

In [10]:
def get_din_feats_columns(df: pd.DataFrame, 
                          dense_fea: list, 
                          sparse_fea: list, 
                          behavior_fea: list, 
                          his_behavior_fea: list, 
                          emb_dim: int = 32, 
                          max_len: int = 100) -> tuple:
    """
    Build the model inputs and feature-column definitions for the DIN model.

    Args:
        df (`pd.DataFrame`): Input data set.
        dense_fea (`list`): Names of the numeric feature columns.
        sparse_fea (`list`): Names of the categorical feature columns.
        behavior_fea (`list`): Names of the candidate-behavior columns.
            Accepted for interface compatibility; not read by this function.
        his_behavior_fea (`list`): Names of the history-sequence columns; each
            cell holds a list of article ids.
        emb_dim (`int`, optional): Embedding dimension. Defaults to 32.
        max_len (`int`, optional): Length every history sequence is padded or
            truncated to. Defaults to 100.

    Returns:
        `tuple`: ``(x, dnn_feature_columns)`` where
            - `x` maps every feature name to its input array
              (history features become 2-D padded arrays),
            - `dnn_feature_columns` is the list of sparse, dense and
              variable-length feature columns, in that order.
    """
    # The history columns are declared with embedding_name='click_article_id',
    # so candidate ids and history ids must fit one common vocabulary. Sizing it
    # from the candidate column alone leaves larger history ids out of range.
    article_vocab = None
    if his_behavior_fea:
        article_max = df['click_article_id'].max()
        for feat in his_behavior_fea:
            for seq in df[feat]:
                if isinstance(seq, (list, tuple, np.ndarray)) and len(seq) > 0:
                    article_max = max(article_max, max(seq))
        article_vocab = int(article_max) + 1

    def _vocab_size(feat):
        # Vocabulary size of one categorical column: largest id + 1.
        if feat == 'click_article_id' and article_vocab is not None:
            return article_vocab
        return df[feat].max() + 1

    # Categorical feature columns
    sparse_feature_columns = [SparseFeat(feat, vocabulary_size=_vocab_size(feat), embedding_dim=emb_dim)
                              for feat in sparse_fea]

    # Numeric feature columns (one value each)
    dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_fea]

    # Variable-length history columns
    var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=article_vocab,
                                                       embedding_dim=emb_dim, embedding_name='click_article_id'),
                                            maxlen=max_len)
                           for feat in his_behavior_fea]

    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns

    # Build the input dict: one array per feature name.
    x = {}
    for name in get_feature_names(dnn_feature_columns):
        if name in his_behavior_fea:
            # Pad (at the end) or truncate every history list to max_len -> 2-D array.
            x[name] = pad_sequences(list(df[name]), maxlen=max_len, padding='post')
        else:
            x[name] = df[name].values

    return x, dnn_feature_columns

In [11]:
# Split the features by type for the DIN model.
# Categorical features
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', 
              'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']

# Candidate-behavior feature
behavior_fea = ['click_article_id']

# History-sequence feature (list of clicked article ids per row)
hist_behavior_fea = ['hist_click_article_id']

# Numeric features
dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
             'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',
             'words_hbo','words_count']

In [12]:
# Display the history-sequence column of the DIN training frame.
trn_user_item_feats_df_din_model['hist_click_article_id']

0                                          [30760, 157507]
1                                         [211442, 234481]
2                                           [62464, 10023]
3                                           [62464, 10023]
4                                          [70986, 224730]
                               ...                        
48378    [70755, 111809, 158915, 293432, 36399, 70214, ...
48379    [64329, 199198, 265993, 313431, 199197, 214129...
48380    [336476, 272143, 79454, 175213, 119193, 119046...
48381                                     [156624, 160974]
48382                                     [224171, 223931]
Name: hist_click_article_id, Length: 48383, dtype: object

In [15]:
# Min-max scale the dense features; neural networks need numeric inputs on a common scale.
mm = MinMaxScaler()

# Special handling: infinite values make scaling impossible, so they are replaced by 0.
# If they show up, the better fix is to find out where they come from upstream.
trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
if val_user_item_feats_df_din_model is not None:
    val_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)

for feat in dense_fea:
    # Fit the scaler on the training data only and reuse its min/max for the
    # validation and test sets. Re-fitting per data set would map the same raw
    # value to different scaled values in each set.
    trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]]).ravel()

    if val_user_item_feats_df_din_model is not None:
        val_user_item_feats_df_din_model[feat] = mm.transform(val_user_item_feats_df_din_model[[feat]]).ravel()

    tst_user_item_feats_df_din_model[feat] = mm.transform(tst_user_item_feats_df_din_model[[feat]]).ravel()

In [42]:
import numpy as np
import torch

import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

from funrec.inputs import DenseFeat, SparseFeat, VarLenSparseFeat, get_feature_names
from funrec.models import DIN

# Configuration parameters
emb_dim = 8  # embedding dimension handed to build_feature_columns
max_len = 50  # padded length of the click-history sequence (read as a global by the helpers below)

def build_feature_columns(sparse_fea, dense_fea, hist_behavior_fea, emb_dim, vocab_map):
    """Assemble the feature-column definitions passed to the DIN model.

    Args:
        sparse_fea (`list`): Names of the categorical features.
        dense_fea (`list`): Names of the numeric features.
        hist_behavior_fea (`list`): Names of the variable-length history features.
        emb_dim (`int`): Embedding dimension used for every sparse feature.
        vocab_map (`dict`): Feature name -> {raw id: encoded id} mapping.

    Returns:
        `list`: Sparse columns, then dense columns, then variable-length
        columns, each group in the order of its input list.
    """
    def vocab_size_of(name):
        # Largest encoded id + 1; a missing or empty mapping yields size 1.
        if vocab_map.get(name):
            return max(vocab_map[name].values()) + 1
        return 1

    # Categorical features: one embedding table each.
    sparse_columns = [
        SparseFeat(name, vocabulary_size=vocab_size_of(name), embedding_dim=emb_dim)
        for name in sparse_fea
    ]

    # Numeric features: one-dimensional dense inputs.
    dense_columns = [DenseFeat(name, 1) for name in dense_fea]

    # History sequences reuse the 'click_article_id' embedding (embedding_name)
    # and are sized from that feature's vocabulary.
    shared_emb = 'click_article_id'
    history_columns = [
        VarLenSparseFeat(
            SparseFeat(
                hist_name,
                vocabulary_size=vocab_size_of(shared_emb),
                embedding_dim=emb_dim,
                embedding_name=shared_emb,
            ),
            maxlen=max_len,
            combiner="mean",
            length_name=f"{hist_name}_len",
        )
        for hist_name in hist_behavior_fea
    ]

    return sparse_columns + dense_columns + history_columns

def map_ids(df, feature, mapping):
    """Encode one categorical column with a raw-id -> index mapping.

    Args:
        df (`pd.DataFrame`): Frame holding the column.
        feature (`str`): Column name to encode.
        mapping (`dict`): Raw integer id -> encoded index.

    Returns:
        `np.ndarray`: int32 array of encoded ids; raw ids that are absent
        from ``mapping`` are encoded as 0.
    """
    raw_ids = df[feature].astype('int64')
    encoded = raw_ids.map(mapping)
    # Ids missing from the mapping come back as NaN; send them to index 0.
    encoded = encoded.fillna(0)
    return encoded.astype('int32').values

def map_seq(seq_list, mapping):
    """Encode and pad the click-history sequences.

    Args:
        seq_list: Iterable of per-row histories. Entries that are not a
            ``list`` or ``np.ndarray`` are treated as an empty history.
        mapping (`dict`): Raw article id -> encoded id.

    Returns:
        `tuple`: ``(padded, lengths)`` where ``padded`` is an int32 array
        padded at the end to the global ``max_len`` (the truncation side is
        left at the ``pad_sequences`` default), and ``lengths`` is an int32
        array of sequence lengths capped at ``max_len``.
    """
    encoded_rows = []
    lengths = []
    for seq in seq_list:
        if not isinstance(seq, (list, np.ndarray)):
            # No usable history: a single padding id with length 0.
            encoded_rows.append([0])
            lengths.append(0)
            continue

        row = []
        for raw in seq:
            raw_id = int(raw)
            # Raw id 0 and ids missing from the mapping both become padding id 0.
            row.append(0 if raw_id == 0 else mapping.get(raw_id, 0))
        encoded_rows.append(row)
        lengths.append(min(len(row), max_len))

    padded = pad_sequences(encoded_rows, maxlen=max_len, padding='post')
    return padded.astype('int32'), np.array(lengths, dtype='int32')

def build_vocab_map(dfs, sparse_fea):
    """Build a raw-id -> encoded-index mapping for every sparse feature.

    Indices follow the order of first appearance across ``dfs``. For
    'click_article_id' the ids found in each frame's
    'hist_click_article_id' lists are included as well, and all indices are
    shifted by one so that 0 stays free for padding.

    Args:
        dfs (`list`): DataFrames that together define the vocabulary.
        sparse_fea (`list`): Names of the categorical columns to encode.

    Returns:
        `dict`: Feature name -> {raw id (int): encoded index (int)}.
    """
    article_feat = 'click_article_id'
    history_col = 'hist_click_article_id'

    vocab_map = {}
    for feat in sparse_fea:
        is_article = feat == article_feat
        observed = pd.concat(
            [frame[feat].astype('int64') for frame in dfs], ignore_index=True
        )

        if is_article:
            # Articles that occur only in click histories also need an index.
            history_ids = [
                int(item)
                for frame in dfs
                if history_col in frame.columns
                for seq in frame[history_col]
                if isinstance(seq, (list, np.ndarray))
                for item in seq
            ]
            if history_ids:
                observed = pd.concat(
                    [observed, pd.Series(history_ids, dtype='int64')],
                    ignore_index=True,
                )

        uniques = pd.factorize(observed, sort=False)[1]
        # Index 0 is reserved for padding on the article vocabulary only.
        offset = 1 if is_article else 0
        vocab_map[feat] = {
            int(raw): position + offset for position, raw in enumerate(uniques)
        }
    return vocab_map

def build_model_input(df, feature_columns, vocab_map):
    """Convert a feature frame into the dict of arrays consumed by the model.

    Args:
        df (`pd.DataFrame`): Frame holding the sparse, dense and history columns.
        feature_columns (`list`): Feature columns; their names drive the loop.
        vocab_map (`dict`): Feature name -> {raw id: encoded id} mapping.

    Returns:
        `dict`: Feature name -> array. The history feature also adds a
        ``"<name>_len"`` entry with the sequence lengths. Names that match
        none of the three feature groups are skipped.
    """
    model_input = {}
    for name in get_feature_names(feature_columns):
        if name in sparse_fea:
            model_input[name] = map_ids(df, name, vocab_map[name])
            continue

        if name in dense_fea:
            vals = df[name].astype('float32').values
            # NaN is filled with the column mean; if there is no NaN, or the
            # mean itself is NaN (e.g. an all-NaN column), the fill value is 0.
            nan_fill = 0.0
            if np.isnan(vals).any():
                col_mean = np.nanmean(vals)
                if not np.isnan(col_mean):
                    nan_fill = col_mean
            # +inf -> 1.0 and -inf -> 0.0 regardless of whether NaN was present.
            vals = np.nan_to_num(vals, nan=nan_fill, posinf=1.0, neginf=0.0)
            # Keep everything inside the [0, 1] range expected after scaling.
            model_input[name] = np.clip(vals, 0.0, 1.0)
            continue

        if name == 'hist_click_article_id':
            seqs, seq_lens = map_seq(df[name].values, vocab_map['click_article_id'])
            model_input[name] = seqs
            model_input[f"{name}_len"] = seq_lens
    return model_input

def get_xy_fd_real():
    """Build the DIN inputs from the module-level train/test feature frames.

    The vocabulary is built over train and test, plus the validation frame
    when that global exists and is not None.

    Returns:
        `tuple`: ``(x, y, feature_columns, behavior_feature_list, x_test)``:
        the training input dict, the float32 training labels, the feature
        columns, the behavior feature names and the test input dict. The test
        set has no real labels, so no test target is returned.
    """
    train_df = trn_user_item_feats_df_din_model
    test_df = tst_user_item_feats_df_din_model
    # .get() yields None both when the global is missing and when it is None.
    valid_df = globals().get('val_user_item_feats_df_din_model')

    frames = [train_df, test_df]
    if valid_df is not None:
        frames.append(valid_df)

    vocab_map = build_vocab_map(frames, sparse_fea)
    feature_columns = build_feature_columns(
        sparse_fea, dense_fea, hist_behavior_fea, emb_dim, vocab_map
    )
    behavior_feature_list = ['click_article_id']

    # Training inputs and labels.
    x = build_model_input(train_df, feature_columns, vocab_map)
    y = train_df['label'].values.astype('float32')

    # Test inputs.
    x_test = build_model_input(test_df, feature_columns, vocab_map)

    return x, y, feature_columns, behavior_feature_list, x_test

In [54]:
# Build the model inputs, labels and feature definitions.
x, y, feature_columns, behavior_feature_list, x_test = get_xy_fd_real()

# Use the first CUDA device when enabled and available; otherwise stay on CPU.
use_cuda = True
cuda_ready = use_cuda and torch.cuda.is_available()
if cuda_ready:
    print("cuda ready...")
device = "cuda:0" if cuda_ready else "cpu"

# Instantiate DIN on the selected device.
model = DIN(
    feature_columns,
    behavior_feature_list,
    device=device,
    att_weight_normalization=True,
)

import torch.nn as nn

class BCEWithLogitsLossWrapper(nn.Module):
    """Adapter around ``nn.BCEWithLogitsLoss`` that tolerates extra call-time keywords.

    The reduction mode is fixed at construction. Any keyword arguments
    supplied when the loss is called (for example ``reduction=...``) are
    accepted and discarded.
    """

    def __init__(self, reduction="sum"):
        super().__init__()
        # The reduction is bound once here and never overridden per call.
        self.loss_fn = nn.BCEWithLogitsLoss(reduction=reduction)

    def forward(self, input, target, **kwargs):
        # kwargs is intentionally unused: call-time options are ignored.
        criterion = self.loss_fn
        return criterion(input, target)

# Loss: summed binary cross-entropy computed on logits.
# NOTE(review): the training log recorded in this notebook plateaus at a loss
# of 0.6932 (about ln 2) from epoch 2 onwards. If funrec's DIN already applies
# a sigmoid to its output, BCEWithLogitsLoss would apply a second one;
# confirm against the funrec implementation.
loss_fn = BCEWithLogitsLossWrapper(reduction="sum")

# Adam optimizer with a learning rate of 0.001.
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.compile(optimizer, loss_fn, metrics=["bce_with_logits"])

In [48]:
# Notebook display: inspect the training input dict returned by get_xy_fd_real().
x

{'user_id': array([    0,     1,     2, ..., 44667, 44668, 44669], dtype=int32),
 'click_article_id': array([ 1,  2,  3, ..., 29,  8,  2], dtype=int32),
 'category_id': array([0, 1, 1, ..., 0, 1, 1], dtype=int32),
 'click_environment': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'click_deviceGroup': array([0, 1, 2, ..., 2, 2, 0], dtype=int32),
 'click_os': array([0, 1, 1, ..., 1, 3, 0], dtype=int32),
 'click_country': array([0, 0, 0, ..., 0, 4, 0], dtype=int32),
 'click_region': array([ 0,  0,  1, ...,  0, 15,  9], dtype=int32),
 'click_referrer_type': array([0, 0, 1, ..., 0, 0, 1], dtype=int32),
 'is_cat_hab': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'sim0': array([0.7439238 , 0.7439238 , 0.7439238 , ..., 0.58110964, 0.7439238 ,
        0.7439238 ], dtype=float32),
 'time_diff0': array([0.12182731, 0.12182731, 0.12182731, ..., 0.5445253 , 0.12182731,
        0.12182731], dtype=float32),
 'word_diff0': array([0.02569613, 0.02569613, 0.02569613, ..., 0.03647215, 0.02569613,
     

In [55]:
# Train for 20 epochs with batch size 128, passing validation_split=0.2
# (the log below reports 38706 training and 9677 validation samples).
history = model.fit(x, y, batch_size=128, epochs=20, verbose=2, validation_split=0.2)

2025-12-24 15:13:38.930 |[1mINFO    [0m| funrec.models.base.base : base: 175 | [36mfunrec[0m | - [1mcpu[0m
2025-12-24 15:13:38.931 |[1mINFO    [0m| funrec.models.base.base : base: 195 | [36mfunrec[0m | - [1mTrain on 38706 samples, validate on 9677 samples, 303 steps per epoch[0m


2025-12-24 15:13:45.729 |[1mINFO    [0m| funrec.models.base.base : base: 265 | [36mfunrec[0m | - [1mEpoch 1/20[0m
2025-12-24 15:13:45.730 |[32m[1mSUCCESS [0m| funrec.models.base.base : base: 282 | [36mfunrec[0m | - [32m[1m6s - loss:  0.7110[0m
2025-12-24 15:13:52.478 |[1mINFO    [0m| funrec.models.base.base : base: 265 | [36mfunrec[0m | - [1mEpoch 2/20[0m
2025-12-24 15:13:52.480 |[32m[1mSUCCESS [0m| funrec.models.base.base : base: 282 | [36mfunrec[0m | - [32m[1m6s - loss:  0.6932[0m
2025-12-24 15:13:59.205 |[1mINFO    [0m| funrec.models.base.base : base: 265 | [36mfunrec[0m | - [1mEpoch 3/20[0m
2025-12-24 15:13:59.206 |[32m[1mSUCCESS [0m| funrec.models.base.base : base: 282 | [36mfunrec[0m | - [32m[1m6s - loss:  0.6932[0m
2025-12-24 15:14:06.219 |[1mINFO    [0m| funrec.models.base.base : base: 265 | [36mfunrec[0m | - [1mEpoch 4/20[0m
2025-12-24 15:14:06.220 |[32m[1mSUCCESS [0m| funrec.models.base.base : base: 282 | [36mfunrec[0m | - 

In [56]:
# Score every test-set candidate and attach the scores to the test frame.
pred_score = model.predict(x_test, batch_size=256)
tst_user_item_feats_df_din_model['pred_score'] = pred_score

# Keep only the columns needed for ranking and save them for the fusion step.
rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']]
rank_results.to_csv(f"{rank_path}/din_rank_score.csv", index=False)

In [60]:
# Re-rank the predictions per user and generate the submission file.
submit(rank_results, topk=5, model_name='din')

---

## 6 模型融合

### 6.1 加权融合

In [None]:
# Load the ranking-score files written by each of the three models.
lgb_ranker, lgb_cls, din_ranker = (
    pd.read_csv(rank_path + score_file)
    for score_file in (
        'lgb_ranker_score.csv',
        'lgb_cls_score.csv',
        'din_rank_score.csv',
    )
)

In [62]:
# Model name -> score frame, consumed by get_ensumble_predict_topk.
rank_model = dict(
    lgb_ranker=lgb_ranker,
    lgb_cls=lgb_cls,
    din_ranker=din_ranker,
)

In [71]:
def get_ensumble_predict_topk(rank_model, topk=5):
    """Fuse the scores of the three ranking models and write a submission.

    The 'lgb_ranker' scores are passed through ``norm_sim`` (defined outside
    this section; presumably a score normalisation, confirm there). The
    'lgb_cls' and 'din_ranker' scores are used as they are. Scores are then
    summed per (user_id, click_article_id) pair and handed to ``submit``.

    Args:
        rank_model (`dict`): Must contain the keys 'lgb_ranker', 'lgb_cls' and
            'din_ranker', each a DataFrame with 'user_id',
            'click_article_id' and 'pred_score' columns. The frames are not
            modified.
        topk (`int`, optional): Number of articles kept per user, default 5.

    Returns:
        `None`: The result is written by ``submit`` under the model name
        'ensemble_fuse'.
    """
    # Normalise on a copy so the caller's 'lgb_ranker' frame keeps its raw scores.
    lgb_ranker_df = rank_model['lgb_ranker'].copy()
    lgb_ranker_df['pred_score'] = lgb_ranker_df['pred_score'].transform(lambda x: norm_sim(x))

    # Stack the three score tables, then add up the scores of identical pairs.
    final_recall = pd.concat(
        [rank_model['lgb_cls'], rank_model['din_ranker'], lgb_ranker_df],
        ignore_index=True,
    )
    final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()

    submit(final_recall, topk=topk, model_name='ensemble_fuse')

In [72]:
# Run the score fusion with the default topk and write the fused submission.
get_ensumble_predict_topk(rank_model)

### 6.2 Stacking

- 读取多个模型的交叉验证生成的结果文件

In [None]:
# NOTE(review): save_path is not among the paths set in the notebook's
# configuration cell (data_path, feature_path, rank_path); confirm where it is defined.
# NOTE(review): the DIN frames are loaded here but not used by the stacking
# cells that follow; confirm whether they should be added as features.

# Training set: per-model outputs produced by cross-validation.
trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats = (
    pd.read_csv(save_path + feats_file)
    for feats_file in (
        'trn_lgb_ranker_feats.csv',
        'trn_lgb_cls_feats.csv',
        'trn_din_cls_feats.csv',
    )
)

# Test set: the matching per-model outputs.
tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats = (
    pd.read_csv(save_path + feats_file)
    for feats_file in (
        'tst_lgb_ranker_feats.csv',
        'tst_lgb_cls_feats.csv',
        'tst_din_cls_feats.csv',
    )
)

In [24]:
# Concatenate the outputs of the base models into the stacking feature tables.
# .copy() makes these independent frames, so the column assignments below do
# not write into a selection of the source frames (avoids pandas'
# chained-assignment warning).
finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']].copy()
finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']].copy()

# (train, test) output frames per base model; the position in this list is
# the numeric suffix of the generated columns (pred_score_0, pred_rank_0, ...).
stacked_outputs = [
    (trn_lgb_ranker_feats, tst_lgb_ranker_feats),
    (trn_lgb_cls_feats, tst_lgb_cls_feats),
]

# NOTE(review): columns are assigned by index alignment, which assumes every
# model's file lists the same rows in the same order; confirm upstream.
for idx, (trn_model, tst_model) in enumerate(stacked_outputs):
    for feat in ['pred_score', 'pred_rank']:
        col_name = feat + '_' + str(idx)
        finall_trn_ranker_feats[col_name] = trn_model[feat]
        finall_tst_ranker_feats[col_name] = tst_model[feat]

In [25]:
# Stacking meta-model: fit a logistic regression on the base models'
# cross-validation outputs, then score the test set with it.
# Tip: more features derived from the base predictions can be built during
# cross-validation to enrich this simple model's inputs.
from sklearn.linear_model import LogisticRegression

feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1']

# Meta-features and target for training; meta-features for the test set.
trn_x, trn_y = finall_trn_ranker_feats[feat_cols], finall_trn_ranker_feats['label']
tst_x = finall_tst_ranker_feats[feat_cols]

# Define and train the model.
lr = LogisticRegression()
lr.fit(trn_x, trn_y)

# Predict: column 1 of predict_proba is used as the click score.
click_proba = lr.predict_proba(tst_x)[:, 1]
finall_tst_ranker_feats['pred_score'] = click_proba

In [28]:
# Re-rank the stacked predictions per user and generate the submission file.
submit_cols = ['user_id', 'click_article_id', 'pred_score']
rank_results = finall_tst_ranker_feats[submit_cols]
submit(rank_results, topk=5, model_name='ensumble_staking')