# 1. Library and Data

In [6]:
import pandas as pd  
import numpy as np
from tqdm import tqdm  
from collections import defaultdict  
import os, math, warnings, math, pickle
from tqdm import tqdm
import faiss
import collections
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
# from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
# from tensorflow.python.keras import backend as K
# from tensorflow.python.keras.models import Model
# from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# from deepmatch.models import *
# from deepmatch.utils import sampledsoftmaxloss
warnings.filterwarnings('ignore')

In [7]:
# Training click log, article metadata and pre-computed article embeddings.
train_click = pd.read_csv("data/train_click_log.csv")
articles = pd.read_csv("data/articles.csv")
# Rename the key column so it matches the click log and can be joined on later.
articles = articles.rename(columns={'article_id': 'click_article_id'})
articles_emb = pd.read_csv("data/articles_emb.csv")

# Test-set click log.
test_click = pd.read_csv("data/testA_click_log.csv")

# Stack train and test clicks. DataFrame.append was removed in pandas 2.0;
# pd.concat produces the same frame (the original row labels are kept until
# the reset_index below).
train_click = pd.concat([train_click, test_click])
train_click = train_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])

train_click = train_click.sort_values('user_id')
train_click.reset_index(drop=True, inplace=True)
train_click.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,157507,1508211702520,4,1,17,1,25,2
1,0,30760,1508211672520,4,1,17,1,25,2
2,1,63746,1508211346889,4,1,17,1,25,6
3,1,289197,1508211316889,4,1,17,1,25,6
4,2,168401,1508211468695,4,3,20,1,25,2


## 1.1 Item Basic Info

In [10]:
# Alias, not a copy: item_info_df and articles are the same DataFrame object,
# so any in-place column assignment made through one name is visible through the other.
item_info_df = articles
item_info_df.head()

Unnamed: 0,click_article_id,category_id,created_at_ts,words_count
0,0,0,1513144419000,168
1,1,1,1405341936000,189
2,2,1,1408667706000,250
3,3,1,1408468313000,230
4,4,1,1407071171000,162


## 1.2 Item Embedding

In [11]:
item_emb_df = articles_emb
# Every column whose name contains 'emb' is treated as one embedding dimension.
item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]
item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])

# Scale every row to unit L2 length.
item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)  # norm

# article_id -> unit-length embedding vector
item_emb_dict = dict(zip(item_emb_df['article_id'], item_emb_np))
# Use a context manager so the file is flushed and closed deterministically.
with open('item_content_emb.pkl', 'wb') as emb_file:
    pickle.dump(item_emb_dict, emb_file)

## 1.3 Time Preprocess

In [12]:
def max_min_scaler(x):
    """Rescale x linearly so that its minimum maps to 0 and its maximum to 1."""
    lowest = np.min(x)
    return (x - lowest) / (np.max(x) - lowest)


# Alias, not a copy: the assignment below also changes train_click.
all_click_df = train_click

# Normalise the click timestamps; the scaled values are used later as weights
# when computing the association (co-click) similarities.
scaled_click_time = all_click_df[['click_timestamp']].apply(max_min_scaler)
all_click_df['click_timestamp'] = scaled_click_time

In [7]:
all_click_df.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,157507,0.343719,4,1,17,1,25,2
1,0,30760,0.343711,4,1,17,1,25,2
2,1,63746,0.343622,4,1,17,1,25,6
3,1,289197,0.343613,4,1,17,1,25,6
4,2,168401,0.343655,4,3,20,1,25,2


# 2. Pre-processing Data Form

## 2.1 User -> Item Time

In [15]:
# Build every user's click sequence ordered by click time:
#   {user1: [(item1, time1), (item2, time2), ...], ...}

all_click_df = all_click_df.sort_values('click_timestamp')


def make_item_time_pair(df):
    """Return the (click_article_id, click_timestamp) pairs of df in row order."""
    return list(zip(df['click_article_id'], df['click_timestamp']))


# The columns are selected with a list (double brackets). The former
# tuple-style key -- ['a', 'b'] directly on the groupby -- raises in pandas >= 2.0.
user_item_time_df = all_click_df.groupby('user_id')[['click_article_id', 'click_timestamp']].\
                    apply(make_item_time_pair).\
                    reset_index().rename(columns={0: 'item_time_list'})
user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))
user_item_time_dict[0]

[(30760, 0.34371106006481994), (157507, 0.3437192918623605)]

## 2.2 Item -> User Time

In [16]:
# Build, for every article, the sequence of users who clicked it ordered by click time:
#   {item1: [(user1, time1), (user2, time2), ...], ...}
# The time stored here is the moment the user clicked this article.

def make_user_time_pair(df):
    """Return the (user_id, click_timestamp) pairs of df in row order."""
    return list(zip(df['user_id'], df['click_timestamp']))


all_click_df = all_click_df.sort_values('click_timestamp')
# The columns are selected with a list (double brackets). The former
# tuple-style key -- ['a', 'b'] directly on the groupby -- raises in pandas >= 2.0.
item_user_time_df = all_click_df.groupby('click_article_id')[['user_id', 'click_timestamp']].\
                    apply(make_user_time_pair).\
                    reset_index().rename(columns={0: 'user_time_list'})
item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))
item_user_time_dict[30760][:5]

[(78586, 0.33306617583201864),
 (190856, 0.33921553493856127),
 (175736, 0.3393807301028193),
 (156136, 0.33938393282084905),
 (20405, 0.33952911730846563)]

## 2.3 Last Click

In [17]:
all_click_df = all_click_df.sort_values(by=['user_id', 'click_timestamp'])
# Final row of every user's time-ordered clicks.
click_last_df = all_click_df.groupby('user_id').tail(1)


def hist_func(user_df):
    """Return one user's clicks without the final row; a single-click user keeps that row."""
    # With only one click the history would be empty and the user would be
    # invisible during training, so that single click is deliberately leaked.
    return user_df if len(user_df) == 1 else user_df[:-1]


click_hist_df = all_click_df.groupby('user_id').apply(hist_func).reset_index(drop=True)

## 2.4 Article ID -> category_id, created_at_ts, words_count

In [18]:
# Store the basic attributes of every article id as dictionaries so the recall
# and cold-start stages can look them up directly.

def max_min_scaler(x):
    """Rescale x linearly so that its minimum maps to 0 and its maximum to 1."""
    lowest = np.min(x)
    return (x - lowest) / (np.max(x) - lowest)


item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].apply(max_min_scaler)

# One {click_article_id: value} dictionary per attribute column.
item_type_dict, item_words_dict, item_created_time_dict = (
    dict(zip(item_info_df['click_article_id'], item_info_df[attribute]))
    for attribute in ('category_id', 'words_count', 'created_at_ts')
)

In [19]:
item_info_df.head()

Unnamed: 0,click_article_id,category_id,created_at_ts,words_count
0,0,0,0.978432,168
1,1,1,0.680295,189
2,2,1,0.689493,250
3,3,1,0.688942,230
4,4,1,0.685078,162


## 2.5 Topk Click Articles

In [71]:
k = 50  # how many of the most-clicked articles to keep
# value_counts() orders article ids by descending click count, so the first k
# labels are the k most-clicked articles.
click_counts = all_click_df['click_article_id'].value_counts()
topk_click = click_counts.head(k).index
topk_click

Int64Index([272143, 234698, 123909, 336221,  96210, 336223, 183176, 168623,
            162655, 331116,  64329, 199198, 235616, 336245, 160974, 124749,
            336220, 233717, 289090, 156560, 293301, 123757,  20691,  59057,
             95716, 199197, 129434, 235854, 233688,  31836, 160417, 158536,
            124748, 123289, 159762, 124177, 119193, 234269, 336254, 111043,
            289003, 235870, 166581, 293513, 288320, 300082, 293114, 271262,
            235230, 133160],
           dtype='int64')

## 2.6 User History

In [41]:
def get_user_hist_item_info_dict(all_click):
    """Summarise every user's click history as four lookup dictionaries.

    :param all_click: click DataFrame with the columns user_id, click_article_id,
        category_id, words_count, click_timestamp and created_at_ts
    :return: tuple of four dicts keyed by user_id:
        (set of clicked category ids,
         set of clicked article ids,
         mean words_count of the clicked articles,
         created_at_ts of the article in the user's latest click, min-max scaled across users)
    """
    def _per_user(column, how):
        # Aggregate one column per user and return {user_id: aggregated value}.
        aggregated = all_click.groupby('user_id')[column].agg(how).reset_index()
        return dict(zip(aggregated['user_id'], aggregated[column]))

    user_hist_item_typs_dict = _per_user('category_id', set)
    user_hist_item_ids_dict = _per_user('click_article_id', set)
    user_hist_item_words_dict = _per_user('words_count', 'mean')

    # Creation time of the article each user clicked last (by click_timestamp).
    clicks_by_time = all_click.sort_values('click_timestamp')
    last_created = clicks_by_time.groupby('user_id')['created_at_ts'].apply(lambda s: s.iloc[-1]).reset_index()

    # Min-max scale those creation times across users.
    created = last_created['created_at_ts']
    lowest, highest = np.min(created), np.max(created)
    last_created['created_at_ts'] = (created - lowest) / (highest - lowest)

    user_last_item_created_time_dict = dict(zip(last_created['user_id'], last_created['created_at_ts']))

    return (user_hist_item_typs_dict, user_hist_item_ids_dict,
            user_hist_item_words_dict, user_last_item_created_time_dict)


# 3. Recall Dict

In [21]:
# Container keyed by recall-strategy name; each strategy starts with an empty dict.
user_multi_recall_dict = {'itemcf_sim_itemcf_recall': {},
                           'cold_start_recall': {}}

# 4. Models

## 4.1 Item-based CF

In [22]:
user_item_time_dict[0]

[(30760, 0.34371106006481994), (157507, 0.3437192918623605)]

$$
\text{location-weight}(i,j) = \alpha(i,j) \cdot 0.9^{|l_i - l_j| - 1}, \qquad
\alpha(i,j) = \begin{cases}
1 & l_j > l_i \\
0.7 & \text{otherwise}
\end{cases}
$$

where $l_i$ is the position of article $i$ in the user's click sequence.

$$
\text{clicked-time-weight}(i,j) = \exp\left(0.7^{|t_i - t_j|}\right)
$$

where $t_i$ is the (normalized) time at which the user clicked article $i$.

$$
\text{created-time-weight}(i,j) = \exp\left(0.8^{|c_i - c_j|}\right)
$$

where $c_i$ is the (normalized) creation time of article $i$.

In [26]:
# Compute item-to-item similarity from co-clicks inside every user's history.
i2i_sim = {}
item_cnt = defaultdict(int)  # how many times each article occurs across all histories

for user, item_time_list in tqdm(user_item_time_dict.items()):  # user: user_id
    # Penalty for long histories; it is constant for the whole history,
    # so it is computed once instead of once per item pair.
    history_norm = math.log(len(item_time_list) + 1)

    for loc1, (i, i_click_time) in enumerate(item_time_list):
        item_cnt[i] += 1
        sim_of_i = i2i_sim.setdefault(i, {})
        for loc2, (j, j_click_time) in enumerate(item_time_list):
            if i == j:
                continue

            # Forward order (j clicked after i) counts fully, reverse order is discounted.
            loc_alpha = 1.0 if loc2 > loc1 else 0.7
            # Position weight: decays with the distance between the two clicks (tunable).
            loc_weight = loc_alpha * (0.9 ** (np.abs(loc2 - loc1) - 1))
            # Click-time weight: larger when the two clicks are close in time (tunable).
            click_time_weight = np.exp(0.7 ** np.abs(i_click_time - j_click_time))
            # Creation-time weight: larger when the two articles were created close together (tunable).
            created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))
            # Combine all factors into the accumulated similarity of (i, j).
            sim_of_i[j] = sim_of_i.get(j, 0) + \
                loc_weight * click_time_weight * created_time_weight / history_norm

# Normalise every accumulated weight by the occurrence counts of both articles.
# The values are updated in place: only existing keys are reassigned, which is
# safe while iterating. (The previous shallow .copy() shared the inner dicts,
# so it normalised i2i_sim in place as well.)
for i, related_items in i2i_sim.items():
    for j, wij in related_items.items():
        related_items[j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])

i2i_sim_ = i2i_sim

# Save the similarity matrix locally. Make sure the target directory exists so
# the long computation above is not lost to a missing folder, and close the
# file deterministically.
os.makedirs('./hybird_model', exist_ok=True)
with open('./hybird_model/itemcf_i2i_sim.pkl', 'wb') as sim_file:
    pickle.dump(i2i_sim_, sim_file)

100%|██████████| 250000/250000 [07:35<00:00, 548.35it/s] 


## 4.2 Item Embedding Sim

In [24]:
item_emb_df.head()

Unnamed: 0,article_id,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_240,emb_241,emb_242,emb_243,emb_244,emb_245,emb_246,emb_247,emb_248,emb_249
0,0,-0.161183,-0.957233,-0.137944,0.050855,0.830055,0.901365,-0.335148,-0.559561,-0.500603,...,0.321248,0.313999,0.636412,0.169179,0.540524,-0.813182,0.28687,-0.231686,0.597416,0.409623
1,1,-0.523216,-0.974058,0.738608,0.155234,0.626294,0.485297,-0.715657,-0.897996,-0.359747,...,-0.487843,0.823124,0.412688,-0.338654,0.320786,0.588643,-0.594137,0.182828,0.39709,-0.834364
2,2,-0.619619,-0.97296,-0.20736,-0.128861,0.044748,-0.387535,-0.730477,-0.066126,-0.754899,...,0.454756,0.473184,0.377866,-0.863887,-0.383365,0.137721,-0.810877,-0.44758,0.805932,-0.285284
3,3,-0.740843,-0.975749,0.391698,0.641738,-0.268645,0.191745,-0.825593,-0.710591,-0.040099,...,0.271535,0.03604,0.480029,-0.763173,0.022627,0.565165,-0.910286,-0.537838,0.243541,-0.885329
4,4,-0.279052,-0.972315,0.685374,0.113056,0.238315,0.271913,-0.568816,0.341194,-0.600554,...,0.238286,0.809268,0.427521,-0.615932,-0.503697,0.61445,-0.91776,-0.424061,0.185484,-0.580292


In [25]:
all_click_df.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
1,0,30760,0.343711,4,1,17,1,25,2
0,0,157507,0.343719,4,1,17,1,25,2
3,1,289197,0.343613,4,1,17,1,25,6
2,1,63746,0.343622,4,1,17,1,25,6
5,2,36162,0.343647,4,3,20,1,25,2


In [28]:
# Dictionary mapping the article's index label in item_emb_df to its article id.
# NOTE(review): the keys are index labels; they equal row positions only while
# item_emb_df keeps a default 0..n-1 index -- confirm before reindexing or filtering.
item_idx_2_rawid_dict = dict(zip(item_emb_df.index, item_emb_df['article_id']))

In [31]:
# Number of nearest neighbours requested from the index for every article.
topk = 10

item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]
item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols].values, dtype=np.float32)

# Scale every embedding to unit length, so the inner product below is the cosine similarity
item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)
    
# Build the faiss index
item_index = faiss.IndexFlatIP(item_emb_np.shape[1])  # use the inner product
item_index.add(item_emb_np)

# Similarity query: for the vector at every index position, return the topk items and their similarities
sim, idx = item_index.search(item_emb_np, topk) # sim: similarity scores, idx: positions of the matches in the index

In [35]:
sim.shape, idx.shape

((364047, 10), (364047, 10))

In [32]:
# Convert the vector-search result (index positions) into a mapping between raw article ids:
#   {article_id: {similar_article_id: similarity, ...}, ...}
item_sim_dict = collections.defaultdict(dict)

for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(item_emb_np)), sim, idx)):

    target_raw_id = item_idx_2_rawid_dict[target_idx]  # raw id of the query article

    # The query article is among its own neighbours, so at most topk-1 similar
    # articles are kept. It is excluded by identity rather than by assuming it
    # is the first hit: with tied scores (identical embeddings) the first hit
    # can be a different article.
    max_neighbours = len(rele_idx_list) - 1
    kept = 0
    for rele_idx, sim_value in zip(rele_idx_list, sim_value_list):
        if kept >= max_neighbours:
            break
        # faiss pads missing results with the label -1.
        if rele_idx == target_idx or rele_idx < 0:
            continue
        rele_raw_id = item_idx_2_rawid_dict[rele_idx]
        # float() stores plain Python floats regardless of numpy's scalar promotion rules.
        item_sim_dict[target_raw_id][rele_raw_id] = \
            item_sim_dict[target_raw_id].get(rele_raw_id, 0) + float(sim_value)
        kept += 1

# Save the embedding-based i2i similarity matrix; the context manager closes the file.
with open('./hybird_model/emb_i2i_sim.pkl', 'wb') as sim_file:
    pickle.dump(item_sim_dict, sim_file)

364047it [00:29, 12253.20it/s]


In [46]:
item_sim_dict[0]

{77608: 0.8903070688247681,
 77965: 0.8881016969680786,
 77610: 0.8859948515892029,
 77974: 0.8822700381278992,
 83769: 0.8771183490753174,
 77383: 0.8734965920448303,
 77296: 0.8730965852737427,
 77283: 0.8729653358459473,
 77978: 0.8696216940879822}

In [47]:
# Alias: emb_i2i_sim refers to the same object as item_sim_dict.
emb_i2i_sim = item_sim_dict

## 4.3 User CF Sim

In [54]:
# Load the user-to-user similarity mapping from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('usercf_u2u_sim.pkl', 'rb') as sim_file:
    u2u_sim = pickle.load(sim_file)

# 5. Recall

In [51]:
# Articles user 0 has interacted with; the first element of each pair is the article id.
user_hist_items = user_item_time_dict[0]
user_hist_items_ = set(article_id for article_id, _ in user_hist_items)
type(user_hist_items_)

set

## 5.1 Item-based CF Recall

In [36]:
# 基于商品的召回i2i

def item_based_recommend(user_id,
                         user_item_time_dict=user_item_time_dict,
                         i2i_sim=i2i_sim,
                         sim_item_topk=20,
                         recall_item_num=10,
                         item_topk_click=topk_click,
                         item_created_time_dict=item_created_time_dict,
                         emb_i2i_sim=emb_i2i_sim):
    """
        Item-based collaborative-filtering recall for one user.
        :param user_id: user id
        :param user_item_time_dict: dict, click sequence of every user ordered by click time
                                    {user1: [(item1, time1), (item2, time2)..]...}
        :param i2i_sim: dict, item-to-item similarity {item: {similar_item: weight}}
        :param sim_item_topk: int, number of most similar articles considered per history article
        :param recall_item_num: int, number of articles returned
        :param item_topk_click: sequence of the most-clicked articles, used to fill up the result
        :param item_created_time_dict: dict, article id -> creation time
        :param emb_i2i_sim: dict, item-to-item similarity computed from content embeddings

        return: recalled articles [(item1, score1), (item2, score2)...], best first
    """
    # Articles the user has already interacted with.
    user_hist_items = user_item_time_dict[user_id]
    clicked_item_ids = {item_id for item_id, _ in user_hist_items}
    hist_len = len(user_hist_items)

    item_rank = {}
    for loc, (i, click_time) in enumerate(user_hist_items):  # i: article id
        # Position weight of the history article: later clicks weigh more.
        # It does not depend on the candidate, so compute it once per history article.
        loc_weight = 0.9 ** (hist_len - loc)

        for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:  # j: similar article id
            if j in clicked_item_ids:
                continue

            # Weight from the creation-time gap between history article and candidate.
            created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))

            # Content weight: 1 plus the embedding similarity in either direction, when available.
            content_weight = 1.0
            if emb_i2i_sim.get(i, {}).get(j, None) is not None:
                content_weight += emb_i2i_sim[i][j]
            if emb_i2i_sim.get(j, {}).get(i, None) is not None:
                content_weight += emb_i2i_sim[j][i]

            item_rank.setdefault(j, 0)
            item_rank[j] += created_time_weight * loc_weight * content_weight * wij

    # Fewer candidates than requested: fill up with the most-clicked articles.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Test membership against the keys. `item in item_rank.items()` compared
            # an id with (id, score) tuples, was always False, and let the filler
            # overwrite the real score of an already recalled article.
            if item in item_rank:
                continue
            item_rank[item] = - i - 100  # any negative score keeps fillers below real candidates
            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank

In [37]:
# Load the test set and collect its distinct users
test_click = pd.read_csv("data/testA_click_log.csv")
test_users = test_click['user_id'].unique()

# user_id -> recall result of item_based_recommend with its default settings
user_recall_items_dict = collections.defaultdict(dict)

for user in tqdm(test_users):
    user_recall_items_dict[user] = item_based_recommend(user)

100%|██████████| 50000/50000 [1:03:29<00:00, 13.12it/s]


In [38]:
# Persist the item-CF recall result; the context manager closes the file deterministically.
with open('./hybird_model/itemcf_recall_dict.pkl', 'wb') as recall_file:
    pickle.dump(user_recall_items_dict, recall_file)

## 5.2 User-based CF Recall

In [61]:
def user_based_recommend(user_id,
                         user_item_time_dict=user_item_time_dict,
                         u2u_sim=u2u_sim,
                         sim_user_topk=20,
                         recall_item_num=10,
                         item_topk_click=topk_click,
                         item_created_time_dict=item_created_time_dict,
                         emb_i2i_sim=emb_i2i_sim):
    """
        User-based collaborative-filtering recall for one user.
        :param user_id: user id
        :param user_item_time_dict: dict, click sequence of every user ordered by click time
                                    {user1: [(item1, time1), (item2, time2)..]...}
        :param u2u_sim: dict, user-to-user similarity {user: {similar_user: weight}}
        :param sim_user_topk: int, number of most similar users considered
        :param recall_item_num: int, number of articles returned
        :param item_topk_click: sequence of the most-clicked articles, used to fill up the result
        :param item_created_time_dict: dict, article id -> creation time
        :param emb_i2i_sim: dict, item-to-item similarity computed from content embeddings

        return: recalled articles [(item1, score1), (item2, score2)...], best first
    """
    # Interaction history of the target user.
    user_item_time_list = user_item_time_dict[user_id]    #  [(item1, time1), (item2, time2)..]
    # A user can interact with the same article several times, so de-duplicate.
    user_hist_items = set([i for i, t in user_item_time_list])
    hist_len = len(user_item_time_list)

    # Relative-position weight of the history. It depends only on the history
    # length, so it is accumulated once here instead of once per candidate.
    loc_weight = 1.0
    for loc in range(hist_len):
        loc_weight += 0.9 ** (hist_len - loc)

    items_rank = {}
    # A user without an entry in u2u_sim gets no similar users and falls through
    # to the popularity fill below instead of raising KeyError.
    similar_users = sorted(u2u_sim.get(user_id, {}).items(), key=lambda x: x[1], reverse=True)[:sim_user_topk]
    for sim_u, wuv in similar_users:
        for i, click_time in user_item_time_dict[sim_u]:  # i: article clicked by the similar user
            if i in user_hist_items:
                continue
            items_rank.setdefault(i, 0)

            content_weight = 1.0
            created_time_weight = 1.0

            # Relate the candidate to every article in the target user's history.
            for j, _ in user_item_time_list:
                # Content-similarity weight (either direction, when available).
                if emb_i2i_sim.get(i, {}).get(j, None) is not None:
                    content_weight += emb_i2i_sim[i][j]
                if emb_i2i_sim.get(j, {}).get(i, None) is not None:
                    content_weight += emb_i2i_sim[j][i]

                # Creation-time-gap weight.
                # NOTE(review): this multiplies by 0.8, so the weight grows with the gap, while
                # item_based_recommend uses 0.8 ** gap (weight shrinks with the gap) -- confirm
                # which form is intended.
                created_time_weight += np.exp(0.8 * np.abs(item_created_time_dict[i] - item_created_time_dict[j]))

            items_rank[i] += loc_weight * content_weight * created_time_weight * wuv

    # Fewer candidates than requested: fill up with the most-clicked articles.
    if len(items_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Test membership against the keys. `item in items_rank.items()` compared
            # an id with (id, score) tuples, was always False, and let the filler
            # overwrite the real score of an already recalled article.
            if item in items_rank:
                continue
            items_rank[item] = - i - 100  # any negative score keeps fillers below real candidates
            if len(items_rank) == recall_item_num:
                break

    items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return items_rank

In [None]:
# Load the test set and collect its distinct users
test_click = pd.read_csv("data/testA_click_log.csv")
test_users = test_click['user_id'].unique()

# user_id -> recall result of user_based_recommend with its default settings
user_recall_items_dict = collections.defaultdict(dict)

for user in tqdm(test_users):
    user_recall_items_dict[user] = user_based_recommend(user)

 98%|█████████▊| 48924/50000 [4:08:08<03:37,  4.95it/s]   

In [None]:
# Persist the user-CF recall result; the context manager closes the file deterministically.
with open('./hybird_model/usercf_u2u2i_recall.pkl', 'wb') as recall_file:
    pickle.dump(user_recall_items_dict, recall_file)

# 6. Cold Start

In [39]:
test_click = pd.read_csv("data/testA_click_log.csv")
test_users = test_click['user_id'].unique()

# Recall a much larger candidate list per user (100 articles from the 150 most
# similar items) than the default item-CF recall.
user_recall_items_dict = collections.defaultdict(dict)

for user in tqdm(test_users):
    user_recall_items_dict[user] = item_based_recommend(user, sim_item_topk=150, recall_item_num=100)

# The context manager closes the file deterministically.
with open('./hybird_model/cold_start_items_raw_dict.pkl', 'wb') as recall_file:
    pickle.dump(user_recall_items_dict, recall_file)

100%|██████████| 50000/50000 [46:32<00:00, 17.91it/s]  


In [98]:
# Reload the raw cold-start candidates written by the previous cell.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('./hybird_model/cold_start_items_raw_dict.pkl', 'rb') as recall_file:
    user_recall_items_dict = pickle.load(recall_file)

In [42]:
# History Data

def get_user_hist_item_info_dict(all_click):
    """Summarise every user's click history as four lookup dictionaries.

    :param all_click: click DataFrame with the columns user_id, click_article_id,
        category_id, words_count, click_timestamp and created_at_ts
    :return: tuple of four dicts keyed by user_id:
        (set of clicked category ids,
         set of clicked article ids,
         mean words_count of the clicked articles,
         created_at_ts of the article in the user's latest click, min-max scaled across users)
    """
    def _per_user(column, how):
        # Aggregate one column per user and return {user_id: aggregated value}.
        aggregated = all_click.groupby('user_id')[column].agg(how).reset_index()
        return dict(zip(aggregated['user_id'], aggregated[column]))

    user_hist_item_typs_dict = _per_user('category_id', set)
    user_hist_item_ids_dict = _per_user('click_article_id', set)
    user_hist_item_words_dict = _per_user('words_count', 'mean')

    # Creation time of the article each user clicked last (by click_timestamp).
    clicks_by_time = all_click.sort_values('click_timestamp')
    last_created = clicks_by_time.groupby('user_id')['created_at_ts'].apply(lambda s: s.iloc[-1]).reset_index()

    # Min-max scale those creation times across users.
    created = last_created['created_at_ts']
    lowest, highest = np.min(created), np.max(created)
    last_created['created_at_ts'] = (created - lowest) / (highest - lowest)

    user_last_item_created_time_dict = dict(zip(last_created['user_id'], last_created['created_at_ts']))

    return (user_hist_item_typs_dict, user_hist_item_ids_dict,
            user_hist_item_words_dict, user_last_item_created_time_dict)


# Attach the article attributes (category_id, created_at_ts, words_count) to every click.
# The left join keeps every click row.
# NOTE(review): item_info_df['created_at_ts'] was already min-max scaled earlier in this
# notebook, and get_user_hist_item_info_dict scales the per-user values once more.
all_click_df_ = all_click_df.copy()
all_click_df_ = all_click_df_.merge(item_info_df, how='left', on='click_article_id')

# Per-user history summaries: category sets, article-id sets, mean word counts,
# and the scaled creation time of the last clicked article.
user_hist_item_typs_dict,\
user_hist_item_ids_dict,\
user_hist_item_words_dict,\
user_last_item_created_time_dict = get_user_hist_item_info_dict(all_click_df_)

In [50]:
def get_click_article_ids_set(all_click_df):
    """Return the set of distinct article ids that occur in the click log."""
    article_ids = all_click_df['click_article_id'].values
    return set(article_ids)

# Every article id that occurs at least once in all_click_df.
click_article_ids_set = get_click_article_ids_set(all_click_df)

In [81]:
# NOTE(review): user_last_item_created_time_dict holds min-max scaled values, not epoch
# timestamps, so fromtimestamp() reads them as a fraction of a second after the epoch.
hist_last_item_created_time = datetime.fromtimestamp(user_last_item_created_time_dict[240000])
hist_last_item_created_time

datetime.datetime(1970, 1, 1, 0, 0, 0, 989429)

In [None]:
def get_click_article_ids_set(all_click_df):
    """Return the set of distinct article ids that occur in the click log."""
    article_ids = all_click_df['click_article_id'].values
    return set(article_ids)

In [None]:
item_type_dict 
item_words_dict 
item_created_time_dict 

In [99]:
cold_start_user_items_dict = {}    
        
def cold_start_items(user_recall_items_dict,
                     user_hist_item_typs_dict=user_hist_item_typs_dict,
                     user_hist_item_words_dict=user_hist_item_words_dict,
                     user_last_item_created_time_dict=user_last_item_created_time_dict,
                     item_type_dict=item_type_dict,
                     item_words_dict=item_words_dict,
                     item_created_time_dict=item_created_time_dict,
                     click_article_ids_set=click_article_ids_set,
                     recall_item_num=10):
    """
        Filter every user's recalled candidates with rule-based cold-start criteria.
        :param user_recall_items_dict: dict of recalled candidates, {user1: [(item1, score1), ..], }
        :param user_hist_item_typs_dict: dict, user -> set of categories of the clicked articles
        :param user_hist_item_words_dict: dict, user -> mean word count of the clicked articles
        :param user_last_item_created_time_dict: dict, user -> creation time of the last clicked article
        :param item_type_dict: dict, article -> category
        :param item_words_dict: dict, article -> word count
        :param item_created_time_dict: dict, article -> creation time
        :param click_article_ids_set: set of articles that occur in the click log
                                      (accepted for interface compatibility; not used by the filter)
        :param recall_item_num: maximum number of articles kept per user

        return: {user1: [(item1, score1), (item2, score2)..]...}, best score first

        NOTE(review): both creation times are passed through datetime.fromtimestamp()
        and compared by `.days`. With the default dictionaries built in this notebook
        the values are min-max scaled rather than epoch timestamps, so the day gap
        would always be 0 and the 100-day rule would never reject anything -- confirm
        whether raw timestamps should be passed instead.
    """

    cold_start_user_items_dict = {}
    for user, item_list in tqdm(user_recall_items_dict.items()):  # one candidate list per user
        kept_items = cold_start_user_items_dict.setdefault(user, [])
        if len(item_list) == 0:
            continue

        # History information of the user. It is identical for every candidate,
        # so it is looked up once per user instead of once per candidate.
        hist_item_type_set = user_hist_item_typs_dict[user]
        hist_mean_words = user_hist_item_words_dict[user]
        hist_last_item_created_time = datetime.fromtimestamp(user_last_item_created_time_dict[user])

        for item, score in item_list:
            # Information about the current candidate article.
            curr_item_type = item_type_dict[item]
            curr_item_words = item_words_dict[item]
            curr_item_created_time = datetime.fromtimestamp(item_created_time_dict[item])

            # Reject candidates whose category the user never clicked, whose length
            # differs too much from the user's average, or whose creation time is
            # too far from that of the user's last clicked article.
            if curr_item_type not in hist_item_type_set or \
                abs(curr_item_words - hist_mean_words) > 400 or \
                abs((curr_item_created_time - hist_last_item_created_time).days) > 100:
                continue

            kept_items.append((item, score))      # {user1: [(item1, score1), (item2, score2)..]...}

    # Limit the number of cold-start articles kept per user.
    cold_start_user_items_dict = {k: sorted(v, key=lambda x: x[1], reverse=True)[:recall_item_num]
                                  for k, v in cold_start_user_items_dict.items()}

    return cold_start_user_items_dict

In [100]:
# user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict

# Note:
# many rules are applied here to filter the cold-start articles, so the preceding recall stage
# should recall as many articles as possible; otherwise most candidates are easily filtered out.
cold_start_user_items_dict = cold_start_items(user_recall_items_dict)

100%|██████████| 50000/50000 [00:16<00:00, 3086.56it/s]


In [103]:
# Persist the filtered cold-start recall result; the context manager closes the file deterministically.
with open('./hybird_model/cold_start_u2i_recall.pkl', 'wb') as recall_file:
    pickle.dump(cold_start_user_items_dict, recall_file)