In [1]:
import os
import math
import signal
import pickle
import random
import numpy as np
import multitasking
import pandas as pd
from tqdm import tqdm
from random import shuffle
from collections import defaultdict

random.seed(2024)

In [2]:
df_click = pd.read_pickle('../data/offline/click.pkl')
df_query = pd.read_pickle('../data/offline/query.pkl')

os.makedirs('../data/sim/offline', exist_ok=True)
sim_pkl_file = '../data/sim/offline/itemcf_sim.pkl'

## 1. 计算物品相似度

In [3]:
# 计算出每个用户点击过的文章ID，并转换为字典
user_item_ = df_click.groupby('user_id')['click_article_id'].agg(lambda x: list(x)).reset_index()
user_item_dict = dict(zip(user_item_['user_id'], user_item_['click_article_id']))
user_item_, len(user_item_dict)

(        user_id                                   click_article_id
 0             0                                    [30760, 157507]
 1             1                                    [289197, 63746]
 2             2                                    [36162, 168401]
 3             3                                     [50644, 36162]
 4             4                                     [42567, 39894]
 ...         ...                                                ...
 249995   249995  [300470, 16129, 160974, 182394, 198659, 272143...
 249996   249996                                           [160974]
 249997   249997  [183665, 181686, 123909, 74719, 124667, 124337...
 249998   249998           [160974, 202557, 237524, 236207, 235105]
 249999   249999  [160974, 160417, 162338, 313431, 233717, 21480...
 
 [250000 rows x 2 columns],
 250000)

In [4]:
"""
通过遍历用户的物品列表来构建一个物品相似度字典 sim_dict，同时统计每个物品出现的次数
"""
item_cnt = defaultdict(int)
sim_dict = {}

# 遍历所有（用户，用户点击过的物品）
for _, items in tqdm(user_item_dict.items()):
    # 遍历该用户点击过的所有物品
    for i, item in enumerate(items):
        # 当前物品item出现次数加1
        item_cnt[item] += 1
        sim_dict.setdefault(item, {})
        
        # 再次遍历该用户点击过的所有物品
        for j, relate_item in enumerate(items):
            # 如果两个物品相同，则跳过
            if item == relate_item:
                continue
            
            sim_dict[item].setdefault(relate_item, 0)
            
            # 位置信息权重
            loc_alpha = 1.0 if j > i else 0.7
            loc_weight = loc_alpha * (0.9**(np.abs(j - i) - 1))
            
            sim_dict[item][relate_item] += loc_weight / math.log(1 + len(items))

100%|██████████| 250000/250000 [01:35<00:00, 2620.29it/s]


In [5]:
for item, relate_items in tqdm(sim_dict.items()):
    for relate_item, cij in relate_items.items():
        sim_dict[item][relate_item] = cij / math.sqrt(item_cnt[item] * item_cnt[relate_item])

100%|██████████| 35380/35380 [00:05<00:00, 6852.44it/s] 


In [10]:
for i, j in sim_dict.items():
    print(type(i), type(j))
    break

<class 'int'> <class 'dict'>


In [7]:
# 保存计算完的物品相似度
f = open(sim_pkl_file, 'wb')
pickle.dump(sim_dict, f)
f.close()

---
## 2. 召回


In [15]:
# 设置多进程
max_threads = multitasking.config['CPU_CORES']
multitasking.set_max_threads(max_threads)
multitasking.set_engine('process')
signal.signal(signal.SIGINT, multitasking.killall)
max_threads

16

In [21]:
all_users = df_query['user_id'].unique()
shuffle(all_users)
total = len(all_users)
n_len = total // max_threads
total, n_len, df_query.values()

TypeError: 'numpy.ndarray' object is not callable

In [25]:
@multitasking.task
def recall(df_query, item_sim, user_item_dict, worker_id) -> None:
    """
    召回
    :param df_query: 
    :param item_sim: 计算得到的物品相似度
    :param user_item_dict: 每个用户点击过的物品列表
    :param worker_id: 当前工作进程ID
    :return: None
    """
    data_list = []
    for user_id, item_id in tqdm(df_query.values):
        rank = {}
        if user_id not in user_item_dict:
            continue
        
        # 保留当前用户user_id交互过的最新的两个物品
        interacted_items = user_item_dict[user_id]
        interacted_items = interacted_items[::-1][:2]
        
        # 遍历当前用户交互过的所有物品
        for loc, item in enumerate(interacted_items):
            # 对和当前物品item相关的所有物品按照相似度wij从大到小进行排序，然后取出前200个物品
            for relate_item, wij in sorted(item_sim[item].items(), key=lambda d: d[1], reverse=True)[0:200]:
                # 仅对用户未交互过的物品进行推荐
                if relate_item not in interacted_items:
                    # 使用位置权重，位置越靠前，权重越高
                    rank[relate_item] = rank.get(relate_item, 0) + wij * (0.7**loc)
                
        sim_items = sorted(rank.items(), key=lambda d: d[1], reverse=True)
        item_ids = [item[0] for item in sim_items]
        item_sim_scores = [item[i] for item in sim_items]
        
        df_temp = pd.DataFrame()
        df_temp['article_id'] = item_ids
        df_temp['sim_score'] = item_sim_scores
        df_temp['user_id'] = user_id
        
        if item_id == -1:
            df_temp['label'] = np.nan
        else:
            df_temp['label'] = 0
            df_temp.loc[df_temp['article_id'] == item_id, 'label'] = 1
            
        df_temp = df_temp[['user_id', 'article_id', 'sim_score', 'label']]
        df_temp['user_id'] = df_temp['user_id'].astype('int')
        df_temp['article_id'] = df_temp['article_id'].astype('int')
        
        data_list.append(df_temp)
        
    df_data = pd.concat(data_list, sort=False)
    os.makedirs('../data/tmp/itemcf', exist_ok=True)
    df_data.to_pickle(f'../data/tmp/itemcf/{worker_id}.pkl')

In [26]:
# 开始召回
for i in range(0, total, n_len):
    part_users = all_users[i: i + n_len]
    df_temp = df_query[df_query['user_id'].isin(part_users)]
    recall(df_temp, sim_dict, user_item_dict, i)
    
multitasking.wait_for_tasks()

AttributeError: Can't pickle local object 'task.<locals>._run_via_pool'

In [32]:
df_data = pd.DataFrame()
for path, _, file_list in os.walk('../data/tmp/itemcf'):
    for file_name in file_list:
        df_temp = pd.read_pickle(os.path.join(path, file_name))
        df_data = df_data.append(df_temp)
df_temp, df_data

(         user_id  click_article_id  click_timestamp  click_environment  \
 8              4             42567     1.508212e+12                4.0   
 9              4             39894     1.508212e+12                4.0   
 10             5            211442     1.508211e+12                4.0   
 11             5            234481     1.508211e+12                4.0   
 62            29            202355     1.508211e+12                4.0   
 ...          ...               ...              ...                ...   
 1162538   200084                -1              NaN                NaN   
 1162545   200077                -1              NaN                NaN   
 1162555   200067                -1              NaN                NaN   
 1162558   200064                -1              NaN                NaN   
 1162620   200002                -1              NaN                NaN   
 
          click_deviceGroup  click_os  click_country  click_region  \
 8                      1.0 

In [33]:
# 必须加，对其进行排序
df_data = df_data.sort_values(['user_id', 'sim_score'], ascending=[True, False]).reset_index(drop=True)

# 计算召回指标

KeyError: 'user_id'