# 1. Load the Data

In [134]:
import pandas as pd

# train
train_click = pd.read_csv("data/train_click_log.csv")
articles = pd.read_csv("data/articles.csv")
# Rename the key so it matches the click-log column name in later joins.
articles = articles.rename(columns={'article_id': 'click_article_id'})
articles_emb = pd.read_csv("data/articles_emb.csv")

# test
test_click = pd.read_csv("data/testA_click_log.csv")

# Stack the train and test click logs into one frame. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# Rows repeating the same (user, article, timestamp) triple are dropped, then
# the frame is ordered by user and given a fresh 0..n-1 index.
train_click = (
    pd.concat([train_click, test_click])
    .drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    .sort_values('user_id')
    .reset_index(drop=True)
)
train_click.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,157507,1508211702520,4,1,17,1,25,2
1,0,30760,1508211672520,4,1,17,1,25,2
2,1,63746,1508211346889,4,1,17,1,25,6
3,1,289197,1508211316889,4,1,17,1,25,6
4,2,168401,1508211468695,4,3,20,1,25,2


In [130]:
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
import collections
warnings.filterwarnings('ignore')

# 2. Construct the user_id, article_id, click_time Dict

In [135]:
def make_item_time_pair(df):
    """Return the rows of ``df`` as a list of (article_id, click_timestamp) tuples.

    The tuples follow the row order of ``df``; values are read from the
    'click_article_id' and 'click_timestamp' columns.
    """
    article_ids = df['click_article_id']
    click_times = df['click_timestamp']
    return [(article_id, click_time) for article_id, click_time in zip(article_ids, click_times)]

# Order clicks by timestamp first; groupby keeps the row order inside each
# group, so every user's list comes out in chronological order.
train_click = train_click.sort_values('click_timestamp')

# Select the two columns with a list (double brackets). Tuple-style selection
# on a groupby (groupby(...)['a', 'b']) was deprecated in pandas 1.0 and raises
# in pandas 2.0.
user_item_time_df = (
    train_click.groupby('user_id')[['click_article_id', 'click_timestamp']]
    .apply(make_item_time_pair)
    .reset_index()
    .rename(columns={0: 'item_time_list'})
)
# user_id -> [(article_id, click_timestamp), ...]
user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))

In [136]:
# Inspect the (article_id, click_timestamp) list built for user 0.
user_item_time_dict[0]

[(30760, 1508211672520), (157507, 1508211702520)]

In [138]:
# Show the raw click rows of user 0 to cross-check against the dictionary entry.
train_click[train_click["user_id"]==0][["user_id", "click_article_id", "click_timestamp"]]

Unnamed: 0,user_id,click_article_id,click_timestamp
1,0,30760,1508211672520
0,0,157507,1508211702520


In [139]:
# Display user 0's list again, presumably to compare it with the rows shown in the previous cell.
user_item_time_dict[0]

[(30760, 1508211672520), (157507, 1508211702520)]

In [140]:
# First row of the per-user frame: one row per user_id with its item_time_list.
user_item_time_df.head(1)

Unnamed: 0,user_id,item_time_list
0,0,"[(30760, 1508211672520), (157507, 1508211702520)]"


# 3. Construct the item-to-item Dict

$$
\text{IUF} = \frac{1}{\log (1 + |N(u)|)}
$$

$$
w_{ij} = \frac{\sum_{u \in N(i) \cap N(j)} \frac{1}{\log (1 + |N(u)|)}}{\sqrt{|N(i)||N(j)|}}
$$

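其中 $N(u)$ 表示用户 $u$ 点击过的文章集合，$N(i)$ 表示点击过文章 $i$ 的用户集合。关键在于对同一个用户的点击列表做两两遍历：列表中的每一对文章 $(i, j)$ 都会从该用户处获得一份相似度贡献 $\frac{1}{\log(1 + |N(u)|)}$，因此点击越多（越活跃）的用户，对每一对文章的贡献越小。遍历所有用户并累加这些贡献，再除以 $\sqrt{|N(i)||N(j)|}$ 进行归一化，就得到任意两篇文章之间的相似度 $w_{ij}$。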

In [141]:
# Item-to-item similarity with IUF weighting:
#   w_ij = sum over users u who clicked both i and j of 1 / log(1 + |N(u)|),
#          divided by sqrt(cnt_i * cnt_j)
i2i_sim = dict()             # article id -> {related article id -> similarity}
item_cnt = defaultdict(int)  # article id -> number of clicks it received

for user, item_time_list in tqdm(user_item_time_dict.items()):  # user: user id
    if not item_time_list:
        # No clicks means no pairs; skipping also avoids 1 / log(1).
        continue
    # The IUF weight depends only on how many clicks the user has, so compute
    # it once per user instead of once per item pair.
    iuf_weight = 1 / math.log(len(item_time_list) + 1)
    for i, _ in item_time_list:  # i: article id
        item_cnt[i] += 1
        related_items = i2i_sim.setdefault(i, {})
        for j, _ in item_time_list:
            if i == j:
                continue
            related_items[j] = related_items.get(j, 0) + iuf_weight

# Normalise the accumulated co-click weights by the popularity of both
# articles. This is done in place: the previous dict.copy() was shallow, so
# it shared (and mutated) the very same inner dicts anyway.
for i, related_items in i2i_sim.items():
    for j in related_items:
        related_items[j] /= math.sqrt(item_cnt[i] * item_cnt[j])

# Alias kept so code that still refers to the old intermediate name works.
i2i_sim_ = i2i_sim

100%|██████████| 250000/250000 [01:08<00:00, 3645.35it/s]


In [142]:
# Save the similarity matrix. The context manager guarantees the file handle
# is flushed and closed, which a bare open() passed to pickle.dump does not.
with open('itemcf_i2i_sim.pkl', 'wb') as sim_file:
    pickle.dump(i2i_sim, sim_file)

# To reload the saved similarities instead of recomputing them
# (only unpickle files you trust: pickle.load can execute arbitrary code):
# with open('itemcf_i2i_sim.pkl', 'rb') as sim_file:
#     i2i_sim = pickle.load(sim_file)

In [143]:
# Example lookup: the related articles and similarity scores stored for
# article id 51175.

i2i_sim[51175]

{51176: 0.7213475204444817, 51173: 0.7213475204444817}

# 4. Recommend

In [153]:
# The k most-clicked articles across all of train_click (value_counts sorts
# by count, most frequent first).
k = 10
topk_click = train_click['click_article_id'].value_counts().head(k).index
topk_click

Int64Index([272143, 234698, 123909, 336221, 96210, 336223, 183176, 168623,
            162655, 331116],
           dtype='int64')

In [154]:
# Item-based (i2i) recall
def item_based_recommend(user_id,
                         user_item_time_dict=user_item_time_dict,
                         i2i_sim=i2i_sim,
                         sim_item_topk=10,
                         recall_item_num=10,
                         item_topk_click=topk_click):
    """Recall candidate articles for one user with item-based collaborative filtering.

    For every article the user clicked, the ``sim_item_topk`` most similar
    articles are looked up in ``i2i_sim`` and their similarities are summed per
    candidate. Articles the user already clicked are never recommended. If
    fewer than ``recall_item_num`` candidates are found, the list is padded
    with popular articles that carry negative placeholder scores.

    Note: the dict/index defaults are bound when this ``def`` is executed, so
    re-run this cell after rebuilding ``user_item_time_dict``, ``i2i_sim`` or
    ``topk_click``.

    Args:
        user_id: key into ``user_item_time_dict``.
        user_item_time_dict: user id -> list of (article_id, click_time).
        i2i_sim: article id -> {related article id -> similarity}.
        sim_item_topk: number of nearest neighbours used per clicked article.
        recall_item_num: maximum number of candidates to return.
        item_topk_click: popular article ids, most popular first, used as filler.

    Returns:
        List of (article_id, score) tuples sorted by score in descending
        order, at most ``recall_item_num`` long.

    Raises:
        KeyError: if ``user_id`` is not in ``user_item_time_dict`` or a clicked
            article is missing from ``i2i_sim``.
    """
    # Articles the user has already interacted with
    current_user_items_time_list = user_item_time_dict[user_id]
    current_user_items_set = {article_id for article_id, _ in current_user_items_time_list}

    item_rank = {}

    for i, _ in current_user_items_time_list:  # every article the user clicked
        nearest_items = sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in nearest_items:  # its most similar articles
            if j in current_user_items_set:
                continue
            # Accumulate: a candidate close to several clicked articles ranks higher
            item_rank[j] = item_rank.get(j, 0) + wij

    # Fewer candidates than requested: pad with popular articles
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Membership must be tested against the keys. The previous
            # `item in item_rank.items()` compared an id with (id, score)
            # tuples, was always False, and let the filler overwrite a real
            # similarity score with a negative one.
            if item in item_rank:
                continue
            item_rank[item] = - i - 100  # any negative score; keeps the popularity order
            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank

In [156]:
# Load test set A and keep only the distinct user ids that need recommendations
test_click = pd.read_csv("data/testA_click_log.csv")['user_id'].unique()

In [157]:
# Peek at the array of distinct test user ids.
test_click

array([249999, 249998, 249997, ..., 200002, 200001, 200000])

In [158]:
# Recall candidates only for the users that need predictions (the test users)
user_recall_items_dict = collections.defaultdict(dict)
user_recall_items_dict.update(
    (user, item_based_recommend(user)) for user in tqdm(test_click)
)

# The string below is a disabled alternative that recalls for every user in train_click.
"""
# 使用全部的测试集数据
user_recall_items_dict = collections.defaultdict(dict)

for user in tqdm(train_click['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user)
"""

100%|██████████| 50000/50000 [35:02<00:00, 23.78it/s]  


In [160]:
# Number of users that received a recall list.
len(user_recall_items_dict)

50000

In [170]:
# Example: the recalled (article_id, score) pairs for user 200000.
user_recall_items_dict[200000]

[(237870, 0.12192998520830386),
 (194619, 0.11159880519743068),
 (194935, 0.10712157572740902),
 (314048, 0.08128665680553591),
 (195773, 0.07370444198466476),
 (187005, 0.07119137524987157),
 (50573, 0.07117996614180719),
 (63344, 0.07117996614180719),
 (255153, 0.06803447407802805),
 (195603, 0.06590026834408487)]

# 5. Submit

In [161]:
# Flatten the {user: [(article, score), ...]} dict into long-format rows
user_item_score_list = [
    [user, item, score]
    for user, items in tqdm(user_recall_items_dict.items())
    for item, score in items
]

recall_df = pd.DataFrame(
    user_item_score_list,
    columns=['user_id', 'click_article_id', 'pred_score'],
)

100%|██████████| 50000/50000 [00:01<00:00, 25985.20it/s] 


In [162]:
# Preview the long-format recall table (one row per user/article pair).
recall_df

Unnamed: 0,user_id,click_article_id,pred_score
0,249999,234698,0.280279
1,249999,95716,0.245980
2,249999,336223,0.244608
3,249999,160132,0.227058
4,249999,59057,0.205233
...,...,...,...
499995,200000,187005,0.071191
499996,200000,50573,0.071180
499997,200000,63344,0.071180
499998,200000,255153,0.068034


In [165]:
# Build the submission file
save_path = "./"

def submit(recall_df, topk=5, model_name=None):
    """Write the top-``topk`` recalled articles per user to a submission CSV.

    The output has one row per user with the columns ``user_id``,
    ``article_1`` ... ``article_<topk>``, ordered by descending ``pred_score``.
    The file is written to ``<save_path>/<model_name>_<MM-DD>.csv``, where
    ``save_path`` is the module-level setting. The caller's frame is not
    modified.

    Args:
        recall_df: frame with the columns 'user_id', 'click_article_id' and
            'pred_score'.
        topk: number of articles to output per user.
        model_name: prefix of the output file name; falls back to
            'submission' when None (previously None raised a TypeError in the
            string concatenation).

    Raises:
        AssertionError: if any user has fewer than ``topk`` recalled articles.
    """
    # sort_values returns a new frame, so adding 'rank' below never touches the input
    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Every user must have at least `topk` recalled articles
    deepest_rank = ranked.groupby('user_id')['rank'].max()
    assert deepest_rank.min() >= topk, "every user needs at least topk recalled articles"

    # Long -> wide: one column per rank. Ranks are cast to int so the column
    # labels are 1..topk instead of the floats 1.0..topk that rank() produces.
    wide = (
        ranked.loc[ranked['rank'] <= topk, ['user_id', 'rank', 'click_article_id']]
        .assign(rank=lambda frame: frame['rank'].astype(int))
        .set_index(['user_id', 'rank'])['click_article_id']
        .unstack('rank')
        .reset_index()
    )

    # Name the columns as the submission format requires, for any topk
    wide.columns = ['user_id'] + ['article_{}'.format(rank) for rank in wide.columns[1:]]

    file_prefix = model_name if model_name is not None else 'submission'
    save_name = os.path.join(save_path, file_prefix + '_' + datetime.today().strftime('%m-%d') + '.csv')
    wide.to_csv(save_name, index=False, header=True)

In [166]:
# Reload the distinct user ids of test set A
test_click = pd.read_csv("data/testA_click_log.csv")['user_id'].unique()

# From all recalled rows, keep only those that belong to test users
test_recall = recall_df.loc[recall_df['user_id'].isin(test_click)]

# Write the submission file
submit(test_recall, topk=5, model_name='itemcf_baseline')

In [172]:
# Preview the recall rows that were kept for the test users.
test_recall

Unnamed: 0,user_id,click_article_id,pred_score
0,249999,234698,0.280279
1,249999,95716,0.245980
2,249999,336223,0.244608
3,249999,160132,0.227058
4,249999,59057,0.205233
...,...,...,...
499995,200000,187005,0.071191
499996,200000,50573,0.071180
499997,200000,63344,0.071180
499998,200000,255153,0.068034
