# 1. Load the Data

In [12]:
import pandas as pd

# train
train_click = pd.read_csv("data/train_click_log.csv")
articles = pd.read_csv("data/articles.csv")
# Rename the key so it matches the click log's column name in later joins.
articles = articles.rename(columns={'article_id': 'click_article_id'})
articles_emb = pd.read_csv("data/articles_emb.csv")

# test
test_click = pd.read_csv("data/testA_click_log.csv")

# Merge the train and test click logs into one interaction table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_click = (
    pd.concat([train_click, test_click])
    .drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    .sort_values('user_id')
    .reset_index(drop=True)
)
train_click.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,0,157507,1508211702520,4,1,17,1,25,2
1,0,30760,1508211672520,4,1,17,1,25,2
2,1,63746,1508211346889,4,1,17,1,25,6
3,1,289197,1508211316889,4,1,17,1,25,6
4,2,168401,1508211468695,4,3,20,1,25,2


In [9]:
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
import collections
warnings.filterwarnings('ignore')

## 1.1 User -> item and time

In [10]:
def make_item_time_pair(df):
    """Return the rows of ``df`` as a list of (click_article_id, click_timestamp) tuples.

    ``df`` only needs to support lookup by the two column names; rows are
    paired in their existing order.
    """
    article_ids = df['click_article_id']
    timestamps = df['click_timestamp']
    return [(article_id, ts) for article_id, ts in zip(article_ids, timestamps)]

# Sort by click time first so every user's list comes out in chronological order.
train_click = train_click.sort_values('click_timestamp')

# {user_id: [(article_id, click_timestamp), ...]}
# The columns are selected with a list ([[...]]): tuple-style selection on a
# groupby (['a', 'b'] without the inner list) was removed in pandas 2.0.
user_item_time_df = train_click.groupby('user_id')[['click_article_id', 'click_timestamp']]\
                    .apply(make_item_time_pair)\
                    .reset_index().rename(columns={0: 'item_time_list'})
user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))

In [11]:
# Spot-check: the (article_id, click_timestamp) list stored for user_id 0.
user_item_time_dict[0]

[(30760, 1508211672520), (157507, 1508211702520)]

## 1.2 Item -> user and time

In [38]:
# For each item, build the time-ordered sequence of users who clicked it:  {item1: [(user1, time1), (user2, time2)...]...}

def make_user_time_pair(df):
    """Return the rows of ``df`` as a list of (user_id, click_timestamp) tuples.

    ``df`` only needs to support lookup by the two column names; rows are
    paired in their existing order.
    """
    user_ids = df['user_id']
    timestamps = df['click_timestamp']
    return [(uid, ts) for uid, ts in zip(user_ids, timestamps)]
    
# Sort by click time first so every item's user list comes out in chronological order.
train_click = train_click.sort_values('click_timestamp')

# {article_id: [(user_id, click_timestamp), ...]}
# The columns are selected with a list ([[...]]): tuple-style selection on a
# groupby (['a', 'b'] without the inner list) was removed in pandas 2.0.
item_user_time_df = train_click.groupby('click_article_id')[['user_id', 'click_timestamp']]\
                    .apply(make_user_time_pair)\
                    .reset_index().rename(columns={0: 'user_time_list'})

item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))

In [42]:
# Spot-check: the (user_id, click_timestamp) list stored for article id 3.
item_user_time_dict[3]

[(76266, 1507572378399)]

# 2. User Active Degree

In [32]:
# Number of click rows per user, as a two-column frame: user_id, click_count.
clicks_per_user = train_click.groupby('user_id')['click_article_id'].count()
train_click_ = clicks_per_user.rename('click_count').reset_index()
train_click_.head()

Unnamed: 0,user_id,click_count
0,0,2
1,1,2
2,2,2
3,3,2
4,4,2


In [35]:
# Min-max normalise each user's click count; the scaled value is used as the
# user's activity degree.

from sklearn.preprocessing import MinMaxScaler

mm = MinMaxScaler()
scaled_click_count = mm.fit_transform(train_click_[['click_count']])
train_click_['click_count'] = scaled_click_count

# {user_id: activity degree}
user_activate_degree_dict = {
    uid: degree
    for uid, degree in zip(train_click_['user_id'], train_click_['click_count'])
}

In [46]:
# Spot-check: the min-max scaled click count (activity degree) of user_id 0.
user_activate_degree_dict[0]

0.0010672358591248667

# 3. User-to-User Sim

In [48]:
# Raw (un-normalised) user-to-user co-occurrence weights: {u: {v: weight}}.
u2u_sim = {}
# Number of click records seen per user; used later to normalise the weights.
user_cnt = defaultdict(int)

for item, user_time_list in tqdm(item_user_time_dict.items()):  # item: article id
    # Items clicked by many users are damped logarithmically. The factor depends
    # only on the item, so compute it once instead of once per user pair.
    item_popularity_penalty = math.log(len(user_time_list) + 1)
    for u, _u_click_time in user_time_list:  # u: user id
        user_cnt[u] += 1
        related_to_u = u2u_sim.setdefault(u, {})
        degree_u = user_activate_degree_dict[u]
        for v, _v_click_time in user_time_list:
            # Skip the user itself *before* creating an entry: a zero-weight
            # self entry would otherwise sit in u's neighbour list and could
            # take a top-k slot during recall.
            if u == v:
                continue
            # The mean activity degree of the two users is the co-occurrence
            # weight; this formula could be improved.
            activate_weight = 100 * 0.5 * (degree_u + user_activate_degree_dict[v])
            related_to_u[v] = related_to_u.get(v, 0) + activate_weight / item_popularity_penalty

100%|██████████| 35380/35380 [5:33:15<00:00,  1.77it/s]    


In [49]:
# Normalise every raw weight by sqrt(cnt[u] * cnt[v]).
# A fresh nested dict is built instead of u2u_sim.copy(): the shallow copy
# shares its inner dicts with u2u_sim, so writing into it would silently
# normalise the raw matrix in place (and normalise again on every re-run).
u2u_sim_ = {
    u: {
        v: wij / math.sqrt(user_cnt[u] * user_cnt[v])
        for v, wij in related_users.items()
    }
    for u, related_users in u2u_sim.items()
}

# Persist the normalised similarities; the context manager closes the file.
with open('usercf_u2u_sim.pkl', 'wb') as sim_file:
    pickle.dump(u2u_sim_, sim_file)

In [50]:
# Rebind u2u_sim to the normalised similarities so later cells (e.g. the default
# argument of user_based_recommend) pick them up.
u2u_sim = u2u_sim_

# 4. Recommend

In [51]:
# The k most-clicked articles over the whole click log, most popular first.
k = 10
article_click_counts = train_click['click_article_id'].value_counts()
topk_click = article_click_counts.index[:k]
topk_click

Int64Index([272143, 234698, 123909, 336221, 96210, 336223, 183176, 168623,
            162655, 331116],
           dtype='int64')

In [59]:
# User-based recall: user -> similar users -> their items (u2u2i)
def user_based_recommend(user_id, 
                         user_item_time_dict=user_item_time_dict,
                         u2u_sim=u2u_sim,
                         sim_user_topk=20,
                         recall_item_num=10, 
                         item_topk_click=topk_click):
    """Recall candidate articles for one user with user-based collaborative filtering.

    Parameters
    ----------
    user_id : user to recommend for.
    user_item_time_dict : {user_id: [(article_id, click_timestamp), ...]}.
    u2u_sim : {user_id: {other_user_id: similarity}}.
    sim_user_topk : number of most similar users whose clicks are aggregated.
    recall_item_num : maximum number of articles to return.
    item_topk_click : popular article ids (most popular first) used to back-fill
        when the similar users do not supply enough candidates.

    Returns
    -------
    list of (article_id, score), sorted by score descending, at most
    ``recall_item_num`` long. Back-filled popular articles carry negative
    scores so they always rank below collaboratively recalled ones.

    A user missing from ``user_item_time_dict`` / ``u2u_sim`` is treated as a
    cold-start user and receives only the popularity back-fill.
    """
    # Articles the user already clicked. A user may click the same article
    # several times, hence a set.
    current_user_items_set = {
        article_id for article_id, _click_time in user_item_time_dict.get(user_id, [])
    }

    # Most similar users first. The user itself is filtered out before the
    # top-k cut so a self entry can never take a neighbour slot.
    neighbours = [
        (sim_u, wuv)
        for sim_u, wuv in u2u_sim.get(user_id, {}).items()
        if sim_u != user_id
    ]
    neighbours = sorted(neighbours, key=lambda x: x[1], reverse=True)[:sim_user_topk]

    items_rank = {}
    for sim_u, wuv in neighbours:
        for i, _click_time in user_item_time_dict.get(sim_u, ()):
            if i in current_user_items_set:
                continue
            # An article's score is the summed similarity of the neighbours who clicked it.
            items_rank[i] = items_rank.get(i, 0) + wuv

    # Popularity back-fill
    if len(items_rank) < recall_item_num:
        for position, item in enumerate(item_topk_click):
            # Never overwrite an article that already has a real score.
            # (The old check `item in items_rank.items()` compared an id with
            # (id, score) tuples and was therefore always False.)
            if item in items_rank:
                continue
            items_rank[item] = - position - 100  # any negative number works
            if len(items_rank) >= recall_item_num:
                break

    items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return items_rank

In [62]:
# Run the recall for every distinct user of the test click log:
# {user_id: [(article_id, score), ...]}
user_recall_items_dict = {
    user: user_based_recommend(user)
    for user in tqdm(test_click['user_id'].unique())
}

100%|██████████| 50000/50000 [12:50<00:00, 64.93it/s] 


# 5. Submit

In [67]:
# Spot-check: the recalled (article_id, score) list for user_id 249999.
user_recall_items_dict[249999]

[(123909, 3.760588753744524),
 (336223, 3.469618535224845),
 (64409, 3.3247088678180545),
 (59057, 2.890969016191794),
 (199197, 2.6993398523889263),
 (336220, 2.568960481892443),
 (199198, 2.418379281819452),
 (234698, 2.228184527054652),
 (272143, 2.140644904825349),
 (236444, 2.1221103471709903)]

In [64]:
# Flatten the {user: [(item, score), ...]} dict into one row per (user, item, score)
# and wrap it in a DataFrame.
user_item_score_list = [
    [user, item, score]
    for user, items in tqdm(user_recall_items_dict.items())
    for item, score in items
]

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])

100%|██████████| 50000/50000 [00:01<00:00, 31140.48it/s]


In [77]:
# Spot-check: all recalled rows for user_id 200000.
recall_df[recall_df["user_id"]==200000]

Unnamed: 0,user_id,click_article_id,pred_score
499990,200000,336221,0.874071
499991,200000,272143,0.842884
499992,200000,235616,0.817024
499993,200000,59681,0.695078
499994,200000,199197,0.675983
499995,200000,286161,0.672047
499996,200000,206168,0.648882
499997,200000,284547,0.613595
499998,200000,162655,0.587366
499999,200000,234269,0.577121


In [68]:
# Build the submission file
save_path = "./"

def submit(recall_df, topk=5, model_name=None):
    """Write the top-``topk`` recalled articles per user to a CSV file.

    Parameters
    ----------
    recall_df : DataFrame with columns ``user_id``, ``click_article_id`` and
        ``pred_score`` (one row per recalled article). It is not modified.
    topk : number of articles to keep per user. The output has the columns
        ``user_id, article_1, ..., article_<topk>``.
    model_name : prefix of the output file name; falls back to ``'model'``
        when omitted.

    Raises
    ------
    AssertionError
        If any user has fewer than ``topk`` recalled articles.

    The file is written to ``save_path + model_name + '_' + <MM-DD> + '.csv'``.
    """
    # The default used to crash with a TypeError on string concatenation.
    if model_name is None:
        model_name = 'model'

    # sort_values returns a new frame, so the caller's data stays untouched.
    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    # Rank 1 = highest score within the user; ties are broken by row order.
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Every user must have at least `topk` recalled articles.
    articles_per_user = ranked.groupby('user_id')['rank'].max()
    assert articles_per_user.min() >= topk, \
        'every user needs at least {} recalled articles'.format(topk)

    # Long -> wide: one row per user, one column per rank.
    top_rows = ranked.loc[ranked['rank'] <= topk, ['user_id', 'rank', 'click_article_id']]
    wide = top_rows.set_index(['user_id', 'rank'])['click_article_id'].unstack(-1).reset_index()

    # Name the columns as the submission format requires, for any topk. The
    # rank labels are floats (1.0, 2.0, ...), hence the int() conversion.
    wide.columns = ['user_id'] + ['article_{}'.format(int(rank)) for rank in wide.columns[1:]]

    save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
    wide.to_csv(save_name, index=False, header=True)

In [70]:
# Load the test set
test_click = pd.read_csv("data/testA_click_log.csv")
# Keep the user ids under their own name: rebinding test_click to an array
# would break any earlier cell that still expects the DataFrame on a re-run.
test_user_ids = test_click['user_id'].unique()

# Keep only the recall rows that belong to test-set users
test_recall = recall_df[recall_df['user_id'].isin(test_user_ids)]

# Write the submission file
submit(test_recall, topk=5, model_name='usercf_baseline')