# 科大讯飞短视频推荐比赛

In [1]:
# 导入包
import pandas as pd
import numpy  as np
import random
from tqdm.notebook import tqdm
import math
from collections import defaultdict
random.seed(2024)

## 数据分析

In [25]:
# Load the raw click log into a DataFrame and display it.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs on the
# author's machine; consider reading from a configurable data directory instead.
uid_click_log = pd.read_csv('/Users/xmly/Desktop/科大讯飞短视频推荐/uid_click_log.csv')
uid_click_log

Unnamed: 0,uid,vid,cid,playtime,duration,date,rank
0,100000,100870,100037,4966,9590,20200706,1
1,100000,101167,100024,5037,4949,20200706,2
2,100000,103608,100008,5137,14884,20200706,3
3,100000,100220,100084,5139,16784,20200706,4
4,100000,101674,100027,5149,5760,20200706,5
...,...,...,...,...,...,...,...
3487097,105999,102918,100019,124488,116467,20200810,1
3487098,105999,100917,100019,88168,120500,20200827,1
3487099,105999,100208,100079,49005,57506,20200828,1
3487100,105999,106468,100027,26886,8067,20200904,1


In [26]:
# Number of distinct `uid` values in the click log.
uid_click_log.uid.nunique()

6000

In [27]:
# Number of distinct `vid` values in the click log.
uid_click_log.vid.nunique()

9992

In [3]:
# Click-history length of each user on each day:
# the number of distinct `rank` values per (date, uid) pair.
uid_date_cnt = uid_click_log.groupby(['date','uid'])['rank'].nunique().reset_index()
uid_date_cnt

Unnamed: 0,date,uid,rank
0,20200705,100001,8
1,20200705,100002,3
2,20200705,100003,9
3,20200705,100004,3
4,20200705,100005,20
...,...,...,...
161842,20200904,105994,4
161843,20200904,105995,11
161844,20200904,105996,6
161845,20200904,105998,6


In [4]:
last_uid = uid_date_cnt[(uid_date_cnt['date'] == 20200904)&(uid_date_cnt['rank']>5)] # users with more than 5 clicks on the last day (20200904)
last_uid # author's note: all of these uids also have click data on earlier days

Unnamed: 0,date,uid,rank
156649,20200904,100000,26
156650,20200904,100001,8
156651,20200904,100002,8
156652,20200904,100003,18
156655,20200904,100007,7
...,...,...,...
161836,20200904,105988,6
161839,20200904,105991,6
161843,20200904,105995,11
161844,20200904,105996,6


In [5]:
# Cumulative history length of each user (number of distinct videos clicked).
# Author's observation: no user has only a single click.
uid_cnt = uid_click_log.groupby(['uid'])['vid'].nunique().reset_index().rename(columns={'vid':'vid_cnt'})
uid_cnt

Unnamed: 0,uid,vid_cnt
0,100000,849
1,100001,621
2,100002,1009
3,100003,780
4,100004,421
...,...,...
5995,105995,194
5996,105996,76
5997,105997,62
5998,105998,44


## offline数据

In [6]:
# Preview the log ordered by date (newest first) and, within a date, by ascending rank.
# The sorted frame is only displayed; `uid_click_log` itself is not modified.
uid_click_log.sort_values(by = ['date','rank'], ascending = [False,True])

Unnamed: 0,uid,vid,cid,playtime,duration,date,rank
847,100000,101412,100026,9870,18267,20200904,1
1502,100001,103854,100054,11821,48149,20200904,1
2512,100002,102088,100247,9426,9728,20200904,1
3313,100003,100930,100017,7422,16000,20200904,1
3834,100004,100112,100017,6790,7245,20200904,1
...,...,...,...,...,...,...,...
2488076,103957,103051,100302,5134,6315,20200705,39
217650,100326,102797,100019,9865,85600,20200705,40
884593,101345,100729,100011,5838,12032,20200705,40
217651,100326,104793,100000,9761,11755,20200705,41


In [7]:
# Preview the first 5 rows (in existing row order) of every user on the last day, 20200904.
uid_click_log[uid_click_log['date'] == 20200904].groupby(['uid']).head(5)

Unnamed: 0,uid,vid,cid,playtime,duration,date,rank
847,100000,101412,100026,9870,18267,20200904,1
848,100000,108026,100009,10131,11467,20200904,2
849,100000,102864,100017,10139,10200,20200904,3
850,100000,103024,100221,10119,9009,20200904,4
851,100000,100321,100099,10165,7500,20200904,5
...,...,...,...,...,...,...,...
3487069,105998,105920,100045,1230,8055,20200904,3
3487070,105998,104338,100338,7369,8428,20200904,4
3487071,105998,103597,100027,2044,5850,20200904,5
3487100,105999,106468,100027,26886,8067,20200904,1


In [8]:
def online_offline_split(click, offline_date=20200904, min_clicks=5, sample_size=1000, answer_size=5):
    """
    Split the click log into an online set and an offline validation set.

    The offline answers are the first `answer_size` clicks (by ascending `rank`) made on
    `offline_date` by a random sample of users who have more than `min_clicks` distinct
    `rank` values on that day. The offline training data is every click that is not on
    `offline_date`.

    params :
        click : DataFrame with at least the columns `uid`, `date` and `rank`
        offline_date : value of `date` that is held out as the validation day
        min_clicks : a user needs strictly more than this many clicks on `offline_date`
        sample_size : maximum number of users to sample for validation; if fewer users
            qualify, all of them are used (the previous code raised ValueError in that case)
        answer_size : number of held-out clicks kept per sampled user
    return :
        (online_click, offline_click, offline_answer)

    Side effects: writes `offline_click.csv` and `offline_answer.csv` to the current
    working directory. Sampling uses the global `random` state, so seed it beforehand
    for reproducible splits.
    """
    online_click = click.copy()  # the online data is the full log
    offline_df = click[click['date'] == offline_date]  # clicks of the held-out day

    # Number of clicks per user on the held-out day; keep users with enough clicks to form an answer.
    clicks_per_uid = offline_df.groupby('uid')['rank'].nunique()
    uid_list = clicks_per_uid[clicks_per_uid > min_clicks].index.tolist()

    # random.sample raises ValueError when asked for more elements than exist, so cap the size.
    offline_uid = random.sample(uid_list, min(sample_size, len(uid_list)))
    print(f'{offline_date} total uid : {len(uid_list)}| offline sample uid : {len(offline_uid)}')

    # Offline training data: everything except the held-out day.
    offline_click = click[click['date'] != offline_date]

    # Answers: first `answer_size` clicks of each sampled user, ordered by uid then rank.
    offline_answer = offline_df[offline_df['uid'].isin(offline_uid)]
    offline_answer = offline_answer.sort_values(by=['uid', 'rank'], ascending=[True, True])
    offline_answer = offline_answer.groupby(['uid']).head(answer_size)

    offline_click.to_csv('offline_click.csv', index=False)
    offline_answer.to_csv('offline_answer.csv', index=False)
    return online_click, offline_click, offline_answer

In [10]:
# Build the online / offline splits; this also writes offline_click.csv and offline_answer.csv.
online_click, offline_click, offline_answer = online_offline_split(uid_click_log)

20200904 total uid : 2867| offline sample uid : 1000


## Recall
- v1：由于用户历史序列过长，因此recall只考虑用户最近50次点击

In [2]:
# Reload the offline training clicks from the CSV written by online_offline_split.
offline_click = pd.read_csv('offline_click.csv')
offline_click # dataset used for offline validation

Unnamed: 0,uid,vid,cid,playtime,duration,date,rank
0,100000,100870,100037,4966,9590,20200706,1
1,100000,101167,100024,5037,4949,20200706,2
2,100000,103608,100008,5137,14884,20200706,3
3,100000,100220,100084,5139,16784,20200706,4
4,100000,101674,100027,5149,5760,20200706,5
...,...,...,...,...,...,...,...
3439322,105999,100070,100039,13146,123758,20200809,1
3439323,105999,108337,100036,12579,10567,20200809,2
3439324,105999,102918,100019,124488,116467,20200810,1
3439325,105999,100917,100019,88168,120500,20200827,1


In [3]:
def reset_user_time(click):
    """
    Collapse (`date`, `rank`) into a single per-user ordinal `click_rank`.

    Rows are ordered per user by `date` descending and, within a date, by `rank`
    ascending; `click_rank` is the 1-based position in that order. The `date` and
    `rank` columns are dropped and the result is returned sorted by
    (`uid`, `click_rank`) with a fresh index. The input frame is not modified.
    """
    ranked = (
        click
        .sort_values(by=['uid', 'date', 'rank'], ascending=[True, False, True])
        .assign(click_rank=lambda frame: frame.groupby('uid').cumcount() + 1)
        .drop(columns=['date', 'rank'])
    )
    return ranked.sort_values(by=['uid', 'click_rank']).reset_index(drop=True)

In [15]:
def make_item_rank_tuple(group, item_col, rank_col):
    """
    Return the (item, rank) pairs of one user's rows as a list of tuples, in row order.
    """
    items = group[item_col]
    ranks = group[rank_col]
    return [(item, rank) for item, rank in zip(items, ranks)]

def make_user_rank_tuple(group_df, user_col, rank_col):
    """
    Return the (user, rank) pairs of one item's rows as a list of tuples, in row order.
    """
    users = group_df[user_col]
    ranks = group_df[rank_col]
    return [(user, rank) for user, rank in zip(users, ranks)]

def get_user_item_rank_dict(click, user_col = 'uid', item_col = 'vid',
                            rank_col = 'click_rank', max_hist = 50):
    """
    Build {user: [(item, rank), ...]} from a click frame.

    v1 : only the first `max_hist` clicks of every user (smallest `rank_col` values)
    are kept, because full user histories are too long.

    params :
        click : DataFrame containing `user_col`, `item_col` and `rank_col`
        user_col / item_col / rank_col : column names
        max_hist : maximum number of clicks kept per user
    return :
        dict mapping each user to a list of (item, rank) tuples ordered by ascending rank;
        an empty dict for an empty frame
    """
    click = click.sort_values(by = [user_col, rank_col])
    # Group on `user_col`; the previous hard-coded 'uid' broke any other user column name.
    click = click.groupby(user_col).head(max_hist)
    return {
        user: list(zip(group[item_col], group[rank_col]))
        for user, group in click.groupby(user_col)
    }

def get_item_user_rank_dict(df, user_col='uid', item_col='vid', time_col='click_rank'):
    """
    Build {item: [(user, rank), ...]} from a click frame.

    Rows are ordered by (`item_col`, `time_col`), so each item's list is in ascending
    `time_col` order. Unlike get_user_item_rank_dict, no per-user cap is applied here.
    """
    list_col = 'user_id_time_list'
    ordered = df.sort_values(by=[item_col, time_col])
    per_item = ordered.groupby(item_col).apply(
        lambda rows: make_user_rank_tuple(rows, user_col, time_col))
    per_item = per_item.reset_index().rename(columns={0: list_col})
    return dict(zip(per_item[item_col], per_item[list_col]))

In [5]:
# Replace (date, rank) by a single per-user `click_rank` on the offline training clicks.
click = reset_user_time(offline_click)
click

Unnamed: 0,uid,vid,cid,playtime,duration,click_rank
0,100000,105757,100009,10157,3012,1
1,100000,100884,100008,10197,15232,2
2,100000,102207,100144,9866,28855,3
3,100000,107069,100000,9895,17655,4
4,100000,100815,100027,9756,5394,5
...,...,...,...,...,...,...
3439322,105999,105952,100009,12243,6505,23
3439323,105999,100855,100009,2205,29568,24
3439324,105999,100906,100154,33124,6100,25
3439325,105999,103053,100027,8082,6073,26


### 基于Item2Vec得到item的embedding

In [16]:
from gensim.models import Word2Vec
def Item2Vec(click,i2v_dim):
    """
    Train Item2Vec (gensim Word2Vec over per-user click sequences) and return
    {item_id_as_str: embedding_vector}.

    params :
        click : DataFrame with `uid` and `vid` columns; each user's `vid` values, in
            row order, form one training sentence
        i2v_dim : embedding dimension
    NOTE: the `vid` column of the frame passed in is converted to `str` IN PLACE, so
    the caller's DataFrame is changed by this call.
    """
    print('-----Item2Vec data-----')
    click['vid'] = click['vid'].apply(str)  # Word2Vec tokens are strings
    sentences = click.groupby(['uid'])['vid'].apply(list).values  # one token list per user
    print('-----Training Item2Vec-----')
    w2v = Word2Vec(sentences, vector_size=i2v_dim, window=5, min_count=0, workers=40, sg=0, hs=1)
    print('-----Get Item Embedding-----')
    return {token: w2v.wv[token] for token in w2v.wv.key_to_index}

In [17]:
# Train 10-dimensional item embeddings on the offline clicks.
# NOTE: Item2Vec converts click['vid'] to str in place, so `click` is changed by this cell.
item2vec_item_embed_dict = Item2Vec(click, 10)

-----Item2Vec data-----
-----Training Item2Vec-----
-----Get Item Embedding-----


### 基于Item的Embedding计算内容相似度

In [6]:
import os
import pickle
import faiss
from collections import defaultdict  
def get_item_content_simi(item_embed_dict, topk=300, is_load_from_file=True, cache_dir=None):
    """
    Compute item-to-item content similarity (cosine) from item embeddings with FAISS.

    params :
        item_embed_dict : {item_id: embedding vector}; may be None only when a cached
            result can be loaded from disk
        topk : number of nearest neighbours searched per item; because the item itself
            is excluded, at most `topk - 1` neighbours are stored
        is_load_from_file : when True and the cache file exists, return the cached
            result without recomputing
        cache_dir : directory holding `item_content_sim_dict.pkl`; defaults to the
            original hard-coded project directory
    return :
        {item_id: {neighbour_item_id: cosine similarity}}
    raises :
        ValueError when nothing can be loaded from the cache and `item_embed_dict` is
        None or empty
    Side effects: writes the computed result to the cache file.
    """
    if cache_dir is None:
        cache_dir = '/Users/xmly/Desktop/科大讯飞短视频推荐/'
    sim_path = os.path.join(cache_dir, 'item_content_sim_dict.pkl')

    if is_load_from_file and os.path.exists(sim_path):
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted cache files.
        with open(sim_path, 'rb') as f:
            return pickle.load(f)

    if not item_embed_dict:
        raise ValueError(
            'item_embed_dict is required when no cached similarity file is loaded: {}'.format(sim_path))

    print('begin compute similarity using faiss...')
    item_ids = list(item_embed_dict.keys())  # row index -> raw item id
    item_feat_np = np.array(list(item_embed_dict.values()), dtype=np.float32)

    # L2-normalise so that inner product equals cosine similarity; leave all-zero rows as zeros
    # instead of turning them into NaN.
    norms = np.linalg.norm(item_feat_np, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    item_feat_np = item_feat_np / norms

    index = faiss.IndexFlatIP(item_feat_np.shape[1])
    index.add(item_feat_np)
    sim, idx = index.search(item_feat_np, topk)

    item_sim_dict = defaultdict(dict)
    for target_idx, (sim_value_list, rele_idx_list) in enumerate(zip(sim, idx)):
        neighbours = item_sim_dict[item_ids[target_idx]]
        for rele_idx, sim_value in zip(rele_idx_list, sim_value_list):
            # FAISS pads with -1 when the index holds fewer than `topk` vectors, and the query
            # itself is not guaranteed to come first when other vectors tie with it.
            if rele_idx < 0 or rele_idx == target_idx:
                continue
            if len(neighbours) >= topk - 1:
                break
            neighbours[item_ids[rele_idx]] = sim_value

    with open(sim_path, 'wb') as f:
        pickle.dump(item_sim_dict, f)
    return item_sim_dict

In [8]:
# ItemCF2recommend / UserCF2recommend read the content-similarity table from the global
# name `item_content_sim_dict`, so the result must be bound under that name.
# Passing the Item2Vec embeddings lets the table be recomputed when no cached pickle exists.
item_content_sim_dict = get_item_content_simi(item_embed_dict=item2vec_item_embed_dict)
item_sim_dict = item_content_sim_dict  # original name kept for any existing reference

### ItemCF召回

In [9]:
def ItemCF2sim(df, user_col='uid', item_col='vid', rank_col='click_rank', time_decay=0.0):
    """
    Compute the ItemCF item-item similarity matrix from user click histories.

    For every user history (as returned by get_user_item_rank_dict) each ordered pair of
    distinct items (i, j) contributes
        loc_weight * time_weight / log(1 + history_length)
    where
        loc_weight  = (1.0 if j is later in the list than i else 0.7) * 0.8 ** (|pos_j - pos_i| - 1)
        time_weight = exp(-time_decay * |rank_i - rank_j|)
    and the accumulated value is divided by sqrt(count_i * count_j).

    params :
        df : click DataFrame
        user_col / item_col / rank_col : column names
        time_decay : non-negative decay per unit of `rank_col` difference. The previous
            hard-coded value 15000 makes exp(-15000 * d) exactly 0.0 for every d >= 1,
            i.e. every similarity was 0 with integer click ranks. The default 0.0 gives a
            neutral time weight of 1.0 (as Swing and UserCf2sim already use), leaving
            recency to `loc_weight`.
    return :
        (sim_item_corr, user_item_time_dict)
    NOTE(review): histories are ordered by ascending `rank_col`; whether "later in the
    list" means newer or older depends on how `click_rank` was built — confirm that the
    1.0 / 0.7 asymmetry points the intended way.
    """
    user_item_time_dict = get_user_item_rank_dict(df, user_col, item_col, rank_col)
    sim_item = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        hist_norm = math.log(1 + len(item_time_list))  # down-weight very active users
        for loc_1, (i, i_time) in enumerate(item_time_list):
            item_cnt[i] += 1
            related = sim_item.setdefault(i, {})
            for loc_2, (relate_item, related_time) in enumerate(item_time_list):
                if i == relate_item:
                    continue
                loc_alpha = 1.0 if loc_2 > loc_1 else 0.7
                loc_weight = loc_alpha * (0.8 ** (abs(loc_2 - loc_1) - 1))
                time_weight = math.exp(-time_decay * abs(i_time - related_time))
                related[relate_item] = related.get(relate_item, 0) + loc_weight * time_weight / hist_norm

    # Normalise by item popularity into a fresh dict (the raw accumulator is left untouched).
    sim_item_corr = {
        i: {j: cij / math.sqrt(item_cnt[i] * item_cnt[j]) for j, cij in related_items.items()}
        for i, related_items in tqdm(sim_item.items())
    }
    return sim_item_corr, user_item_time_dict

def ItemCF2recommend(sim_item_corr, user_item_time_dict, user_id, top_k, item_num, alpha=15000,
                         item_cnt_dict=None, user_cnt_dict=None):
    """
    ItemCF recall for one user.

    params :
        sim_item_corr : item-item similarity matrix {i: {j: score}} from ItemCF
        user_item_time_dict : {user: [(item, time), ...]}
        user_id : user to recall for
        top_k : number of most-similar items expanded per history item
        item_num : number of recalled items returned
        alpha : growth rate of the time weight per unit of `time`
        item_cnt_dict : item frequency dict (currently unused)
        user_cnt_dict : user frequency dict (currently unused)
    return :
        list of (item, score) sorted by descending score, at most `item_num` long;
        an empty list when the user is unknown or has no history

    Reads the global `item_content_sim_dict` ({i: {j: content similarity}}).

    The time weight is exp(alpha * (time - reference)), where the reference is the
    largest time in the history (smallest when alpha < 0) instead of the smallest. This
    only rescales all of a user's scores by one positive constant, so the ordering is
    unchanged, but the exponent is never positive: the previous form overflowed to
    inf (and then nan) as soon as alpha * (time - min_time) exceeded ~709, which is
    always the case for alpha=15000 with integer click ranks.
    NOTE(review): with alpha=15000 and integer ranks only the history item with the
    largest `time` keeps a non-zero weight; alpha looks calibrated for a much finer time
    unit — confirm and retune.
    """
    global item_content_sim_dict  # item content-similarity matrix
    if user_id not in user_item_time_dict:
        return []
    interacted_item_times = user_item_time_dict[user_id]  # (item, time) pairs of this user
    if not interacted_item_times:
        return []

    times = [time for _, time in interacted_item_times]
    ref_time = max(times) if alpha >= 0 else min(times)  # keeps every exponent <= 0
    interacted_items = {item for item, _ in interacted_item_times}
    hist_len = len(interacted_item_times)

    rank = {}  # candidate item -> accumulated recall score
    miss_item_num = 0  # history items absent from the similarity matrix
    for loc, (i, time) in enumerate(interacted_item_times):
        if i not in sim_item_corr:
            miss_item_num += 1
            continue
        time_weight = math.exp(alpha * (time - ref_time))
        loc_weight = 0.9 ** (hist_len - loc)  # later list positions weigh more
        # top_k most similar items of i
        top_related = sorted(sim_item_corr[i].items(), key=lambda x: x[1], reverse=True)[0:top_k]
        for j, wij in top_related:
            if j in interacted_items:
                continue  # never recall something the user already interacted with
            content_weight = 1.0
            sim_ij = item_content_sim_dict.get(i, {}).get(j, None)
            if sim_ij is not None:
                content_weight += sim_ij
            sim_ji = item_content_sim_dict.get(j, {}).get(i, None)
            if sim_ji is not None:
                content_weight += sim_ji
            rank[j] = rank.get(j, 0) + loc_weight * time_weight * content_weight * wij

    if miss_item_num > 10:
        print('user_id={}, miss_item_num={}'.format(user_id, miss_item_num))

    sorted_rank_items = sorted(rank.items(), key=lambda d: d[1], reverse=True)
    return sorted_rank_items[0:item_num]


### UserCF召回

In [18]:
# user-cf
def UserCf2sim(df):
    """
    Compute the UserCF user-user similarity matrix.

    Two users gain 1 / log(1 + number_of_users_of_the_item) for every item they both
    clicked (popular items count less); the accumulated value is then divided by
    sqrt(count_u * count_v), where count_x is the number of clicks of user x.

    return :
        (sim_user_corr, user_item_time_dict) where user_item_time_dict is the capped
        per-user history from get_user_item_rank_dict.
    """
    user_item_time_dict = get_user_item_rank_dict(df)   # user -> [(item, rank), ...]
    item_user_time_dict = get_item_user_rank_dict(df)   # item -> [(user, rank), ...]

    pair_score = {}
    clicks_per_user = defaultdict(int)
    for item, clickers in tqdm(item_user_time_dict.items()):
        num_users = len(clickers)
        for u, _ in clickers:
            clicks_per_user[u] += 1
            neighbours = pair_score.setdefault(u, {})
            for other, _ in clickers:
                if other == u:
                    continue
                weight = 1.0
                # damp the contribution of popular items
                neighbours[other] = neighbours.setdefault(other, 0) + weight / math.log(1 + num_users)

    # normalise in place by the activity of both users
    for u, neighbours in tqdm(pair_score.items()):
        for v in neighbours:
            neighbours[v] = neighbours[v] / math.sqrt(clicks_per_user[u] * clicks_per_user[v])

    return pair_score, user_item_time_dict
def UserCF2recommend(sim_user_corr, user_item_time_dict, user_id, top_k, item_num, alpha=15000,
                         item_cnt_dict=None, user_cnt_dict=None):
    """
    UserCF recall for one user.

    Every item of the `top_k` most similar users that the target user has not interacted
    with is scored with  content_weight * user_similarity, summed over those users, where
        content_weight = 1 + sum over history items i of
                         time_weight(i) * loc_weight(i) * content_sim(i, candidate).

    params :
        sim_user_corr : user-user similarity matrix {u: {v: score}}
        user_item_time_dict : {user: [(item, time), ...]}
        user_id : user to recall for
        top_k : number of most similar users considered
        item_num : number of recalled items returned
        alpha : growth rate of the time weight per unit of `time`
        item_cnt_dict / user_cnt_dict : frequency dicts (currently unused)
    return :
        list of (item, score) sorted by descending score, at most `item_num` long;
        an empty list when the user has no history or no similar users (previously a
        KeyError)

    Reads the global `item_content_sim_dict` ({i: {j: content similarity}}).

    The time weight is exp(alpha * (time - reference)) with the reference chosen so the
    exponent is never positive (largest time for alpha >= 0); the previous reference
    (smallest time) overflowed to inf for alpha=15000 with integer click ranks.
    NOTE(review): with alpha=15000 and integer ranks only the history item with the
    largest `time` keeps a non-zero time weight — confirm the intended time unit.
    """
    global item_content_sim_dict

    if user_id not in user_item_time_dict or user_id not in sim_user_corr:
        return []
    interacted_item_time_list = user_item_time_dict[user_id]
    if not interacted_item_time_list:
        return []

    interacted_items = {i for i, _ in interacted_item_time_list}
    interacted_num = len(interacted_items)
    times = [t for _, t in interacted_item_time_list]
    ref_time = max(times) if alpha >= 0 else min(times)  # keeps every exponent <= 0

    # Combined time * location weight per history item (a repeated item keeps its last position).
    hist_weight = {
        i: math.exp(alpha * (t - ref_time)) * 0.9 ** (interacted_num - loc)
        for loc, (i, t) in enumerate(interacted_item_time_list)
    }

    rank = {}
    content_weight_cache = {}  # the content weight depends only on the candidate, so compute it once
    similar_users = sorted(sim_user_corr[user_id].items(), key=lambda x: x[1], reverse=True)[0:top_k]
    for sim_v, wuv in similar_users:
        if sim_v not in user_item_time_dict:
            continue
        for j, _ in user_item_time_dict[sim_v]:
            if j in interacted_items:
                continue
            if j not in content_weight_cache:
                content_weight = 1.0
                for i, _ in interacted_item_time_list:
                    sim_ij = item_content_sim_dict.get(i, {}).get(j, None)
                    if sim_ij is not None:
                        content_weight += hist_weight[i] * sim_ij
                content_weight_cache[j] = content_weight
            rank[j] = rank.get(j, 0) + content_weight_cache[j] * wuv

    rec_items = sorted(rank.items(), key=lambda d: d[1], reverse=True)
    return rec_items[:item_num]


### Swing召回

In [17]:
def Swing(df, user_col='uid', item_col='vid', rank_col='click_rank', alpha=5.0):
    """
    Swing recall: item-item similarity from pairs of users who co-clicked items.

    For every unordered pair of users (u, v) with co-clicked item set C, each ordered
    pair of distinct items (i, j) from C gains 1 / (alpha + |C|). The accumulated value
    is then divided by sqrt(count_i * count_j), where count_x is the number of clicks
    of item x.

    params :
        df : click DataFrame
        user_col / item_col / rank_col : column names
        alpha : smoothing constant added to the overlap size
    return :
        (sim_item_corr, user_item_time_dict) where user_item_time_dict maps each user to
        the [(item, rank), ...] list collected while scanning the items.

    Each user pair is now visited once per co-clicked item. The previous loop visited
    both (u, v) and (v, u), which stored every co-clicked item twice, doubled |C| in
    the denominator and counted every (i, j) contribution four times.
    NOTE(review): the number of user pairs grows quadratically with an item's audience,
    so very popular items make this expensive in time and memory.
    """
    from itertools import combinations

    # 1. item -> [(u1, t1), (u2, t2), ...], each list ordered by ascending rank
    item_user_df = df.sort_values(by=[item_col, rank_col])
    item_user_time_dict = {
        item: list(zip(group[user_col], group[rank_col]))
        for item, group in item_user_df.groupby(item_col)
    }

    # 2. (u1, u2) -> items both users clicked
    user_item_time_dict = defaultdict(list)
    u_u_cnt = defaultdict(list)
    item_cnt = defaultdict(int)
    for item, user_time_list in tqdm(item_user_time_dict.items()):
        for u, u_time in user_time_list:
            item_cnt[item] += 1
            user_item_time_dict[u].append((item, u_time))
        for (u, _), (relate_u, _) in combinations(user_time_list, 2):
            if relate_u == u:
                continue  # the same user clicked this item more than once
            key = (u, relate_u) if u <= relate_u else (relate_u, u)
            u_u_cnt[key].append(item)

    # 3. (i1, i2) -> raw similarity
    sim_item = {}
    for co_items in u_u_cnt.values():
        pair_weight = 1.0 / (alpha + len(co_items))
        for i in co_items:
            related = sim_item.setdefault(i, {})
            for j in co_items:
                if j == i:
                    continue
                related[j] = related.get(j, 0.) + pair_weight

    # 4. normalise by item counts
    sim_item_corr = {
        i: {j: cij / math.sqrt(item_cnt[i] * item_cnt[j]) for j, cij in related_items.items()}
        for i, related_items in sim_item.items()
    }

    return sim_item_corr, user_item_time_dict

### 召回结果

In [16]:
def get_multi_source_sim_dict_results(click, recall_methods={'item-cf', 'user-cf', 'swing'}):
    """
    Build the similarity matrix of every requested recall channel.

    params :
        click : click DataFrame passed to each channel's similarity builder
        recall_methods : collection of channel names among 'item-cf', 'swing', 'user-cf'
    return :
        {channel name: similarity dict}; channels are built in the fixed order
        item-cf, swing, user-cf.
    """
    # (channel, start message, completion template, lazily evaluated builder)
    channels = (
        ('item-cf', 'item-cf item-sim begin',
         'item-cf item-sim-pair done, pair_num={}', lambda: ItemCF2sim(click)),
        ('swing', 'swing item-sim begin',
         'swing item-sim-pair done, pair_num={}', lambda: Swing(click)),
        ('user-cf', 'user-cf user-sim begin',
         'user-cf user-sim-pair done, pair_num={}', lambda: UserCf2sim(click)),
    )

    recall_sim_pair_dict = {}
    for channel, begin_msg, done_template, build in channels:
        if channel not in recall_methods:
            continue
        print(begin_msg)
        sim_dict, _ = build()
        recall_sim_pair_dict[channel] = sim_dict
        print(done_template.format(len(sim_dict)))
    return recall_sim_pair_dict
def get_recall_results(item_sim_dict, user_item_dict, top50_click,
                       recommend_num, topk_num,
                       if_item = True,
                       target_user_ids=None,
                       item_cnt_dict=None, user_cnt_dict=None):
    """
    Run one recall channel for a set of users.

    params :
        item_sim_dict : similarity matrix of the channel (item-item when `if_item`,
            user-user otherwise)
        user_item_dict : {user: [(item, click_rank), ...]}
        top50_click : fallback items; the first element is recalled with score 0.0
            when a user gets no recall result
        recommend_num : number of items recalled per user (passed as `item_num`)
        topk_num : number of most-similar entries expanded (passed as `top_k`)
        if_item : True -> ItemCF2recommend, False -> UserCF2recommend
        target_user_ids : users to recall for; defaults to every key of `user_item_dict`
        item_cnt_dict / user_cnt_dict : forwarded to the recommender
    return :
        {user: [(item, score), ...]}
    """
    users = user_item_dict.keys() if target_user_ids is None else target_user_ids

    recall_item_dict = {}
    for u in tqdm(users):
        recommend = ItemCF2recommend if if_item else UserCF2recommend
        recall_items = recommend(item_sim_dict, user_item_dict, u, topk_num, recommend_num,
                                 item_cnt_dict=item_cnt_dict, user_cnt_dict=user_cnt_dict)
        if len(recall_items) == 0:
            # keep at least one result so that the user is not lost downstream
            recall_items = [(top50_click[0], 0.0)]
        recall_item_dict[u] = recall_items

    return recall_item_dict

def agg_recall_results(recall_item_dict_list_dict, ret_type='tuple',
                       weight_dict={}):
    """
    Merge the recall results of all channels by summing weighted scores per (user, item).

    params :
        recall_item_dict_list_dict : {channel: {user: [(item, score), ...]}}
        ret_type : 'tuple' -> {user: [(item, score), ...]} sorted by descending score;
                   'df'    -> DataFrame with columns uid, vid, sim;
                   anything else -> {user: {item: score}}
        weight_dict : {channel: weight}; channels not listed use weight 1.0
    """
    print('aggregate recall results begin....')
    merged = {}
    for name, per_user in recall_item_dict_list_dict.items():
        weight = weight_dict.get(name, 1.0)
        print('name={}, weight={}'.format(name, weight))
        for u, recall_items in per_user.items():
            bucket = merged.setdefault(u, {})
            for i, score in recall_items:
                bucket[i] = bucket.get(i, 0.0) + weight * score  # accumulate across channels

    if ret_type == 'tuple':
        return {
            u: sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for u, scores in merged.items()
        }

    if ret_type == 'df':
        records = [(u, i, score) for u, scores in merged.items() for i, score in scores.items()]
        return pd.DataFrame.from_records(records, columns=['uid', 'vid', 'sim'])

    return merged

def do_multi_recall_results(recall_sim_pair_dict, user_item_dict,top50_click,
                            recommend_num,topk_num,
                            target_user_ids=None, ret_type='df',
                            item_cnt_dict=None, user_cnt_dict=None):
    """
    Run every recall channel and aggregate the results.

    params :
        recall_sim_pair_dict : {channel name: similarity matrix}
        user_item_dict : {user: [(item, click_rank), ...]}
        top50_click : fallback items for users without any recall result
        recommend_num : number of items recalled per user and channel
        topk_num : number of most-similar entries expanded per history item / user
        target_user_ids : users to recall for; defaults to every key of `user_item_dict`
        ret_type : forwarded to agg_recall_results ('df', 'tuple' or anything else)
        item_cnt_dict / user_cnt_dict : forwarded to the recommenders
    return :
        the aggregated recall result in the shape selected by `ret_type`

    Each channel is called with its own similarity matrix (`sim_dict`). The previous
    code passed the unrelated global `item_sim_dict` to every channel and dropped
    `target_user_ids`, `item_cnt_dict` and `user_cnt_dict`.
    """
    if target_user_ids is None:
        target_user_ids = user_item_dict.keys()

    item_based_channels = {'item-cf', 'bi-graph', 'swing'}  # all other channels are user-based

    recall_item_list_dict = {}
    for name, sim_dict in recall_sim_pair_dict.items():
        recall_item_dict = get_recall_results(sim_dict, user_item_dict, top50_click,
                                              recommend_num, topk_num,
                                              if_item=name in item_based_channels,
                                              target_user_ids=target_user_ids,
                                              item_cnt_dict=item_cnt_dict, user_cnt_dict=user_cnt_dict)

        print('{} recall done, recall_user_num={}.'.format(name, len(recall_item_dict)))
        recall_item_list_dict[name] = recall_item_dict
    return agg_recall_results(recall_item_list_dict, ret_type=ret_type)

In [20]:
# Run the recall stage: build the similarity matrices of the selected channels.
# Click counts per item and per user (number of rows in `click`).
item_cnt_dict = click.groupby(['vid'])['uid'].count().to_dict()
user_cnt_dict = click.groupby(['uid'])['vid'].count().to_dict()
recall_methods={'item-cf', 'user-cf'}
recall_sim_pair_dict = get_multi_source_sim_dict_results(click, recall_methods)

item-cf item-sim begin


  0%|          | 0/6000 [00:00<?, ?it/s]

  0%|          | 0/5852 [00:00<?, ?it/s]

item-cf item-sim-pair done, pair_num=5852
user-cf user-sim begin


  0%|          | 0/9749 [00:00<?, ?it/s]

KeyboardInterrupt: 