In [4]:
%run utils.ipynb

import pandas as pd
import numpy as np
from tqdm import tqdm
from annoy import AnnoyIndex
import os
import warnings
from collections import defaultdict
import math
import pickle
import multitasking
import signal



# Configure the `multitasking` library used by the @multitasking.task
# function below: cap the worker count at 10 and select the 'process' engine
# (presumably separate OS processes rather than threads -- confirm against
# the multitasking documentation).
multitasking.set_max_threads(10)
multitasking.set_engine('process')
# Route Ctrl-C (SIGINT) to multitasking.killall so outstanding tasks are
# stopped when the notebook cell is interrupted.
signal.signal(signal.SIGINT, multitasking.killall)

# Notebook display settings: never truncate columns or rows when a DataFrame
# is rendered, and show floats with 10 decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully qualified option name. The bare pattern 'precision' is matched
# against every registered option and becomes ambiguous (OptionError) in
# pandas versions that also define 'styler.format.precision'.
pd.set_option('display.precision', 10)

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings('ignore')

In [5]:
# Load the query table (df_qtime) and the click log (df_click) from their
# pickle caches.
# NOTE(review): pickle files can execute arbitrary code when loaded; only
# read files produced by this project.
df_qtime, df_click = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('../user_data/data/qtime.pkl',
                     '../user_data/data/click.pkl')
)

In [6]:
# Distinct phase ids found in the query table, in ascending order.
phases = sorted(df_qtime['phase'].unique())
phases

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [7]:
def euc(v, w, zero_dist=10):
    """Euclidean distance between two vectors, with a fallback for blanks.

    Args:
        v, w: Array-likes of equal shape (numpy arrays, lists or tuples).
        zero_dist: Value returned when *both* vectors contain only zeros
            (or are empty). The default of 10 keeps the original behaviour.
            An all-zero pair would otherwise get distance 0 and look like a
            perfect match.

    Returns:
        ``zero_dist`` if both vectors are all zeros, otherwise
        ``sqrt(sum((v - w) ** 2))``.
    """
    # Coerce to float arrays so plain Python sequences work and unsigned
    # integer inputs cannot wrap around on subtraction.
    v = np.asarray(v, dtype=float)
    w = np.asarray(w, dtype=float)
    # `.any()` is False only when every element is zero (NaN counts as set).
    if not v.any() and not w.any():
        return zero_dist
    return np.sqrt(np.sum(np.square(v - w)))

In [8]:
def cal_item_sim(
        df,
        user_col,
        item_col,
        item_feat_path='../data/underexpose_train/underexpose_item_feat.csv'):
    """Build an item-to-item similarity table from per-user click sequences.

    Two items clicked by the same user contribute to each other's similarity
    when they are at most 5 positions apart in that user's click list and
    their click times differ by at most 0.000003. Each contribution is
    weighted by the text/image embedding distance, the click direction
    (forward 1.0, backward 0.7), a 0.8 decay per extra position, the time
    gap, and the length of the user's click list. The accumulated scores are
    finally divided by sqrt(count_i * count_j) to damp frequent items.

    Args:
        df: Click log. Must contain ``user_col``, ``item_col`` and the
            literal columns ``'item_id'`` (used for the feature merge and
            the vector lookup) and ``'time'``.
        user_col: Name of the user id column.
        item_col: Name of the item id column.
        item_feat_path: Header-less CSV with one row per item: item id, 128
            text-vector values and 128 image-vector values, where each vector
            is wrapped in one leading and one trailing bracket character.

    Returns:
        Tuple ``(sim_item_corr, user_item_dict, item_cnt, user_time_dict)``:
        nested dict ``item -> related item -> similarity``, dict
        ``user -> list of clicked items``, dict ``item -> click count`` and
        dict ``user -> list of click times`` (aligned with the item lists).
    """
    max_pos_gap = 5           # max distance between two clicks in a sequence
    max_time_gap = 0.000003   # max click-time difference for a valid pair

    # Items clicked by each user, in the row order of `df`.
    user_item_ = df.groupby(user_col)[item_col].agg(list).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))

    # Click times of each user, aligned with the item lists above.
    user_time_ = df.groupby(user_col)['time'].agg(list).reset_index()
    user_time_dict = dict(zip(user_time_[user_col], user_time_['time']))

    # Load the item feature vectors and strip the bracket that wraps the
    # first and last component of each 128-d vector.
    txt_vec_cols = ['txt_vec_{}'.format(i) for i in range(128)]
    img_vec_cols = ['img_vec_{}'.format(i) for i in range(128)]
    vec_cols = txt_vec_cols + img_vec_cols
    vec_df = pd.read_csv(item_feat_path, names=['item_id'] + vec_cols)
    for cols in (txt_vec_cols, img_vec_cols):
        vec_df[cols[0]] = vec_df[cols[0]].apply(lambda x: x.strip()[1:])
        vec_df[cols[-1]] = vec_df[cols[-1]].apply(lambda x: x.strip()[:-1])
    vec_df[vec_cols] = vec_df[vec_cols].astype('float')

    # Left join keeps every click; items without features get zero vectors.
    df = df.merge(vec_df, on='item_id', how='left')
    df[vec_cols] = df[vec_cols].fillna(0)
    txt_vec_dict = dict(zip(df['item_id'], df[txt_vec_cols].values))
    img_vec_dict = dict(zip(df['item_id'], df[img_vec_cols].values))

    # Accumulate the raw pairwise similarity.
    sim_item = {}
    item_cnt = defaultdict(int)
    for user, items in tqdm(user_item_dict.items()):
        times = user_time_dict[user]
        n_items = len(items)
        # Users with long click lists contribute less per pair.
        log_len = math.log(1 + n_items)
        for loc1, item in enumerate(items):
            item_cnt[item] += 1
            related = sim_item.setdefault(item, {})
            t1 = times[loc1]
            # Only positions within `max_pos_gap` can qualify, so visit just
            # that window instead of the user's whole click list.
            window = range(max(0, loc1 - max_pos_gap),
                           min(n_items, loc1 + max_pos_gap + 1))
            for loc2 in window:
                relate_item = items[loc2]
                t2 = times[loc2]
                if item == relate_item or abs(t2 - t1) > max_time_gap:
                    continue

                # The larger the embedding distance, the smaller the factor.
                txt_euc_factor = 1 / np.sqrt(
                    1 + euc(txt_vec_dict[item], txt_vec_dict[relate_item]))
                img_euc_factor = 1 / np.sqrt(
                    1 + euc(img_vec_dict[item], img_vec_dict[relate_item]) / 10)

                if loc1 > loc2:
                    # Backward: the related item was clicked earlier.
                    direction_weight, gap, dt = 0.7, loc1 - loc2, t1 - t2
                else:
                    # Forward: the related item was clicked later.
                    direction_weight, gap, dt = 1.0, loc2 - loc1, t2 - t1

                related[relate_item] = related.get(relate_item, 0) + (
                    txt_euc_factor * img_euc_factor * direction_weight *
                    (0.8**(gap - 1)) * (1 - dt * 10000) / log_len)

    # Normalise by item popularity so frequent items do not dominate. A new
    # nested dict is built rather than mutating `sim_item` through a shallow
    # copy.
    sim_item_corr = {
        i: {
            j: cij / math.sqrt(item_cnt[i] * item_cnt[j])
            for j, cij in related_items.items()
        }
        for i, related_items in tqdm(sim_item.items())
    }

    return sim_item_corr, user_item_dict, item_cnt, user_time_dict

In [9]:
def recall(df_qtime, item_sim_list, user_item, item_cnt, user_time_dict):
    """Recall up to 100 candidate items per query from item similarities.

    For each query row the user's clicked items are walked from the most
    recent to the oldest. Each clicked item contributes its 500 most similar
    items, weighted by the similarity, a 0.7 decay per step back in the click
    list, the candidate's click count and a factor that shrinks as the gap
    between query time and click time grows. Items the user already clicked
    are never recalled.

    Args:
        df_qtime: DataFrame whose rows unpack, in column order, into
            (user_id, query_time, item_id, phase). ``item_id == -1`` marks a
            query without a known answer.
        item_sim_list: Nested dict ``item -> related item -> similarity``.
        user_item: Dict ``user -> list of clicked items``.
        item_cnt: Dict ``item -> click count``.
        user_time_dict: Dict ``user -> list of click times`` aligned with
            ``user_item``.

    Returns:
        DataFrame with columns user_id, phase, query_time, item_id,
        sim_score, label. ``label`` is NaN for ``item_id == -1`` queries,
        otherwise 1 on the row matching the query's item and 0 elsewhere.
        Empty (but with these columns) when ``df_qtime`` has no rows.
    """
    columns = [
        'user_id', 'phase', 'query_time', 'item_id', 'sim_score', 'label'
    ]
    data_list = []
    for user_id, query_time, item_id, phase in tqdm(df_qtime.values):
        rank = {}
        # Most recent click first. Users without any click yield no
        # candidates instead of raising KeyError.
        interacted_items = user_item.get(user_id, [])[::-1]
        click_times = user_time_dict.get(user_id, [])[::-1]
        # Set for O(1) "already clicked" checks inside the hot loop.
        seen = set(interacted_items)
        for loc, i in enumerate(interacted_items):
            # Clicks long before the query contribute less.
            time_factor = 1 - 1000 * (query_time - click_times[loc])
            # Older clicks in the sequence are damped geometrically.
            decay = 0.7**loc
            # Top 500 neighbours of the clicked item, by similarity.
            for j, wij in sorted(item_sim_list.get(i, {}).items(),
                                 key=lambda d: d[1],
                                 reverse=True)[0:500]:
                if j not in seen:
                    rank[j] = rank.get(
                        j, 0) + wij * decay * item_cnt[j] * time_factor

        # Keep the 100 candidates with the highest accumulated score.
        sim_items = sorted(rank.items(), key=lambda d: d[1],
                           reverse=True)[:100]

        df_temp = pd.DataFrame({
            'item_id': [item[0] for item in sim_items],
            'sim_score': [item[1] for item in sim_items],
        })
        df_temp['user_id'] = user_id
        df_temp['query_time'] = query_time
        df_temp['phase'] = phase

        # Queries without a known answer get NaN; otherwise mark the row
        # whose item matches the query's item.
        if item_id == -1:
            df_temp['label'] = np.nan
        else:
            df_temp['label'] = 0
            df_temp.loc[df_temp['item_id'] == item_id, 'label'] = 1

        df_temp = df_temp.sort_values(['sim_score'], ascending=False)
        # Cast on the selected copy rather than assigning into a slice.
        df_temp = df_temp[columns].astype({'user_id': 'int', 'item_id': 'int'})

        data_list.append(df_temp)

    if not data_list:
        # pd.concat raises on an empty list; return an empty, typed-by-name
        # frame so callers can still pickle / concatenate it.
        return pd.DataFrame(columns=columns)

    df_data = pd.concat(data_list, sort=False)

    return df_data

In [10]:
# Run each phase as a `multitasking` task (the 'process' engine is selected
# in the setup cell, so this is not thread-based).
@multitasking.task
def work(phase, force=False):
    """Compute and cache item similarities and recall results for one phase.

    Writes ``sim_<phase>.pkl`` (similarity dict) and ``recall_<phase>.pkl``
    (recall DataFrame) under ``../user_data/model/recall_v1``. Nothing is
    recomputed when both files already exist, unless ``force`` is true.

    Args:
        phase: Phase id used to filter the global ``df_click`` and
            ``df_qtime`` frames.
        force: Recompute even if both cache files exist.
    """
    out_dir = '../user_data/model/recall_v1'
    os.makedirs(out_dir, exist_ok=True)
    sim_path = '{}/sim_{}.pkl'.format(out_dir, phase)
    recall_path = '{}/recall_{}.pkl'.format(out_dir, phase)

    # Both artefacts cached and no forced refresh: nothing to do.
    if not force and os.path.exists(sim_path) and os.path.exists(recall_path):
        return

    # Item similarities from this phase's clicks.
    df_click_phase = df_click[df_click['phase'] == phase]
    item_sim, user_item, item_cnt, user_time_dict = cal_item_sim(
        df_click_phase, 'user_id', 'item_id')

    # `with` closes the file even if pickling fails part-way.
    with open(sim_path, 'wb') as f:
        pickle.dump(item_sim, f)

    # Recall candidates for this phase's queries.
    df_qtime_phase = df_qtime[df_qtime['phase'] == phase]
    df_data = recall(df_qtime_phase, item_sim, user_item, item_cnt,
                     user_time_dict)
    df_data.to_pickle(recall_path)

    print('phase {} finish'.format(phase))

In [11]:
item_sim_phase = {}  # phase -> item similarity dict
val_score = np.array([0.0, 0.0, 0.0, 0.0])  # running sum of per-phase scores
force = False  # set True to ignore the cached per-phase files

# Launch one task per phase, then block until all of them have finished.
for phase in phases:
    work(phase, force)

multitasking.wait_for_tasks()
print('合并任务')

# Collect the per-phase artefacts written by `work`.
recall_frames = []
for phase in phases:
    # NOTE(review): pickle.load can execute arbitrary code; these files are
    # expected to be the ones written by `work` above.
    with open('../user_data/model/recall_v1/sim_{}.pkl'.format(phase),
              'rb') as f:
        item_sim = pickle.load(f)

    df_data = pd.read_pickle(
        '../user_data/model/recall_v1/recall_{}.pkl'.format(phase))

    item_sim_phase[phase] = item_sim
    recall_frames.append(df_data)

    score = evaluate_scores(df_data, phase)
    val_score += score

    print('phase', phase, score)

# Concatenate once: DataFrame.append inside the loop copies the accumulated
# frame on every iteration and no longer exists in pandas 2.x.
df_recall = (pd.concat(recall_frames, sort=False)
             if recall_frames else pd.DataFrame())

100%|██████████| 256/256 [00:00<00:00, 451.71it/s]
  0%|          | 0/256 [00:00<?, ?it/s]452.28it/s]
100%|██████████| 256/256 [00:00<00:00, 429.66it/s]
100%|██████████| 256/256 [00:00<00:00, 367.79it/s]
100%|██████████| 256/256 [00:00<00:00, 400.47it/s]
100%|██████████| 256/256 [00:00<00:00, 381.52it/s]
100%|██████████| 256/256 [00:00<00:00, 363.55it/s]
100%|██████████| 256/256 [00:00<00:00, 354.14it/s]
100%|██████████| 256/256 [00:00<00:00, 346.84it/s]
100%|██████████| 256/256 [00:00<00:00, 325.10it/s]
100%|██████████| 18505/18505 [00:43<00:00, 428.13it/s]
100%|██████████| 40768/40768 [00:00<00:00, 68347.95it/s]
100%|██████████| 18398/18398 [00:43<00:00, 419.90it/s]
100%|██████████| 41024/41024 [00:00<00:00, 66075.62it/s]
100%|██████████| 18672/18672 [00:45<00:00, 413.81it/s]
100%|██████████| 41400/41400 [00:00<00:00, 69854.22it/s]
100%|██████████| 20047/20047 [00:46<00:00, 431.26it/s]
100%|██████████| 44355/44355 [00:00<00:00, 68459.65it/s]
100%|██████████| 18821/18821 [00:48<00:00,

phase 0 finish


 86%|████████▌ | 16717/19459 [02:04<00:18, 148.85it/s]

phase 2 finish


100%|██████████| 20047/20047 [02:14<00:00, 148.91it/s]
 95%|█████████▌| 18937/19883 [02:12<00:06, 149.63it/s]

phase 1 finish


100%|██████████| 18821/18821 [02:15<00:00, 138.58it/s]
100%|██████████| 19883/19883 [02:18<00:00, 143.44it/s]
100%|██████████| 19801/19801 [02:19<00:00, 142.30it/s]
100%|██████████| 18618/18618 [02:14<00:00, 138.09it/s]
 95%|█████████▌| 18515/19459 [02:16<00:05, 160.35it/s]

phase 9 finish


 97%|█████████▋| 18962/19459 [02:19<00:03, 163.76it/s]

phase 3 finish


100%|██████████| 19459/19459 [02:22<00:00, 136.27it/s]
 91%|█████████ | 18555/20396 [02:18<00:13, 135.75it/s]

phase 8 finish


 92%|█████████▏| 18770/20396 [02:19<00:09, 177.76it/s]

phase 7 finish


 94%|█████████▎| 19084/20396 [02:21<00:07, 175.01it/s]

phase 4 finish


 98%|█████████▊| 20064/20396 [02:27<00:01, 180.12it/s]

phase 5 finish


100%|██████████| 20396/20396 [02:29<00:00, 136.79it/s]


phase 6 finish
合并任务


100%|██████████| 18504/18504 [00:12<00:00, 1528.94it/s]


phase 0 (0.052688683624598324, 0.118156988481178, 0.024452030143499365, 0.06921754084264832)


100%|██████████| 18670/18670 [00:12<00:00, 1531.25it/s]


phase 1 (0.053108878016302555, 0.12103151186120618, 0.022319277474255158, 0.06613946800862688)


100%|██████████| 18396/18396 [00:12<00:00, 1524.14it/s]


phase 2 (0.053747003629359275, 0.12347378501316734, 0.021545213866758853, 0.06701183431952663)


100%|██████████| 18820/18820 [00:12<00:00, 1523.29it/s]


phase 3 (0.053213183533173766, 0.1263268400793188, 0.021339148442584596, 0.06913724915775597)


100%|██████████| 18617/18617 [00:12<00:00, 1519.63it/s]


phase 4 (0.057097395613414696, 0.13258426966292136, 0.022283770772319057, 0.06908197727970525)


100%|██████████| 19458/19458 [00:13<00:00, 1485.30it/s]


phase 5 (0.05681835535421251, 0.1318724874016194, 0.024099469917594683, 0.07245155855096883)


100%|██████████| 20392/20392 [00:13<00:00, 1525.91it/s]


phase 6 (0.05816232617571826, 0.13394347240915208, 0.026341899211875738, 0.07743076016499706)


100%|██████████| 19800/19800 [00:13<00:00, 1467.60it/s]


phase 7 (0.04693263221319276, 0.11141968451455232, 0.021338058168548314, 0.06417704011065006)


100%|██████████| 19880/19880 [00:13<00:00, 1521.67it/s]


phase 8 (0.04569840261620969, 0.10772211458621644, 0.021860698481693683, 0.06657496561210453)


100%|██████████| 20044/20044 [00:12<00:00, 1542.91it/s]

phase 9 (0.04900353889215489, 0.11139655643618475, 0.024014846973565637, 0.07035309793471019)





In [12]:
# Save the per-phase similarity dictionaries for later steps. The `with`
# block closes the file even if pickling raises.
with open('../user_data/model/if_sim.pkl', 'wb') as f:
    pickle.dump(item_sim_phase, f)

In [13]:
# Element-wise sum, over all phases, of the score tuples returned by
# evaluate_scores (a sum, not an average).
val_score

array([0.5264704 , 1.21792771, 0.22959441, 0.69157549])

In [14]:
# Order the merged recall table by user, phase and query time, cache it to
# disk, then preview the first rows.
df_recall = df_recall.sort_values(['user_id', 'phase', 'query_time'])
df_recall.to_pickle('../user_data/data/recall_v1.pkl')
df_recall.head(15)

Unnamed: 0,user_id,phase,query_time,item_id,sim_score,label
0,1,0.0,0.9839419315,92349,0.0789179728,0.0
1,1,0.0,0.9839419315,87837,0.0672007507,0.0
2,1,0.0,0.9839419315,38168,0.0592249611,0.0
3,1,0.0,0.9839419315,91290,0.0493173711,0.0
4,1,0.0,0.9839419315,13663,0.0435710306,0.0


In [86]:
# Number of recalled rows for user 1, across all phases and queries.
int((df_recall['user_id'] == 1).sum())

963