##### TagBased

基于标签的简单个性化推荐。
    
    1) 统计用户最常用的标签。
    2) 对每个标签，统计被打过的标签次数最多的物品。
    3) 对每个用户，找到他常用标签，然后将这些标签的最热门物品推荐给用户。

用户对物品i的兴趣度：$p(u, i) = \sum_{b}n_{u,b}n_{b,i}$

$n_{u,b}$是用户u打过标签b的次数，$n_{b,i}$标签b下物品i被点击的次数。

##### TagBasedTFIDF

基于标签的简单个性化推荐存在热门标签权重过大的问题，从而过多的推荐热门商品，降低了推荐结果的新颖性。为了解决这个问题，尝试借鉴TFIDF思想，对公式进行改进。

$p(u, i) = \sum_{b}\frac{n_{u,b}}{log(1+n_{b}^{u})}n_{b,i}$

其中$n_{b}^{u}$记录标签b被多少个用户使用。

##### TagBasedTFIDF++

相比TagBased，同时对热门物品和热门标签进行惩罚。

$p(u, i) = \sum_{b}\frac{n_{u,b}}{log(1+n_{b}^{u})}\frac{n_{b,i}}{log(1+n_{i}^{u})}$

In [63]:
import os
import time
import math
import random
import numpy as np
import pandas as pd
from collections import defaultdict  
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

def timmer(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        res = func(*args, **kwargs)
        stop_time = time.time()
        print('Func %s, run time: %s' % (func.__name__, stop_time - start_time))
        return res

    return wrapper

In [64]:
class Dataset():
    def __init__(self, rating_path, item_path):
        """ 读取数据，构建数据集 """
        self.data = self.load_data(rating_path, item_path)
        
    def load_data(self, rating_path, item_path):
        df = pd.read_csv(rating_path, names=['user_id', 'item_id', 'rating', 'click_timestamp'], sep='::')
        item_df = pd.read_csv(item_path, names=['item_id', 'title', 'item_cate_id'], sep='::')
        # 对Item类别标签进行处理
        item_df['item_cate_id'] = item_df['item_cate_id'].map(lambda x: x.split('|'))
        # 数据合并
        df = pd.merge(df, item_df, on='item_id')
        return df
    
    def split_data(self, M, k, seed=1024):
        """ 数据集划分
        :param M 划分的折数
        :param k 第几次划分k∈[0, M)
        :return: train test
        """        
        self.data = self.data.sample(frac=1, random_state=seed)
        self.data.reset_index(inplace=True, drop=True)
        train = self.data[self.data.index % M != k]
        test = self.data[self.data.index % M == k]
        
        def convert_dict(data):
            """ 转化为字典形式
            :params data [(user1, item1), (user2, item2)]
            :return: {user1: [(item1, time1), (item2, time2)..]}
            """
            
            data = data.sort_values(['user_id', 'click_timestamp'])
            
            def make_item_tags_pair(df):
                return list(zip(df['item_id'], df['item_cate_id']))
            
            user_item_time_df = data.groupby('user_id')['item_id', 'item_cate_id'].apply(lambda x: make_item_tags_pair(x))\
                                                                    .reset_index().rename(columns={0: 'item_tags_list'})
            
            user_item_tags_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_tags_list']))
            return user_item_tags_dict
            
        return convert_dict(train), convert_dict(test)

##### TagBased

In [84]:
@timmer
def tag_based_rec(train, K, N):
    """ 基于兴趣标签推荐
    :param train 训练集
    :params: K TopK相似用户/物品
    :params: N 推荐物品个数TopN
    """
    user_tag_dict = {}
    tag_item_dict = {}
    
    def init_mat_dict(d, v1, v2):
        """ 统计d[v1][v2]数量 """
        if v1 not in d:
            d[v1] = {}
        if v2 not in d[v1]:
            d[v1][v2] = 0
        d[v1][v2] += 1
        return d
    
    for user_id, item_tags_list in tqdm(train.items()):
        # 位置 itemID 标签列表
        for loc, (item_id, i_tags) in enumerate(item_tags_list):
            for tag in i_tags:
                user_tag_dict = init_mat_dict(user_tag_dict, user_id, tag)
                tag_item_dict = init_mat_dict(tag_item_dict, tag, item_id)
    print('tag len:', len(tag_item_dict.keys()))
                
    def get_recommendation(user):
        """ 推荐用户未消费的前N个物品 """
        seen_items = set([i[0] for i in train[user]]) # 用户点击过得物品
        rec_item_dict = {}
        for tag, nub in user_tag_dict[user].items(): # 用户的兴趣标签
            for item, nbi in tag_item_dict[tag].items(): # 标签下的热门物品
                if item not in rec_item_dict:
                    rec_item_dict[item] = 0
                rec_item_dict[item] += nub * nbi # 用户点击标签次数 * 标签下的Item点击次数
        rec_items = {k: rec_item_dict[k] for k in rec_item_dict.keys() if k not in seen_items}
        rec_items = [item for item in sorted(rec_items.items(), key=lambda x:x[1], reverse=True)]
        return rec_items[:N]
    
    return get_recommendation

##### TagBasedTfidf

In [85]:
@timmer
def tag_based_tfidf_rec(train, K, N):
    """ 基于兴趣标签推荐
    :param train 训练集
    :params: K TopK相似用户/物品
    :params: N 推荐物品个数TopN
    """
    user_tag_dict = {}
    tag_item_dict = {}
    tag_user_dict = {}
    
    def init_mat_dict(d, v1, v2):
        """ 统计d[v1][v2]数量 """
        if v1 not in d:
            d[v1] = {}
        if v2 not in d[v1]:
            d[v1][v2] = 0
        d[v1][v2] += 1
        return d
    
    for user_id, item_tags_list in tqdm(train.items()):
        # 位置 itemID 标签列表
        for loc, (item_id, i_tags) in enumerate(item_tags_list):
            for tag in i_tags:
                user_tag_dict = init_mat_dict(user_tag_dict, user_id, tag)
                tag_item_dict = init_mat_dict(tag_item_dict, tag, item_id)
                tag_user_dict = init_mat_dict(tag_user_dict, tag, user_id)
    
    def get_recommendation(user):
        """ 推荐用户未消费的前N个物品 """
        seen_items = set([i[0] for i in train[user]]) # 用户点击过得物品
        rec_item_dict = {}
        for tag, nub in user_tag_dict[user].items(): # 用户的兴趣标签
            for item, nbi in tag_item_dict[tag].items(): # 标签下的热门物品
                if item not in rec_item_dict:
                    rec_item_dict[item] = 0
                rec_item_dict[item] += nub * nbi / np.log(1+len(tag_user_dict[tag])) # 用户点击标签次数 * 标签下的Item点击次数 / log(1+当前标签被点击的次数)
        
        rec_items = {k: rec_item_dict[k] for k in rec_item_dict.keys() if k not in seen_items}
        rec_items = [item for item in sorted(rec_items.items(), key=lambda x:x[1], reverse=True)]
        return rec_items[:N]
    
    return get_recommendation

##### TagBasedTfidf++

In [86]:
@timmer
def tag_based_tfidf_plus_rec(train, K, N):
    """ 基于兴趣标签推荐
    :param train 训练集
    :params: K TopK相似用户/物品
    :params: N 推荐物品个数TopN
    """
    user_tag_dict = {}
    tag_item_dict = {}
    tag_user_dict = {}
    item_user_dict = {}
    
    def init_mat_dict(d, v1, v2):
        """ 统计d[v1][v2]数量 """
        if v1 not in d:
            d[v1] = {}
        if v2 not in d[v1]:
            d[v1][v2] = 0
        d[v1][v2] += 1
        return d
    
    for user_id, item_tags_list in tqdm(train.items()):
        # 位置 itemID 标签列表
        for loc, (item_id, i_tags) in enumerate(item_tags_list):
            for tag in i_tags:
                user_tag_dict = init_mat_dict(user_tag_dict, user_id, tag)
                tag_item_dict = init_mat_dict(tag_item_dict, tag, item_id)
                tag_user_dict = init_mat_dict(tag_user_dict, tag, user_id)
            item_user_dict = init_mat_dict(item_user_dict, item_id, user_id)
                
    def get_recommendation(user):
        """ 推荐用户未消费的前N个物品 """
        seen_items = set([i[0] for i in train[user]]) # 用户点击过得物品
        rec_item_dict = {}
        for tag, nub in user_tag_dict[user].items(): # 用户的兴趣标签
            for item, nbi in tag_item_dict[tag].items(): # 标签下的热门物品
                if item not in rec_item_dict:
                    rec_item_dict[item] = 0
                rec_item_dict[item] += nub * nbi / np.log(1+len(tag_user_dict[tag])) / np.log(1+len(item_user_dict[item_id])) # 用户点击标签次数 * 标签下的Item点击次数 / log(1+当前标签被点击的次数)
        
        rec_items = {k: rec_item_dict[k] for k in rec_item_dict.keys() if k not in seen_items}
        rec_items = [item for item in sorted(rec_items.items(), key=lambda x:x[1], reverse=True)]
        return rec_items[:N]
    
    return get_recommendation

##### 模型评估

In [87]:
class Metric():
    def __init__(self, train, test, get_recommendation):
        """ 获取测试集所有用户的推荐结果，进行评估 """
        self.train = train
        self.test = test
        self.get_recommendation = get_recommendation
        self.rec_result = self.get_rec_result()
    
    def get_rec_result(self):
        """ 获取测试集推荐结果 """
        rec_result = {}
        for user in tqdm(self.test):
            rec_result[user] = self.get_recommendation(user)
        return rec_result
    
    # {user1: [(item1, time1), (item2, time2)..]}
    def precision(self):
        """ 精确率，命中的item占所有推荐item的比例 """
        _hit, _all = 0, 0
        for user in self.test:
            items = set([i[0] for i in self.test[user]])
            rank = self.rec_result[user]
            for item, _ in rank:
                if item in items:
                    _hit += 1
            _all += len(rank)
        return round(_hit / _all * 100, 2)
    
    def recall(self):
        """ 召回率，命中的item占所有真实点击item的比例 """
        _hit, _all = 0, 0
        for user in self.test:
            items = set([i[0] for i in self.test[user]])
            rank = self.rec_result[user]
            for item, _ in rank:
                if item in items:
                    _hit += 1
            _all += len(items)
        return round(_hit / _all * 100, 2)
    
    def coverage(self):
        """ 覆盖率，推荐的item占所有商品的比例 """
        all_items, rec_items = set(), set()
        for user in self.test:
            for item in set([i[0] for i in self.train[user]]):
                all_items.add(item)
            rank = self.rec_result[user]
            for item, _ in rank:
                rec_items.add(item)
        return round(len(rec_items) / len(all_items) * 100, 2)
        
    def popularity(self):
        """ 流行度，衡量商品热度的方式 """
        item_popularity_dict = {}
        for user in self.test:
            for item in [i[0] for i in self.train[user]]:
                if item not in item_popularity_dict:
                    item_popularity_dict[item] = 0
                item_popularity_dict[item] += 1
        
        _all, _p = 0, 0 # item数量 流行度
        for user in self.test:
            rank = self.rec_result[user]
            for item, _ in rank:
                _p += math.log(1+item_popularity_dict[item])
                _all += 1
        return round(_p / _all, 6)
        
    def eval(self):
        """ 评估测试集各项指标 """
        model_metric = {
            'Precision': self.precision(),
            'Recall': self.recall(),
            'Coverage': self.coverage(),
            'Popularity': self.popularity(),
        }
        print('Metric:', model_metric)
        return model_metric

class Experiment():
    def __init__(self, M, K, N, filepaths, algname):
        """
        :params: M 进行多少次实验
        :params: K TopK相似用户/物品
        :params: N 推荐物品个数TopN
        :params: filepath 数据路径
        :params: 算法名称
        """
        self.M = M
        self.K = K
        self.N = N
        self.filepaths = filepaths
        self.algname = algname
        self.alg = {
            "TagBased": tag_based_rec,
            "TagBasedTFIDF": tag_based_tfidf_rec,
            "TagBasedTFIDF++": tag_based_tfidf_plus_rec
        }
    
    @timmer
    def single_run(self, train, test):
        """
        :params: train 训练数据集
        :params: test 测试数据集
        :return: 各项指标
        """
        get_recommendation = self.alg[self.algname](train, self.K, self.N)
        metric = Metric(train, test, get_recommendation)
        return metric.eval()
    
    @timmer
    def run(self):
        dataset = Dataset(self.filepaths[0], self.filepaths[1])
        train, _test = dataset.split_data(self.M, k=1)
        # 只保留有过历史行为的用户进行测试
        test = {}
        for k, v in _test.items():
            if k in train.keys():
                test[k] = _test[k]
            else:
                print('del test user: ', k)
        metric = self.single_run(train, test)

In [88]:
# 将数据集划分为8折，根据K个相似项中推荐N个物品
M, N, K = 8, 10, 10
# rating_path item_path
file_paths = ['../data/ml-1m/ratings.dat', '../data/ml-1m/movies.dat']

# 模型评估
ALGS = ["TagBased", "TagBasedTFIDF", "TagBasedTFIDF++"]

for alg in ALGS:
    print(f'=============== {alg} START ===============')
    exp = Experiment(M, K, N, file_paths, alg)
    exp.run()
    print(f'=============== {alg} END ===============\n')



  0%|          | 0/6040 [00:00<?, ?it/s]

tag len: 18
Func tag_based_rec, run time: 1.2167139053344727


  0%|          | 0/5996 [00:00<?, ?it/s]

Metric: {'Precision': 15.0, 'Recall': 7.2, 'Coverage': 4.58, 'Popularity': 7.530587}
Func single_run, run time: 18.869747161865234
Func run, run time: 25.184858798980713



  0%|          | 0/6040 [00:00<?, ?it/s]

Func tag_based_tfidf_rec, run time: 1.6958022117614746


  0%|          | 0/5996 [00:00<?, ?it/s]

Metric: {'Precision': 15.01, 'Recall': 7.2, 'Coverage': 4.58, 'Popularity': 7.53034}
Func single_run, run time: 81.86057543754578
Func run, run time: 88.30936527252197



  0%|          | 0/6040 [00:00<?, ?it/s]

Func tag_based_tfidf_plus_rec, run time: 2.4119913578033447


  0%|          | 0/5996 [00:00<?, ?it/s]

Metric: {'Precision': 15.01, 'Recall': 7.2, 'Coverage': 4.58, 'Popularity': 7.53034}
Func single_run, run time: 133.32245087623596
Func run, run time: 139.84999632835388



##### 评估指标

| Mode  | 场景 |  优点 |  缺点 | 精确率 | 召回率 | 覆盖率 | 流行度 |
| :-------------------: |:-------------------: |:-------------------: |:-------------------: |:-------------------: |:-------------------: |:-------------------: |:-------------------: |
| **TagBased** | feed流推荐场景 | 可以建立tag倒排索引，加速线上召回速度。 | 依赖于tag效果。 |  15.0 | 7.2 | 4.58 | 7.531 |
| **TagBasedTFIDF** | - | - | - | 15.01 | 7.2 | 4.58 | 7.530 |
| **TagBasedTFIDF++** | - | - | - |  15.01 | 7.2 | 4.58 | 7.530 |

因为受Movielens数据集标签数量影响(仅18个标签)，TagBased相比其他几种协同过滤算法的CP要差一点，但相比基于热度的推荐CP表现更好。