In [1]:
import random
import math
import operator
import pandas as pd

In [2]:
# Path of the tagging log; load_data() parses it as tab-separated text.
file_path = "./user_taggedbookmarks-timestamps.dat"

# Raw tagging records: {userid: {item1: [tag1, tag2], ...}}
records = dict()

# Training and test partitions, filled by train_test_split()
# using the same nested layout as `records`.
train_data = {}
test_data = {}

# Count tables filled by initStat() from the training partition.
user_tags = {}   # user -> {tag: count}
tag_items = {}   # tag  -> {item: count}
user_items = {}  # user -> {item: count}
tag_users = {}   # tag  -> {user: count}

In [3]:
# Data loading
def load_data():
    """Read the tagging log at ``file_path`` into the global ``records`` dict.

    The file is parsed as tab-separated text and must provide the columns
    ``userID``, ``bookmarkID`` and ``tagID``. Afterwards ``records`` maps
    ``userID -> {bookmarkID: [tagID, ...]}``, with the tags of each
    (user, bookmark) pair kept in file order.

    ``records`` is emptied (in place) before it is refilled, so re-running
    the cell does not append every tag a second time.

    Raises:
        KeyError: if one of the three required columns is missing.
    """
    df = pd.read_csv(file_path, sep='\t')
    # Clear only after a successful read so a failed load keeps the old data.
    records.clear()
    # Walk the three columns in lockstep instead of doing a chained
    # df[col][i] lookup per row, which is far slower on large logs.
    for uid, iid, tag in zip(df['userID'], df['bookmarkID'], df['tagID']):
        # setdefault creates the missing nesting level on first sight.
        records.setdefault(uid, {}).setdefault(iid, []).append(tag)

In [4]:
# Split the records into a training set and a test set
def train_test_split(ratio, seed=100):
    """Partition ``records`` into the global ``train_data`` / ``test_data``.

    Each (user, item) pair of ``records`` is assigned as a whole: it goes to
    ``test_data`` when ``random.random() < ratio`` and to ``train_data``
    otherwise, so ``ratio`` is the approximate share of pairs held out.

    Both target dicts are emptied first. Without that, calling the function
    again (e.g. with another ratio or seed) would leave pairs from the
    previous split in place, leaking items between the two sets and
    duplicating tags.

    Args:
        ratio: probability that a (user, item) pair lands in the test set.
        seed: value passed to ``random.seed`` so the split is reproducible.
    """
    random.seed(seed)
    train_data.clear()
    test_data.clear()
    for u, items in records.items():
        for i, tags in items.items():
            # Exactly one random draw per (user, item) pair, in dict order.
            target = test_data if random.random() < ratio else train_data
            # Store a copy so later edits to `records` do not alias the split.
            target.setdefault(u, {})[i] = list(tags)

In [5]:
# Accumulate into a nested dict: mat[index][item] += value
def addValueToMat(mat, index, item, value=1):
    """Add ``value`` to ``mat[index][item]``, creating missing entries.

    ``mat`` is a dict of dicts used as a sparse matrix. A missing row is
    created empty, and a missing cell is initialised to ``value`` itself.
    The dict is modified in place; nothing is returned.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value

In [6]:
def initStat():
    """Rebuild the four global count tables from ``train_data``.

    For every (user, item, tag) occurrence in the training set, one is added
    to each of:

    * ``user_tags[user][tag]``  - how often the user applied the tag
    * ``tag_items[tag][item]``  - how often the tag was applied to the item
    * ``user_items[user][item]`` - how many tags the user put on the item
    * ``tag_users[tag][user]``  - how often the tag was used by the user

    The tables are emptied first so that re-running the cell (for instance
    after a new train/test split) does not double every count.
    """
    for table in (user_tags, tag_items, user_items, tag_users):
        table.clear()

    for u, items in train_data.items():
        for i, tags in items.items():
            for tag in tags:
                # user <-> tag relation
                addValueToMat(user_tags, u, tag, 1)
                # tag <-> item relation
                addValueToMat(tag_items, tag, i, 1)
                # user <-> item relation
                addValueToMat(user_items, u, i, 1)
                # tag <-> user relation
                addValueToMat(tag_users, tag, u, 1)


In [7]:
def recommend(user, N, method=None):
    """Return the top-``N`` items for ``user`` as ``[(item, score), ...]``.

    Scores are built from the global count tables ``user_tags``,
    ``tag_items``, ``tag_users`` and ``user_items``. Items the user already
    has in ``user_items`` are never recommended. The result is sorted by
    descending score.

    Args:
        user: user id to recommend for. A user absent from the count tables
            gets an empty list.
        N: maximum number of (item, score) pairs to return.
        method: scoring formula. When ``None`` (the default) the
            module-level variable ``choice`` is used, as before.

            * ``'NormTagBased'``: sum over the user's tags of
              ``(wut / sum_t wut) * (wti / sum_i wti)``.
            * ``'TagBased-TDIDF'`` (``'TagBased-TFIDF'`` is accepted as an
              alias): sum over the user's tags of
              ``wut / log(1 + number of distinct users of the tag) * wti``.

    Raises:
        ValueError: if the method name is not one of the above. Previously
            this case returned ``None`` and failed later in the caller.
    """
    if method is None:
        method = choice  # notebook-level setting chosen in the driver cells

    tagged_items = user_items.get(user, {})
    tags_of_user = user_tags.get(user, {})
    scores = {}

    if method == 'NormTagBased':
        # Total number of tag uses by this user.
        user_tags_u = sum(tags_of_user.values())
        for tag, wut in tags_of_user.items():
            # Total number of times this tag was applied to any item.
            tag_items_t = sum(tag_items[tag].values())
            user_weight = wut / user_tags_u
            for item, wti in tag_items[tag].items():
                if item in tagged_items:
                    continue
                scores[item] = scores.get(item, 0.0) + user_weight * (wti / tag_items_t)
    elif method in ('TagBased-TDIDF', 'TagBased-TFIDF'):
        for tag, wut in tags_of_user.items():
            # Number of distinct users who used this tag; tags used by many
            # users are down-weighted.
            wtu_sum = len(tag_users[tag])
            tag_weight = wut / math.log(1 + wtu_sum)
            for item, wti in tag_items[tag].items():
                if item in tagged_items:
                    continue
                scores[item] = scores.get(item, 0.0) + tag_weight * wti
    else:
        raise ValueError("unknown recommendation method: %r" % (method,))

    return sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[0:N]

In [8]:
# Compute precision and recall on the test set
def precisionAndRecall(N):
    """Evaluate top-``N`` recommendations against ``test_data``.

    Only users that also appear in ``train_data`` are evaluated. For each of
    them, ``recommend(user, N)`` is called and a hit is counted for every
    recommended item that the user has in the test set.

    Returns:
        ``(precision, recall)`` as floats, where
        precision = hits / (N * evaluated users) - N is charged per user
        even if fewer than N items were returned - and
        recall = hits / total number of test items of the evaluated users.
        A denominator of zero (no evaluable user, or ``N == 0``) yields
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    hit = 0
    h_recall = 0
    h_precision = 0
    for user, items in test_data.items():
        if user not in train_data:
            continue
        # Top-N recommendation list for this user
        rank = recommend(user, N)
        hit += sum(1 for item, _score in rank if item in items)
        h_recall += len(items)
        h_precision += N
    precision = hit / h_precision if h_precision else 0.0
    recall = hit / h_recall if h_recall else 0.0
    return precision, recall

In [9]:
# 使用测试集，对推荐结果进行评估
def testRecommend():
    print("推荐结果评估")
    print("%3s %10s %10s" % ('N',"精确率",'召回率'))
    for n in [5,10,20,40,60,80,100]:
        precision,recall = precisionAndRecall(n)
        print("%3d %10.3f%% %10.3f%%" % (n, precision * 100, recall * 100))

In [10]:
# Build the experiment state in order: parse the log into `records`,
# hold out roughly 20% of the (user, item) pairs as the test set,
# then derive the count tables from the training part.
load_data()
train_test_split(ratio=0.2)
initStat()

In [11]:
# recommend() reads the module-level `choice` to pick its scoring formula,
# so it must be set before the evaluation runs. This run uses the variant
# that normalises both the user-tag and the tag-item counts.
choice = 'NormTagBased'
testRecommend()

推荐结果评估
  N        精确率        召回率
  5      0.717%      0.307%
 10      0.526%      0.451%
 20      0.412%      0.705%
 40      0.293%      1.002%
 60      0.244%      1.256%
 80      0.225%      1.544%
100      0.214%      1.831%


In [12]:
# Switch recommend() to its second formula via the module-level `choice`:
# each user-tag count is divided by log(1 + number of distinct users of the
# tag) before being multiplied by the tag-item count. The spelling 'TDIDF'
# is the exact key recommend() compares against, so it is kept as is.
choice = 'TagBased-TDIDF'
testRecommend()

推荐结果评估
  N        精确率        召回率
  5      1.008%      0.431%
 10      0.761%      0.652%
 20      0.549%      0.940%
 40      0.402%      1.376%
 60      0.328%      1.687%
 80      0.297%      2.033%
100      0.269%      2.306%
