In [1]:
# -*- coding: utf-8 -*-
#使用SimpleTagBased算法进行TOP-N推荐

In [2]:
#常用方法有
# 1. simpleTag
# 2. norm Tag
# 3. tf-idf Tag

In [3]:
# Step1 Load Data
# Step2 Split train / test data
# Step3 Prepare for Dict
# Step4 Test Recommend

In [4]:
import pandas as pd
import random
import seaborn as sns

In [5]:
import operator
import math

In [6]:
# Notebook display limits: show at most 10 columns and 500 rows of a DataFrame.
# The full option names are spelled out: pandas resolves abbreviated names by
# pattern matching, which can become ambiguous if a similarly named option is
# ever added.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 500)

In [7]:
# Raw tagging log, filled by load_data(): records[user][item] -> list of tags.
records = dict()

# Train / test partitions of `records`, filled by train_test_split();
# both use the same user -> item -> list-of-tags layout.
train_data, test_data = {}, {}

# Count tables filled by initStat():
#   user_tags[user][tag]   user_items[user][item]
#   tag_items[tag][item]   tag_users[tag][user]
user_tags, user_items = {}, {}
tag_items, tag_users = {}, {}

In [8]:
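# Temporaries such as df and uid inside load_data() are locals and are discarded when the function returns.
# records is a module-level dict that the functions mutate in place (they never rebind it), so no `global` statement is needed.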

In [9]:
# Module-level read of the raw tab-separated tagging log.
# NOTE(review): load_data() reads the same file again into its own local frame.
df = pd.read_csv("./user_taggedbookmarks-timestamps.dat", sep='\t')

In [10]:
def load_data(path="./user_taggedbookmarks-timestamps.dat"):
    """Read the tagging log and rebuild the module-level ``records`` dict.

    The file is parsed as tab-separated text with pandas. Every row adds its
    ``tagID`` to ``records[userID][bookmarkID]``, in file order.

    Args:
        path: Location of the tab-separated file. Defaults to the path that
            was previously hard-coded, so ``load_data()`` keeps working.

    Raises:
        ValueError: If one of the columns ``userID``, ``bookmarkID`` or
            ``tagID`` is missing. Previously any failure was swallowed by a
            bare ``except`` and only "sth wrong" was printed.
    """
    df = pd.read_csv(path, sep='\t')

    required = ('userID', 'bookmarkID', 'tagID')
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise ValueError("missing column(s) %s in %s" % (missing, path))

    # Start from an empty dict so that re-running the cell does not append
    # every tag a second time.
    records.clear()

    # Iterate over plain Python lists: per-row df[col][i] lookups are very
    # slow on a file with hundreds of thousands of rows.
    columns = [df[name].tolist() for name in required]
    for uid, iid, tag in zip(*columns):
        records.setdefault(uid, {}).setdefault(iid, []).append(tag)

    print("数据集大小为 %d." % (len(df)))
    print("设置tag的人数 %d." % (len(records)))
    print("数据加载完成\n")

In [11]:
# Split `records` into the module-level train_data / test_data dicts.
def train_test_split(ratio, seed=100):
    """Randomly assign every (user, item) entry of ``records`` to train or test.

    Args:
        ratio: Probability that an entry is placed in ``test_data``; all other
            entries go to ``train_data``. Note that this is the *test*
            fraction, not the training fraction.
        seed: Seed passed to ``random.seed`` so the split is reproducible.

    The printed numbers are ``len(train_data)`` and ``len(test_data)``, i.e.
    the number of users that have at least one entry in each partition.
    """
    random.seed(seed)

    # Reset both partitions so that re-running the cell rebuilds the split
    # instead of appending every tag list to the existing one.
    train_data.clear()
    test_data.clear()

    for u, items in records.items():
        for i, tags in items.items():
            # One random draw per (user, item) pair decides the partition.
            target = test_data if random.random() < ratio else train_data
            # Store a copy so the partitions never alias lists in `records`.
            target.setdefault(u, {})[i] = list(tags)

    print("训练集样本数 %d, 测试集样本数 %d" % (len(train_data),len(test_data)))

In [12]:
# Nested-dict counter update: mat[index][item] += value (created on first use).
def addValueToMat(mat, index, item, value=1):
    """Accumulate ``value`` into ``mat[index][item]``.

    ``mat`` is a dict of dicts that is modified in place. A missing row or
    cell is created and initialised to ``value``; an existing cell is
    incremented by ``value``. Nothing is returned.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value


# Build user_tags, tag_items, user_items and tag_users from the training set.
def initStat():
    """Rebuild the four module-level count tables from ``train_data``.

    For every (user, item, tag) triple in ``train_data`` the following
    counters are incremented by one:

    * ``user_tags[user][tag]``
    * ``tag_items[tag][item]``
    * ``user_items[user][item]``
    * ``tag_users[tag][user]``

    The tables are emptied first, so calling this function again (for
    example after re-running the cell) does not double every count.
    """
    for table in (user_tags, tag_items, user_items, tag_users):
        table.clear()

    # Iterate train_data directly rather than through a local alias named
    # `records`, which would shadow the module-level dict of the same name.
    for u, items in train_data.items():
        for i, tags in items.items():
            for tag in tags:
                addValueToMat(user_tags, u, tag, 1)   # user -> tag
                addValueToMat(tag_items, tag, i, 1)   # tag  -> item
                addValueToMat(user_items, u, i, 1)    # user -> item
                addValueToMat(tag_users, tag, u, 1)   # tag  -> user

    print("user_tags, tag_items, user_items初始化完成.")
    print("user_tags大小 %d, tag_items大小 %d, user_items大小 %d" % (len(user_tags), len(tag_items), len(user_items)))


In [13]:
# Evaluate Top-N recommendations against the test set.
def precisionAndRecall(N, method):
    """Return ``(precision, recall)`` of Top-N recommendation on ``test_data``.

    Only users that also appear in ``train_data`` are evaluated. For each of
    them the chosen recommender is asked for N items; a hit is a recommended
    item that the user has in ``test_data``.

    * precision = hits / (N * number of evaluated users)
    * recall    = hits / total number of test items of the evaluated users

    Args:
        N: Length of the recommendation list requested per user.
        method: ``'simple'``, ``'normal'`` or ``'tag-tfidf'``.

    Returns:
        A ``(precision, recall)`` tuple of floats. A metric whose denominator
        is zero (no evaluable user, or ``N == 0``) is reported as ``0.0``
        instead of raising ``ZeroDivisionError``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
            Previously this case only printed a message and then failed on
            an unbound (or stale) ``rank`` variable.
    """
    # Resolve the recommender once, outside the per-user loop.
    if method == 'simple':
        recommender = recommend
    elif method == 'normal':
        recommender = recommendNormal
    elif method == 'tag-tfidf':
        recommender = recommendTag_tfidf
    else:
        raise ValueError(
            "unknown method %r; expected 'simple', 'normal' or 'tag-tfidf'" % (method,)
        )

    hit = 0
    h_recall = 0
    h_precision = 0
    for user, items in test_data.items():
        # Users without training data cannot be given recommendations.
        if user not in train_data:
            continue
        rank = recommender(user, N)
        hit += sum(1 for item, _score in rank if item in items)
        h_recall += len(items)
        h_precision += N

    precision = hit / float(h_precision) if h_precision else 0.0
    recall = hit / float(h_recall) if h_recall else 0.0
    return precision, recall


In [14]:
### SimpleTagBased
# Top-N recommendation for a single user.
def recommend(user, N):
    """Return up to N ``(item, score)`` pairs for ``user``, best first.

    An item's score is the sum, over every tag the user has used, of
    ``wut * wti`` where ``wut = user_tags[user][tag]`` and
    ``wti = tag_items[tag][item]``. Items already present in
    ``user_items[user]`` are never recommended.
    """
    already_tagged = user_items[user]
    scores = {}
    for tag, wut in user_tags[user].items():
        for item, wti in tag_items[tag].items():
            # Only suggest items the user has not tagged yet.
            if item not in already_tagged:
                scores[item] = scores.get(item, 0) + wut * wti
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]


In [15]:
### NormalTagBased
# Top-N recommendation for a single user, with normalised weights.
def recommendNormal(user, N):
    """Return up to N ``(item, score)`` pairs for ``user``, best first.

    Each (tag, item) contribution is
    ``wut / len(user_tags[user]) * wti / len(tag_items[tag])``: the user's
    tag count is divided by the number of distinct tags the user has, and
    the tag's item count by the number of distinct items carrying that tag.
    Items already present in ``user_items[user]`` are never recommended.
    """
    seen = user_items[user]
    tag_counts = user_tags[user]
    scores = {}
    for tag, wut in tag_counts.items():
        candidates = tag_items[tag]
        n_candidates = len(candidates)
        # Same evaluation order as (wut / a) * wti / b, so the floating-point
        # results are unchanged.
        user_share = wut / len(tag_counts)
        for item, wti in candidates.items():
            if item in seen:
                continue
            scores[item] = scores.get(item, 0) + user_share * wti / n_candidates
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]


In [16]:
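# tag_users could be built while parsing the CSV or afterwards; here it is built in initStat().

# Should tag_users be split into train/test? initStat() builds it from train_data only, so it already reflects just the training split.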

In [17]:
### TagBased TF-IDF
# Top-N recommendation for a single user, damping popular tags.
def recommendTag_tfidf(user, N):
    """Return up to N ``(item, score)`` pairs for ``user``, best first.

    Each (tag, item) contribution is
    ``wut / log(1 + len(tag_users[tag])) * wti``, so a tag used by many
    distinct users contributes less than a rarely used one. Items already
    present in ``user_items[user]`` are skipped: they are items the user has
    tagged before, and only new items are recommended.
    """
    seen = user_items[user]
    scores = {}
    for tag, wut in user_tags[user].items():
        for item, wti in tag_items[tag].items():
            if item in seen:
                continue
            # Number of distinct users of this tag, log-damped.
            popularity = math.log(1 + len(tag_users[tag]))
            scores[item] = scores.get(item, 0) + wut / popularity * wti
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:N]


In [18]:
# 使用测试集，对推荐结果进行评估
def testRecommend(method):
    if method in ['simple','normal','tag-tfidf']:
        print("推荐结果评估")
        print("%3s %10s %10s" % ('N',"精确率",'召回率'))
        for n in [5,10,20,40,60,80,100]:
            precision,recall = precisionAndRecall(n,method)
            print("%3d %10.3f%% %10.3f%%" % (n, precision * 100, recall * 100))
    else:
        print("Only simple/normal/tag-tfidf key words are supported")
    

In [19]:
# Step 1: read the tab-separated tagging log and fill the module-level `records` dict.
load_data()

数据集大小为 437593.
设置tag的人数 1867.
数据加载完成



In [20]:
# Step 2: split `records` into train_data / test_data.
# NOTE(review): the argument is the probability of an entry landing in the TEST
# set, so 0.7 holds out roughly 70% for testing — confirm this is intended.
train_test_split(0.7)

训练集样本数 1810, 测试集样本数 1856


In [21]:
# Step 3: build user_tags, tag_items, user_items and tag_users from train_data.
initStat()

user_tags, tag_items, user_items初始化完成.
user_tags大小 1810, tag_items大小 21982, user_items大小 1810


In [22]:
# Step 4: print precision/recall of the tag TF-IDF recommender for several N.
testRecommend(method='tag-tfidf')

推荐结果评估
  N        精确率        召回率
  5      1.868%      0.229%
 10      1.518%      0.373%
 20      1.137%      0.558%
 40      0.839%      0.824%
 60      0.690%      1.017%
 80      0.606%      1.190%
100      0.548%      1.346%
