# 餐馆菜肴推荐引擎

In [1]:
# coding=utf-8

#导入numpy库
from numpy import *
from numpy import linalg as la

#导入原始数据
def loadExData():
     return [[1,1,1,0,0],
             [2,2,2,0,0],
             [5,5,5,0,0],
             [1,1,0,2,2],
             [0,0,0,3,3],
             [0,0,0,1,1]]

In [2]:
#通过欧式距离来计算相似度
def eulidSim(inA, inB):
    return 1.0/(1.0+la.norm(inA - inB))

#通过皮尔逊相关系数来计算相似度
def pearsSim(inA, inB):
    if len(inA)<3:
        return 1.0
    else:
        return 0.5+0.5*corrcoef(inA, inB, rowvar = 0)[0][1]

#余弦相似度
def cosSim(inA, inB):
    num   = float(inA.T*inB)            #向量inA和向量inB点乘,得cos分子
    denom = la.norm(inA)*la.norm(inB)   #向量inA,inB各2范数相乘，得cos分母
    return 0.5+0.5*(num/denom)         

In [3]:
#计算在给定相似度计算方法下用户对物品的估计评分值
def standEst(dataMat, user, simMeas, item):     #数据矩阵，用户编号，相似度计算方法，物品编号
    n = shape(dataMat)[1]                       #行对应用户，列对应物品，n即物品数目
    simTotal    = 0.0
    ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user, j]           #j为某个物品编号
        if userRating == 0:
            continue
        else:
            #寻找两个用户都评级的物品
            overLap = nonzero(logical_and(dataMat[:, item].A>0, dataMat[:,j].A>0))[0]#nonzero(a)返回数组a中值不为零的元素的下标为向量
            if len(overLap)== 0:
                similarity = 0
            else:
                similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
            simTotal += similarity
            ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal/simTotal

In [4]:
#推荐引擎，会调用standEst()函数，产生用户对不同物品的预测评分
def recommend(dataMat, user, N=4, simMeas=cosSim, estMethod=standEst):#输入用户编号，返回用户对前N个未评级物品预测评分值
    unratedItems = nonzero(dataMat[user, :].A==0)[1]                 #寻找未评级的物品，nonzero()[1]返回参数的某些为0的列的编号，dataMat中用户user对某个商品评价为0的列
    if len(unratedItems) == 0:
        return "you rated everything"
    else:
        itemScores = []
        for item in unratedItems:
            estimatedScore = estMethod(dataMat, user, simMeas, item)#对未评价的物品item进行进行预测评分，传入函数standEst
            itemScores.append((item,estimatedScore))
        return sorted(itemScores, key=lambda e:e[1], reverse = True)[:N]#前N个未评级物品
    
 
myMat = mat(loadExData())
myMat[0,3] = myMat[0,4] = myMat[1,4] = myMat[2,3] = 4                  #对矩阵稍作修改来展示推荐系统
myMat[4,1] = 2
print (myMat)
print (recommend(myMat, 4))
print (recommend(myMat, 4,simMeas = eulidSim))
print (recommend(myMat, 4,simMeas = pearsSim))

[[1 1 1 4 4]
 [2 2 2 0 4]
 [5 5 5 4 0]
 [1 1 0 2 2]
 [0 2 0 3 3]
 [0 0 0 1 1]]
[(0, 2.65436970132384), (2, 2.654023149824193)]
[(2, 2.3138382645201254), (0, 2.3067875486559717)]
[(2, 2.6666666666666665), (0, 2.6)]


In [5]:
#利用SVD进行提高推荐效果
def loadExData2():
        return [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
                [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
                [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0], 
                [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
                [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
                [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
                [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
                [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
                [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
                [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
                [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]]
 
U,Sigma,VT = la.svd(mat(loadExData2()))
print (Sigma)
Sig2 =  Sigma**2
print (Sig2)
print (sum(Sig2)*0.9)
print (sum(Sig2[:2]))
print (sum(Sig2[:3]))

[15.77075346 11.40670395 11.03044558  4.84639758  3.09292055  2.58097379
  1.00413543  0.72817072  0.43800353  0.22082113  0.07367823]
[2.48716665e+02 1.30112895e+02 1.21670730e+02 2.34875695e+01
 9.56615756e+00 6.66142570e+00 1.00828796e+00 5.30232598e-01
 1.91847092e-01 4.87619735e-02 5.42848136e-03]
487.7999999999998
378.82955951135796
500.50028912757944


In [9]:
#基于SVD的评分估计
def SVDEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    U, Sigma, VT = la.svd(dataMat)
    Sig4 = mat(eye(4)*Sigma[:4]) #化为对角阵，或者用linalg.diag()函数可破
    xformedItems = dataMat.T*U[:,:4]*Sig4.I#构造转换后的物品
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0 or j == item:
            continue
        else:
            similarity = simMeas(xformedItems[item,:].T, xformedItems[j, :].T)
            print ("the %d and %d similarity is: %f" %(item,j,similarity))
            simTotal += similarity
            ratSimTotal += similarity*userRating
            if simTotal ==0 :
                return 0
            else:
                return ratSimTotal/simTotal
 
myMat = mat(loadExData2())
print (recommend(myMat, 1, estMethod = SVDEst))
print (recommend(myMat, 1, estMethod = SVDEst, simMeas = pearsSim))

the 0 and 3 similarity is: 0.490950
the 1 and 3 similarity is: 0.491294
the 2 and 3 similarity is: 0.491573
the 4 and 3 similarity is: 0.450495
the 6 and 3 similarity is: 0.743699
the 7 and 3 similarity is: 0.482175
the 8 and 3 similarity is: 0.491307
the 9 and 3 similarity is: 0.522379
[(1, 3.0000000000000004), (4, 3.0000000000000004), (0, 3.0), (2, 3.0)]
the 0 and 3 similarity is: 0.341942
the 1 and 3 similarity is: 0.345560
the 2 and 3 similarity is: 0.345149
the 4 and 3 similarity is: 0.450126
the 6 and 3 similarity is: 0.923822
the 7 and 3 similarity is: 0.319482
the 8 and 3 similarity is: 0.334910
the 9 and 3 similarity is: 0.566918
[(2, 3.0000000000000004), (0, 3.0), (4, 3.0), (7, 3.0)]


In [7]:
# 基于物品相似度的推荐引擎
def standEst(dataMat, user, simMeas, item):
    """standEst(计算某用户未评分物品中，以对该物品和其他物品评分的用户的物品相似度，然后进行综合评分)

    Args:
        dataMat         训练数据集
        user            用户编号
        simMeas         相似度计算方法
        item            未评分的物品编号
    Returns:
        ratSimTotal/simTotal     评分（0～5之间的值）
    """
    # 得到数据集中的物品数目
    n = shape(dataMat)[1]
    # 初始化两个评分值
    simTotal = 0.0
    ratSimTotal = 0.0
    # 遍历行中的每个物品（对用户评过分的物品进行遍历，并将它与其他物品进行比较）
    for j in range(n):
        userRating = dataMat[user, j]
        # 如果某个物品的评分值为0，则跳过这个物品
        if userRating == 0:
            continue
        # 寻找两个用户都评级的物品
        # 变量 overLap 给出的是两个物品当中已经被评分的那个元素的索引ID
        # logical_and 计算x1和x2元素的真值。
        overLap = nonzero(logical_and(dataMat[:, item].A > 0, dataMat[:, j].A > 0))[0]
        # 如果相似度为0，则两着没有任何重合元素，终止本次循环
        if len(overLap) == 0:
            similarity = 0
        # 如果存在重合的物品，则基于这些重合物重新计算相似度。
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        # print 'the %d and %d similarity is : %f'(iten,j,similarity)
        # 相似度会不断累加，每次计算时还考虑相似度和当前用户评分的乘积
        # similarity  用户相似度，   userRating 用户评分
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    # 通过除以所有的评分总和，对上述相似度评分的乘积进行归一化，使得最后评分在0~5之间，这些评分用来对预测值进行排序
    else:
        return ratSimTotal/simTotal