In [1]:
import numpy as np

In [2]:
def loadExData():
    """Return the small 7x5 example rating matrix as a fresh list of rows.

    The first three rows carry non-zero values only in the last two columns;
    the remaining four rows carry non-zero values only in the first three.
    """
    upperRows = [[0] * 3 + [score] * 2 for score in (2, 3, 1)]
    lowerRows = [[score] * 3 + [0] * 2 for score in (1, 2, 5, 1)]
    return upperRows + lowerRows

In [3]:
def loadExData2():
    """Return the larger 11x11 example rating matrix as a fresh list of rows."""
    ratings = [
        [0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5],
        [0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3],
        [0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0],
        [3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0],
        [5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0],
        [0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0],
        [4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1],
        [0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4],
        [0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2],
        [0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0],
        [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0],
    ]
    return ratings

### 相似度计算

In [4]:
# Euclidean distance
def euclidSim(inA, inB):
    """Turn the Euclidean distance between inA and inB into a similarity.

    Returns 1 / (1 + distance): 1.0 for identical inputs, approaching 0 as
    the distance grows.
    """
    distance = np.linalg.norm(inA - inB)
    return 1.0 / (1.0 + distance)

In [5]:
# Pearson correlation
def pearsSim(inA, inB):
    """Pearson correlation of inA and inB, rescaled from [-1, 1] to [0, 1].

    When len(inA) is below 3 the correlation is not computed and the inputs
    are treated as perfectly similar (1.0).
    """
    if len(inA) >= 3:
        # rowvar = 0: each column is a variable, each row an observation.
        corrMatrix = np.corrcoef(inA, inB, rowvar = 0)
        return 0.5 + 0.5 * corrMatrix[0][1]
    return 1.0

In [6]:
# Cosine similarity
def cosSim(inA, inB):
    """Cosine similarity of two vectors, rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    inA, inB : array-like
        Vectors with the same number of elements. Column matrices, row
        matrices and 1-D arrays are all accepted; they are flattened first.

    Returns
    -------
    float
        0.5 + 0.5 * cos(angle between inA and inB).
    """
    # Flatten to 1-D so the inner product is a true scalar. Calling float()
    # on a 1x1 matrix relies on NumPy's deprecated ndim>0 array-to-scalar
    # conversion, and `inA.T * inB` only works for column np.matrix inputs.
    vecA = np.asarray(inA, dtype=float).ravel()
    vecB = np.asarray(inB, dtype=float).ravel()
    num = float(np.dot(vecA, vecB))
    denom = np.linalg.norm(vecA) * np.linalg.norm(vecB)
    return 0.5 + 0.5 * (num / denom)

## 餐馆菜肴推荐系统
1. 寻找用户没有评级的菜肴，即用户-物品矩阵中的0值  
2. 在用户没有评级的物品中，对每个物品预计一个可能的评级分数  
3. 对这些物品的评分从高到低排序，返回前N个物品

In [7]:
# Estimate a rating
def standEst(dataMat, user, simMeas, item):
    """Estimate `user`'s rating of `item` from the user's existing ratings.

    Every item the user has rated (non-zero entry in the user's row) is
    compared with `item` over the rows where both columns are rated (> 0).
    The estimate is the similarity-weighted average of the user's ratings.

    dataMat must be an np.matrix (the `.A` attribute is used); rows are
    users and columns are items. simMeas(colA, colB) returns the similarity
    of two column vectors. Returns 0 when the similarities sum to zero.
    """
    numItems = np.shape(dataMat)[1]
    weightSum = 0.0
    weightedRatings = 0.0
    for other in range(numItems):
        rating = dataMat[user, other]
        if rating != 0:    # only items the user has actually rated contribute
            bothRated = np.logical_and(dataMat[:, item].A > 0, dataMat[:, other].A > 0)
            sharedRows = np.nonzero(bothRated)[0]    # rows that rated both items
            if len(sharedRows) > 0:
                sim = simMeas(dataMat[sharedRows, item], dataMat[sharedRows, other])
            else:
                sim = 0    # no common raters: treat as dissimilar
            weightSum += sim
            weightedRatings += sim * rating
    # Normalise by the total similarity so the estimate stays on the rating scale.
    return weightedRatings / weightSum if weightSum != 0 else 0

In [8]:
# Recommendation engine
def recommend(dataMat, user, N = 3, simMeas = cosSim, estMethod = standEst):
    """Return up to N (item, estimatedScore) pairs for items `user` has not rated.

    Unrated items are the zero entries of the user's row in dataMat (an
    np.matrix). Each one is scored with estMethod(dataMat, user, simMeas, item)
    and the pairs are returned sorted by score, highest first. If the user has
    no unrated items, the string 'you rated everything' is returned instead.
    """
    candidates = np.nonzero(dataMat[user, :].A == 0)[1]    # column indices of unrated items
    if len(candidates) == 0:
        return 'you rated everything'
    scored = [(item, estMethod(dataMat, user, simMeas, item)) for item in candidates]
    scored.sort(key = lambda pair: pair[1], reverse = True)
    return scored[:N]

In [9]:
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
myMat = np.asmatrix(loadExData())
# Overwrite a few entries of the example data with extra ratings.
myMat[0, 1] = myMat[0, 0] = myMat[1, 0] = myMat[2, 0] = 4
myMat[3, 3] = 2
myMat

matrix([[4, 4, 0, 2, 2],
        [4, 0, 0, 3, 3],
        [4, 0, 0, 1, 1],
        [1, 1, 1, 2, 0],
        [2, 2, 2, 0, 0],
        [5, 5, 5, 0, 0],
        [1, 1, 1, 0, 0]])

In [10]:
# Recommendations for user 2 with the defaults: cosSim similarity, standEst estimator.
recommend(myMat, 2)

[(2, 2.5), (1, 2.0243290220056256)]

In [11]:
# Same request, using the Euclidean-distance similarity instead.
recommend(myMat, 2, simMeas = euclidSim)

[(2, 3.0), (1, 2.8266504712098603)]

In [12]:
# Same request, using the Pearson-correlation similarity instead.
recommend(myMat, 2, simMeas = pearsSim)

[(2, 2.5), (1, 2.0)]

In [13]:
# Singular value decomposition of the larger example matrix.
# np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
U, Sigma, VT = np.linalg.svd(np.asmatrix(loadExData2()))
Sigma

array([15.77075346, 11.40670395, 11.03044558,  4.84639758,  3.09292055,
        2.58097379,  1.00413543,  0.72817072,  0.43800353,  0.22082113,
        0.07367823])

In [14]:
# Squared singular values; their sum is the total "energy" of the matrix.
Sig2= Sigma**2
sum(Sig2)

541.9999999999994

In [15]:
# 90% of the total energy.
sum(Sig2) * 0.9

487.7999999999995

In [16]:
# Energy captured by the first two singular values.
sum(Sig2[:2])

378.8295595113579

In [17]:
# Energy captured by the first three singular values.
sum(Sig2[:3])

500.5002891275793

### 基于SVD的评分估计

In [18]:
def svdEst(dataMat, user, simMeas, item, numSV = 4):
    """Estimate `user`'s rating of `item` using similarities in a reduced SVD space.

    The columns of dataMat (items) are projected onto the leading singular
    directions, and `item` is compared there with every other item the user
    has rated. The estimate is the similarity-weighted average of those ratings.

    Parameters
    ----------
    dataMat : np.matrix or 2-D array-like
        Rating matrix; rows are users, columns are items, 0 means "not rated".
    user : int
        Row index of the user.
    simMeas : callable
        simMeas(colA, colB) -> similarity of two column vectors.
    item : int
        Column index of the item to estimate.
    numSV : int, optional
        Number of singular values to keep (default 4, the value that used to
        be hard-coded). Clamped to the number available, so matrices with
        fewer than four rows or columns no longer fail.

    Returns
    -------
    The weighted-average estimate, or 0 when the similarities sum to zero.

    Raises
    ------
    ValueError
        If numSV is less than 1.

    Notes
    -----
    Prints one line per similarity computed. The SVD is recomputed on every
    call.
    """
    if numSV < 1:
        raise ValueError('numSV must be at least 1')
    # np.asmatrix replaces the np.mat alias (removed in NumPy 2.0) and lets
    # plain ndarrays be passed in; `*` below is then always a matrix product.
    dataMat = np.asmatrix(dataMat)
    n = np.shape(dataMat)[1]    # number of items
    simTotal = 0.0
    ratSimTotal = 0.0
    U, Sigma, _ = np.linalg.svd(dataMat)
    k = min(numSV, len(Sigma))    # never ask for more singular values than exist
    SigK = np.asmatrix(np.diag(Sigma[:k]))    # k x k diagonal matrix of singular values
    xformedItems = dataMat.T * U[:, :k] * SigK.I    # one row per item in the reduced space
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        print('the %d and %d similarity is: %f' % (item, j, similarity))
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    else:
        return ratSimTotal / simTotal    # normalise back onto the rating scale

In [19]:
# Recommendations for user 1 using the SVD-based estimator with the default cosSim similarity.
recommend(myMat, 1, estMethod = svdEst)

the 1 and 0 similarity is: 0.498142
the 1 and 3 similarity is: 0.498131
the 1 and 4 similarity is: 0.509974
the 2 and 0 similarity is: 0.552670
the 2 and 3 similarity is: 0.552976
the 2 and 4 similarity is: 0.217301


[(2, 3.4177569186592387), (1, 3.3307171545585645)]

In [20]:
# Same request, using the Pearson-correlation similarity inside the SVD-based estimator.
recommend(myMat, 1, estMethod = svdEst, simMeas = pearsSim)

the 1 and 0 similarity is: 0.280908
the 1 and 3 similarity is: 0.948940
the 1 and 4 similarity is: 0.606039
the 2 and 0 similarity is: 0.556905
the 2 and 3 similarity is: 0.552447
the 2 and 4 similarity is: 0.214974


[(2, 3.4205197987866813), (1, 3.153009368370128)]

## 基于SVD的图像压缩

In [21]:
def printMat(inMat, thresh = 0.8):
    """Print a matrix as rows of 0/1 characters, one text line per matrix row.

    An entry prints as 1 when it is strictly greater than `thresh`, else 0.
    The actual shape of `inMat` is used, so matrices of any size work (the
    loop was previously fixed at 32x32 and raised IndexError on anything
    smaller).

    Parameters
    ----------
    inMat : np.matrix or array-like
        Values to display; a 1-D input is printed as a single row.
    thresh : float, optional
        Cut-off separating 0 from 1 (default 0.8).
    """
    for row in np.atleast_2d(np.asarray(inMat)):
        print(''.join('1' if float(value) > thresh else '0' for value in row))

In [22]:
def imgCompress(numSV = 3, thresh = 0.8, fileName = '0_5.txt'):
    """Load a 0/1 text image, then print it and a low-rank SVD reconstruction.

    Parameters
    ----------
    numSV : int, optional
        Number of singular values used for the reconstruction (default 3).
        Clamped to the number of singular values available.
    thresh : float, optional
        Threshold passed to printMat when rendering both matrices.
    fileName : str, optional
        Path of the text file to read (default '0_5.txt', the path that used
        to be hard-coded). The first 32 characters of each non-blank line are
        parsed as single digits.

    Returns
    -------
    None. Both matrices are written to stdout.
    """
    myl = []
    # The context manager closes the file; the bare open().readlines() never did.
    with open(fileName) as imgFile:
        for line in imgFile:
            if not line.strip():
                continue    # ignore blank lines such as a trailing newline
            myl.append([int(line[i]) for i in range(32)])
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    myMat = np.asmatrix(myl)
    print('**** original matrix ****')
    printMat(myMat, thresh)
    U, Sigma, VT = np.linalg.svd(myMat)
    numSV = min(numSV, len(Sigma))    # cannot use more singular values than exist
    SigRecon = np.asmatrix(np.diag(Sigma[:numSV]))    # diagonal matrix of the kept singular values
    reconMat = U[:, :numSV] * SigRecon * VT[:numSV, :]
    print('**** reconstructed matrix using %d singular values ****' % numSV)
    printMat(reconMat, thresh)

In [23]:
# Reconstruct the image using only the 2 largest singular values.
imgCompress(2)

**** original matrix ****
00000000000000110000000000000000
00000000000011111100000000000000
00000000000111111110000000000000
00000000001111111111000000000000
00000000111111111111100000000000
00000001111111111111110000000000
00000000111111111111111000000000
00000000111111100001111100000000
00000001111111000001111100000000
00000011111100000000111100000000
00000011111100000000111110000000
00000011111100000000011110000000
00000011111100000000011110000000
00000001111110000000001111000000
00000011111110000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000011111100000000001111000000
00000001111100000000001111000000
00000001111100000000011111000000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000001111100000
00000000111110000000011111000000
00000000111110000000111111000000
00000000111111000001111110000000
00000000011111111111111110000000
00000000001111111111111110000000
00000000001111111111111110000000
00000000000111111