# CH14 SVD

## Dependencies

In [None]:
import pandas as pd
import numpy as np
from numpy import linalg as la


SVD (Singular Value Decomposition)奇异值分解
- 隐性语义索引（Latent Semantic Indexing, LSI）或者 隐性语义分析(Latent Semantic Analysis, LSA)
- 推荐系统


- PCA -> 特征值
- SVD -> 奇异值


## MLiA

### SVD Applications

#### LSI


在模式识别和信息检索中，SVD在降维方面有广泛应用，是隐性语义索引（LSI）的基础。

#### RS
特征生成，将数据映射成重要特征。低维的数据包含了整个数据的大部分能量。

### Matrix Decomposition

In [None]:
# Decompose a small rank-1 matrix (second row is 7x the first).
# np.linalg.svd returns U, the singular values as a 1-D array, and V transposed.
U, sigma, VT = np.linalg.svd([[1, 1],[7, 7]])

In [None]:
U, sigma, VT

### CF

collaborative filtering

#### **Similarity**
- 欧式距离
    - [0, 1]
- Jaccard相似度
- 皮尔逊相关系数
     - 量级不敏感
     - [-1, 1]
- 余弦相似度
    - [-1, 1]

In [None]:
def ecludSim(inA, inB):
    """Euclidean-distance similarity between two rating vectors.

    The distance ``d = ||inA - inB||`` is mapped to ``1 / (1 + d)``, so
    identical vectors score 1 and the score shrinks toward 0 as the
    vectors move apart.

    Args:
        inA: first vector (anything supporting ``-`` and ``la.norm``).
        inB: second vector, same shape as ``inA``.

    Returns:
        Similarity in the interval (0, 1].
    """
    distance = la.norm(inA - inB)
    return 1.0 / (1.0 + distance)

def pearsSim(inA, inB):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    Args:
        inA: first rating vector (1-D array or column matrix).
        inB: second rating vector, same length as ``inA``.

    Returns:
        ``0.5 + 0.5 * r`` where ``r`` is the Pearson correlation coefficient.
        With fewer than 3 observations the vectors are treated as perfectly
        correlated and 1.0 is returned. When the correlation is undefined
        (either vector is constant, so its variance is zero) the neutral
        value 0.5 is returned instead of NaN.
    """
    if len(inA) < 3:
        return 1.0
    # A zero-variance input makes corrcoef divide 0 by 0; silence the warning
    # here and handle the resulting NaN explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        corr = np.corrcoef(inA, inB, rowvar=0)[0][1]
    if np.isnan(corr):
        # Undefined correlation: report "no evidence either way" rather than
        # letting NaN poison the weighted rating average in the estimators.
        return 0.5
    return 0.5 + 0.5 * corr

def cosSim(inA, inB):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Args:
        inA: first vector; a column/row ``np.matrix``, a 1-D array, or any
            array-like that flattens to a vector.
        inB: second vector with the same number of elements as ``inA``.

    Returns:
        ``0.5 + 0.5 * cos(theta)``. If either vector has zero length the
        cosine is undefined and the neutral value 0.5 is returned.
    """
    # Flatten to plain 1-D arrays so the dot product does not depend on
    # np.matrix semantics (where ``*`` means matrix multiplication).
    vecA = np.asarray(inA, dtype=float).ravel()
    vecB = np.asarray(inB, dtype=float).ravel()
    denom = la.norm(vecA) * la.norm(vecB)
    if denom == 0:
        # Avoid 0/0 -> NaN for an all-zero vector.
        return 0.5
    num = float(np.dot(vecA, vecB))
    return 0.5 + 0.5 * (num / denom)

In [None]:
X = np.mat([[1, 1, 1, 0, 0],
            [2, 2, 2, 0, 0],
            [1, 1, 1, 0, 0],
            [5, 5, 5, 0, 0],
            [1, 1, 0, 2, 2],
            [0, 0, 0, 3, 3],
            [0, 0, 0, 1, 1]])


In [None]:
X[:, 0]

In [None]:
ecludSim(X[:, 0], X[:, 4]), ecludSim(X[:, 0], X[:, 0])

In [None]:
pearsSim(X[:, 0], X[:, 4]), pearsSim(X[:, 0], X[:, 0])

In [None]:
cosSim(X[:, 0], X[:, 4]), cosSim(X[:, 0], X[:, 0])

Item-based recommendation

#### Item or User

- user-based CF
- item-based CF

如果用户数量很多，往往通过Item Similarity来实现推荐

#### Cost
- 打分 : RMSE
- TopN : Precision 和 Recall
- 覆盖率 : 熵
- 多样性 : 

### Ex : 餐饮推荐

#### Scoring for unscored

##### copy

In [None]:
X.shape

In [None]:
# Plain assignment does not copy: mX and X are now two names for the same matrix object.
mX = X

In [None]:
mX[0,1] = 4

In [None]:
X, mX

In [None]:
# .copy() allocates an independent matrix, so later writes to mX leave X untouched.
mX = X.copy()

In [None]:
mX[0, 1] = 5
mX, X

In [None]:
mX[0, 1] = mX[0, 0]= mX[1, 0]=mX[2, 0]=4
mX[3, 3] = 2

##### modified X

In [None]:
mX

In [None]:
mX[:,1],mX[:,1].A

In [None]:
mX[0,1]

##### Overlapping users
"Overlap" means the users who have rated both column `item` and column `j`.

In [None]:
# Pick two item columns and find the row indices (users) whose rating is > 0 in both.
item = 1
j = 2
# .A converts each column matrix to a plain ndarray; [0] keeps the row indices from np.nonzero.
overLap = np.nonzero(np.logical_and(mX[:, item].A>0,
                                    mX[:, j].A>0))[0]
# Display both columns next to the overlapping row indices.
mX[:, item].A,mX[:,j].A, overLap

In [None]:
def standEst(dataMat, user, simMeas, item):
    """Estimate ``user``'s rating for ``item`` with item-based collaborative filtering.

    For every other item ``j`` the user has rated, the similarity between
    column ``item`` and column ``j`` is computed from the users who rated both,
    and the user's rating of ``j`` is weighted by that similarity.

    Args:
        dataMat: rating matrix (``np.matrix``; rows are users, columns are
            items, 0 means "not rated").
        user: row index of the user.
        simMeas: callable ``simMeas(colA, colB)`` returning a similarity.
        item: column index of the item to score.

    Returns:
        The similarity-weighted average rating, or 0 when the total
        similarity is 0.
    """
    simTotal = 0
    ratSimTotal = 0
    for j in range(dataMat.shape[1]):
        userRating = dataMat[user, j]
        if userRating == 0:
            # The user never rated item j, so it carries no information.
            continue
        # Keep only the users who have a rating in both column `item` and column `j`.
        ratedBoth = np.logical_and(dataMat[:, item].A > 0,
                                   dataMat[:, j].A > 0)
        overLap = np.nonzero(ratedBoth)[0]
        if len(overLap) == 0:
            # No common raters: similarity is 0 and contributes nothing.
            continue
        # Similarity of the two item columns, restricted to the common raters.
        similarity = simMeas(dataMat[overLap, item],
                             dataMat[overLap, j])
        print("the %d and %d similarity is : %f" % (item, j, similarity))
        # Accumulate the weight and the similarity-weighted rating of item j.
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

##### nonzero

In [None]:
mX

In [None]:
user =  1
mX[user,:].A == 0

In [None]:
mX[user,:]

In [None]:
np.nonzero(mX[user,:].A == 0)

In [None]:
user =  4
mX[user,:].A == 0

In [None]:
np.nonzero(mX[user,:].A == 0)

##### recommend

In [None]:
def recommend(dataMat, user, N=3, simMeas=cosSim, estMethod=standEst):
    """Return the top-``N`` recommendations for ``user``.

    Args:
        dataMat: rating matrix (``np.matrix``; rows are users, columns are
            items, 0 means "not rated").
        user: row index of the user to recommend for.
        N: maximum number of recommendations to return.
        simMeas: similarity function handed to ``estMethod``.
        estMethod: callable ``estMethod(dataMat, user, simMeas, item)`` that
            returns the estimated rating for one item.

    Returns:
        A list of up to ``N`` ``(item, estimated_score)`` tuples sorted by
        score, highest first, or the string ``"you rated everything"`` when
        the user has no unrated items.
    """
    # dataMat[user, :] is a 1 x n matrix, so the column indices are element [1].
    unratedMask = dataMat[user, :].A == 0
    unratedItems = np.nonzero(unratedMask)[1]
    if len(unratedItems) == 0:
        return "you rated everything"
    itemScores = []
    for item in unratedItems:
        itemScores.append((item, estMethod(dataMat, user, simMeas, item)))
    itemScores.sort(key=lambda pair: pair[1], reverse=True)
    return itemScores[:N]
        

In [None]:
recommend(mX, 2)

#### SVD

In [None]:
df = pd.read_csv("./Data/CH14/data_14-4.csv",index_col=0)
df

##### SVD
转化成低维矩阵计算相似度

In [None]:
# Wrap the DataFrame's values as a matrix so `*` means matrix multiplication below.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
X = np.asmatrix(df.values)
X

In [None]:
U, Sigma, VT = la.svd(X)
Sigma

In [None]:
sum(Sigma),sum(Sigma)*.9

In [None]:
sum(Sigma[:2]),sum(Sigma[:3]),sum(Sigma[:4]),sum(Sigma[:5]),sum(Sigma[:6]),sum(Sigma[:7])

##### Sigma

In [None]:
# Diagonal matrix built from the 4 largest singular values.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
Sig4 = np.asmatrix(np.diag(Sigma[:4]))
Sig4

In [None]:
# Diagonal matrix built from the 5 largest singular values.
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
Sig5 = np.asmatrix(np.diag(Sigma[:5]))
Sig5

In [None]:
Sig5.I

##### Transformed

In [None]:
# Map every item (a column of X) into the 5-dimensional space: X^T * U_5 * Sig5^-1.
# Each row of the result is one item expressed in the reduced coordinates.
xformedItems = X.T*U[:,:5]*Sig5.I
xformedItems

In [None]:
def svdEst(dataMat, user, simMeas, item, n_Sig=6):
    """Estimate ``user``'s rating for ``item`` using SVD-reduced item vectors.

    The rating matrix is decomposed with SVD, every item is projected into
    the space spanned by the leading singular vectors, and item-to-item
    similarities are measured in that reduced space. The estimate is the
    similarity-weighted average of the user's existing ratings.

    Args:
        dataMat: rating matrix (rows are users, columns are items, 0 means
            "not rated"). Any 2-D array-like is accepted and viewed as an
            ``np.matrix``.
        user: row index of the user.
        simMeas: callable ``simMeas(colA, colB)`` taking two column matrices
            and returning a similarity.
        item: column index of the item to score.
        n_Sig: number of leading singular values to keep. It is clamped to
            the number of singular values the matrix actually has.

    Returns:
        The similarity-weighted average rating, or 0 when the total
        similarity is 0.

    Raises:
        ValueError: if ``n_Sig`` is smaller than 1.
    """
    if n_Sig < 1:
        raise ValueError("n_Sig must be at least 1")
    # np.asmatrix (np.mat was removed in NumPy 2.0) makes `*` a matrix product.
    dataMat = np.asmatrix(dataMat)
    n = dataMat.shape[1]
    simTotal = 0
    ratSimTotal = 0
    # NOTE: the decomposition is recomputed on every call (once per candidate item).
    U, Sigma, _ = la.svd(dataMat)
    # A small matrix has fewer than n_Sig singular values; never ask for more than exist.
    k = min(n_Sig, len(Sigma))
    SigK = np.asmatrix(np.diag(Sigma[:k]))
    # Rows of xformedItems are the items expressed in the k-dimensional space.
    xformedItems = dataMat.T * np.asmatrix(U)[:, :k] * SigK.I
    for j in range(n):
        userRating = dataMat[user, j]
        if userRating == 0 or j == item:
            continue
        # Similarity between item and j, measured on their reduced-space vectors.
        similarity = simMeas(xformedItems[item, :].T,
                             xformedItems[j, :].T)
        print("the %d and %d similarity is : %f" % (item, j, similarity))
        # Item-based: accumulate the weight and the similarity-weighted rating of j.
        simTotal += similarity
        ratSimTotal += similarity * userRating
    return 0 if simTotal == 0 else ratSimTotal / simTotal

##### recommend

In [None]:
recommend(X,0,estMethod=svdEst)

In [None]:
recommend(X,1,estMethod=svdEst)

#### Summary

- 大数据集SVD操作低频运行
- Sparse
- Similarity 存储
- Cold-Start 推荐转化成搜索

### Ex : 图像压缩

## scikit-image

In [None]:
# Load a sample image from scikit-image, run a Sobel edge filter over it and display the result.
from skimage import data, io, filters
image = data.coins()
# ... or any other NumPy array!
edges = filters.sobel(image)
io.imshow(edges)
io.show()

## SURPRISE

In [5]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9400  0.9384  0.9348  0.9337  0.9427  0.9379  0.0033  
MAE (testset)     0.7408  0.7390  0.7366  0.7342  0.7434  0.7388  0.0032  
Fit time          6.78    6.94    6.82    6.86    6.95    6.87    0.07    
Test time         0.28    0.22    0.21    0.26    0.22    0.24    0.03    


{'fit_time': (6.782602787017822,
  6.9437360763549805,
  6.823808670043945,
  6.861783027648926,
  6.952728748321533),
 'test_mae': array([0.74076198, 0.73902482, 0.73659828, 0.73415997, 0.7433679 ]),
 'test_rmse': array([0.93996987, 0.93836471, 0.93479145, 0.93374258, 0.94273326]),
 'test_time': (0.2798304557800293,
  0.223860502243042,
  0.2138676643371582,
  0.26283812522888184,
  0.22286295890808105)}

In [7]:
data.raw_ratings

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013'),
 ('62', '257', 2.0, '879372434'),
 ('286', '1014', 5.0, '879781125'),
 ('200', '222', 5.0, '876042340'),
 ('210', '40', 3.0, '891035994'),
 ('224', '29', 3.0, '888104457'),
 ('303', '785', 3.0, '879485318'),
 ('122', '387', 5.0, '879270459'),
 ('194', '274', 2.0, '879539794'),
 ('291', '1042', 4.0, '874834944'),
 ('234', '1184', 2.0, '892079237'),
 ('119', '392', 4.0, '886176814'),
 ('167', '486', 4.0, '892738452'),
 ('299', '144', 4.0, '877881320'),
 ('291', '118', 2.0, '874833878'),
 ('308', '1', 4.0, '887736532'),
 ('95', '546', 2.0, '879196566'),
 ('38', '95', 5.0, '892430094'),
 ('102', '768', 2.0, '883748450'),
 ('63', '277', 4.0, '875747401