# 实验二 基于奇异值分解的推荐算法

## 1 数据预处理

In [73]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

In [None]:
# 读取数据
movies = pd.read_csv("movies.dat", sep="::", header=None, names=['MovieID','Title','Genres'])
users = pd.read_csv("users.dat", sep="::", header=None, names=['UserID','Gender','Age','Occupation','Zip-code'])
ratings = pd.read_csv("ratings.dat", sep="::", header=None, names=['UserID','MovieID','Rating','Timestamp'])

In [7]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [8]:
users

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [9]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [24]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [26]:
A = []

In [27]:
def preprocess(x):
    if x['Rating'] != 0:
        A.append([x['UserID'], x['MovieID'], x['Rating']])

In [28]:
ratings.apply(lambda x:preprocess(x), axis=1)

0          None
1          None
2          None
3          None
4          None
           ... 
1000204    None
1000205    None
1000206    None
1000207    None
1000208    None
Length: 1000209, dtype: object

## 2 数据集划分

In [44]:
# 对数据集A打乱后分割，等效于无放回抽取

random.shuffle(A)
train_set = A[:int(len(A)*0.8)]
test_set = A[int(len(A)*0.8):int(len(A)*0.9)]
val_set = A[int(len(A)*0.9):]

In [48]:
len(train_set)

800167

In [49]:
len(test_set)

100021

In [50]:
len(val_set)

100021

## 3 基于SVD的推荐算法

In [94]:
import numpy as np
import random


def build_mat(k):
    # 用于建立pu，qi的函数
    return np.random.uniform(0,1,(k,1))


# mat 格式: [[uid, iid, rate],...]
class SVD:
    def __init__(self, K=20):
        self.K = K
        # 使用字典结构存储p q
        self.p = {}
        self.q = {}
            
    
    def predict(self, uid, iid):
        self.p.setdefault(uid,build_mat(self.K))
        self.q.setdefault(iid,build_mat(self.K))
        # 预测评分
        rate = np.sum(self.p[uid] * self.q[iid])
        if rate > 5:
            rate = 5
        if rate < 1:
            rate = 1
        return rate
    
    
    def train(self, train_data, lr=0.15, epoch=30):
        mat = np.array(train_data)
        data_size = mat.shape
        list_rmse = []
        print('Train data size', data_size, ', K =', self.K)
        for e in range(epoch):
            print('Epoch', e+1, ':', end=' ')
            rmse = 0.0
            for i in range(data_size[0]):
                uid = mat[i,0]
                iid = mat[i,1]
                rate = mat[i,2]
                eui = rate - self.predict(uid, iid)
                rmse += eui**2
                temp = self.p[uid]
                self.p[uid] += lr*eui*self.q[iid]
                self.q[iid] += lr*eui*temp
            print('RMSE is', np.sqrt(rmse/data_size[0]))
            list_rmse.append(np.sqrt(rmse/data_size[0]))
        return list_rmse
    
    
    def test(self, test_data):
        mat = np.array(test_data)
        data_size = mat.shape
        print('Test data size', data_size)
        rmse=0.0
        for i in range(data_size[0]):
            uid = mat[i,0]
            iid = mat[i,1]
            rate = mat[i,2]
            eui = rate - self.predict(uid, iid)
            rmse += eui**2
        print('RMSE is',np.sqrt(rmse/data_size[0]))

        

In [95]:
def draw(y, K, lr):
    x = range(len(y))
    plt.plot(x,y)
    title = 'K=' + str(K) + ', lr=' + str(lr)
    plt.title(title)
    plt.show()

## 4 算法调参

In [96]:
# K=10, lr=0.01
svd_1 = SVD(10)
list1 = svd_1.train(train_set, 0.01)

Train data size (800167, 3) , K = 10
Epoch 1 : RMSE is 1.0036747357818296
Epoch 2 : RMSE is 0.9280968771389356
Epoch 3 : RMSE is 0.9181640824005036
Epoch 4 : RMSE is 0.9099507001007817
Epoch 5 : RMSE is 0.9001184529202018
Epoch 6 : RMSE is 0.8896998428466312
Epoch 7 : RMSE is 0.8800783432532298
Epoch 8 : RMSE is 0.8712912311022255
Epoch 9 : RMSE is 0.8631562245021301
Epoch 10 : RMSE is 0.85564869326436
Epoch 11 : RMSE is 0.8488170201395241
Epoch 12 : RMSE is 0.8426902793808299
Epoch 13 : RMSE is 0.8372501792992892
Epoch 14 : RMSE is 0.8324474883513829
Epoch 15 : RMSE is 0.828218232141755
Epoch 16 : RMSE is 0.8244922773629577
Epoch 17 : RMSE is 0.8212049699973216
Epoch 18 : RMSE is 0.8182967967910869
Epoch 19 : RMSE is 0.8157161367533696
Epoch 20 : RMSE is 0.8134219192457515
Epoch 21 : RMSE is 0.8113793017194528
Epoch 22 : RMSE is 0.8095540220849914
Epoch 23 : RMSE is 0.8079227483093567
Epoch 24 : RMSE is 0.8064629816223624
Epoch 25 : RMSE is 0.8051574789567022
Epoch 26 : RMSE is 0.8039

In [100]:
# K=20, lr=0.01
svd_2 = SVD(20)
list2 = svd_2.train(train_set, 0.01)

Train data size (800167, 3) , K = 20
Epoch 1 : RMSE is 1.0394005559400168
Epoch 2 : RMSE is 0.9349436133525635
Epoch 3 : RMSE is 0.9142963126142188
Epoch 4 : RMSE is 0.8984071266169552
Epoch 5 : RMSE is 0.8822959111096491
Epoch 6 : RMSE is 0.8667816584761195
Epoch 7 : RMSE is 0.8524664400693358
Epoch 8 : RMSE is 0.8393884466683902
Epoch 9 : RMSE is 0.827540648572773
Epoch 10 : RMSE is 0.8169221614325733
Epoch 11 : RMSE is 0.8074705638991208
Epoch 12 : RMSE is 0.7990823681037481
Epoch 13 : RMSE is 0.7916403725719935
Epoch 14 : RMSE is 0.7850291661901669
Epoch 15 : RMSE is 0.7791417880149908
Epoch 16 : RMSE is 0.7738826619522615
Epoch 17 : RMSE is 0.7691709855383284
Epoch 18 : RMSE is 0.7649370239513701
Epoch 19 : RMSE is 0.7611214588652937
Epoch 20 : RMSE is 0.7576727625142395
Epoch 21 : RMSE is 0.754546153592956
Epoch 22 : RMSE is 0.7517070233037723
Epoch 23 : RMSE is 0.749122585453588
Epoch 24 : RMSE is 0.746765362332826
Epoch 25 : RMSE is 0.7446142553445981
Epoch 26 : RMSE is 0.74264

In [102]:
# K=30, lr=0.01
svd_3 = SVD(30)
list3 = svd_3.train(train_set, 0.01)

Train data size (800167, 3) , K = 30
Epoch 1 : RMSE is 1.1431147162516517
Epoch 2 : RMSE is 0.9515319585156793
Epoch 3 : RMSE is 0.9149089905477316
Epoch 4 : RMSE is 0.8908594818527249
Epoch 5 : RMSE is 0.8685380960079736
Epoch 6 : RMSE is 0.8477832847368021
Epoch 7 : RMSE is 0.8293218461013956
Epoch 8 : RMSE is 0.8130178085545315
Epoch 9 : RMSE is 0.798591021211643
Epoch 10 : RMSE is 0.785808431916792
Epoch 11 : RMSE is 0.77445117904542
Epoch 12 : RMSE is 0.7643405479313298
Epoch 13 : RMSE is 0.7553112133949625
Epoch 14 : RMSE is 0.7472199934280851
Epoch 15 : RMSE is 0.7399463226904194
Epoch 16 : RMSE is 0.7333898285992922
Epoch 17 : RMSE is 0.7274624563153118
Epoch 18 : RMSE is 0.7220911750875496
Epoch 19 : RMSE is 0.7172105159829576
Epoch 20 : RMSE is 0.7127636032678087
Epoch 21 : RMSE is 0.7087007134140373
Epoch 22 : RMSE is 0.7049787277581984
Epoch 23 : RMSE is 0.7015647005377645
Epoch 24 : RMSE is 0.6984278649371354
Epoch 25 : RMSE is 0.6955408807902712
Epoch 26 : RMSE is 0.69287

In [104]:
# K=40, lr=0.01
svd_4 = SVD(40)
list4 = svd_4.train(train_set, 0.01)

Train data size (800167, 3) , K = 40
Epoch 1 : RMSE is 1.230847129733242
Epoch 2 : RMSE is 0.9741039500440023
Epoch 3 : RMSE is 0.9174123484771943
Epoch 4 : RMSE is 0.8830989076823391
Epoch 5 : RMSE is 0.8546998487144524
Epoch 6 : RMSE is 0.829896949935248
Epoch 7 : RMSE is 0.808184756158512
Epoch 8 : RMSE is 0.7890910843577512
Epoch 9 : RMSE is 0.7722283447507422
Epoch 10 : RMSE is 0.7572821731134644
Epoch 11 : RMSE is 0.7439941760265281
Epoch 12 : RMSE is 0.7321491637591362
Epoch 13 : RMSE is 0.7215595077005938
Epoch 14 : RMSE is 0.7120629366571993
Epoch 15 : RMSE is 0.7035223725835786
Epoch 16 : RMSE is 0.6958181631870018
Epoch 17 : RMSE is 0.688846893014976
Epoch 18 : RMSE is 0.6825172524856649
Epoch 19 : RMSE is 0.6767501684377044
Epoch 20 : RMSE is 0.6714794131110439
Epoch 21 : RMSE is 0.6666526332135816
Epoch 22 : RMSE is 0.6622219111720684
Epoch 23 : RMSE is 0.6581463243459612
Epoch 24 : RMSE is 0.6543847061464818
Epoch 25 : RMSE is 0.6509078050250208
Epoch 26 : RMSE is 0.64768

In [107]:
# K=40, lr=0.02
svd_5 = SVD(40)
list5 = svd_5.train(train_set, 0.02)

Train data size (800167, 3) , K = 40
Epoch 1 : RMSE is 1.1564076581545188
Epoch 2 : RMSE is 0.9339130291412165
Epoch 3 : RMSE is 0.8732466098202804
Epoch 4 : RMSE is 0.8310503762092067
Epoch 5 : RMSE is 0.8002818919173769
Epoch 6 : RMSE is 0.7777795104092066
Epoch 7 : RMSE is 0.7617851438210562
Epoch 8 : RMSE is 0.7512338971187028
Epoch 9 : RMSE is 0.7456758211146208
Epoch 10 : RMSE is 0.7454056594537524
Epoch 11 : RMSE is 0.7526353829260876
Epoch 12 : RMSE is 0.778006101473576
Epoch 13 : RMSE is 0.8996010236228626
Epoch 14 : RMSE is 1.488891212443432
Epoch 15 : RMSE is 2.0960361571656727
Epoch 16 : RMSE is 2.1840295780422694
Epoch 17 : RMSE is 2.190895763138646
Epoch 18 : RMSE is 2.200785127796769
Epoch 19 : RMSE is 2.21197985220178
Epoch 20 : RMSE is 2.2319467609803603
Epoch 21 : RMSE is 2.253188064990307
Epoch 22 : RMSE is 2.2605639796941297
Epoch 23 : RMSE is 2.258369229479177
Epoch 24 : RMSE is 2.2633728931910015
Epoch 25 : RMSE is 2.2640044736553433
Epoch 26 : RMSE is 2.267589659