# 使用陳宏軒老師的 RDF 版本的 SVD

In [45]:
import rdf
import numpy as np
import pandas as pd
import scipy
import scipy.sparse as sparse

In [46]:
train = scipy.sparse.load_npz("../Playground-dataset/06-Recsys-Dataset/train.npz")
test = scipy.sparse.load_npz("../Playground-dataset/06-Recsys-Dataset/test.npz")

train_long = pd.read_csv("../Playground-dataset/06-Recsys-Dataset/train_long.csv")
test_long = pd.read_csv("../Playground-dataset/06-Recsys-Dataset/test_long.csv")
user_index = np.load("../Playground-dataset/06-Recsys-Dataset/user_index.npy")

In [48]:
train_long.head()

Unnamed: 0,user_id,item_id,rating
0,0,2,0.6
1,0,3,0.2
2,0,4,0.4
3,0,5,0.2
4,0,6,0.4


In [49]:
user_index

array([10158,  7232,  6580, ...,  7793,  2810,  1919])

### 將資料轉換成 rdf 套件可吃的格式

可吃的格式：<br/>
[(customer_id, product_id, rating), (customer_id, product_id, rating), ...]

In [50]:
ll = train_long.apply(lambda x: (x[0], x[1], x[2]), axis = 1)
X = list(ll)

In [51]:
X[:3]

[(0.0, 2.0, 0.6), (0.0, 3.0, 0.2), (0.0, 4.0, 0.4)]

In [52]:
n_users, n_items = rdf.utils.get_num_users_items(X)

In [53]:
n_users

12921

In [54]:
n_items

300

In [55]:
# 測試資料集，也進行相同轉換
ll = test_long.apply(lambda x: (x[0], x[1], x[2]), axis = 1)
X_test = list(ll)

### 初始化模型

In [56]:
model = rdf.rdfsvd.RDFSVD(n_users=n_users,
                          n_items=n_items,
                          lr=.005,
                          lmbda_p=500,
                          lmbda_q=500,
                          lmbda_u=.01,
                          lmbda_i=.01,
                          method="linear")

### 進行訓練

In [57]:
model.train(X)

After 1 epochs, training rmse=0.285906
After 2 epochs, training rmse=0.277926
After 3 epochs, training rmse=0.272663
After 4 epochs, training rmse=0.268469
After 5 epochs, training rmse=0.265034
After 6 epochs, training rmse=0.262175
After 7 epochs, training rmse=0.259768
After 8 epochs, training rmse=0.257724
After 9 epochs, training rmse=0.255975
After 10 epochs, training rmse=0.254468
After 11 epochs, training rmse=0.253164
After 12 epochs, training rmse=0.252029
After 13 epochs, training rmse=0.251039
After 14 epochs, training rmse=0.250172
After 15 epochs, training rmse=0.249410
After 16 epochs, training rmse=0.248740
After 17 epochs, training rmse=0.248148
After 18 epochs, training rmse=0.247626
After 19 epochs, training rmse=0.247163
After 20 epochs, training rmse=0.246754
After 21 epochs, training rmse=0.246390
After 22 epochs, training rmse=0.246067
After 23 epochs, training rmse=0.245779
After 24 epochs, training rmse=0.245523
After 25 epochs, training rmse=0.245295
After 26 

### 進行預測與評估

#### 使用 RMSE 來評估

In [58]:
X_pred = model.predict(X_test)

In [59]:
X_pred[:10]

[(0.0, 0.0, 0.6122254462282288),
 (0.0, 1.0, 0.5245011943953931),
 (0.0, 11.0, 0.3847501054503057),
 (0.0, 13.0, 0.460538104959841),
 (1.0, 26.0, 0.35958155752512033),
 (1.0, 30.0, 0.40467867057363544),
 (1.0, 33.0, 0.44459735310403126),
 (1.0, 34.0, 0.45005007965108024),
 (5.0, 15.0, 0.6862577631943483),
 (5.0, 46.0, 0.4951212746541251)]

In [60]:
X_test[:10]

[(0.0, 0.0, 0.4),
 (0.0, 1.0, 0.2),
 (0.0, 11.0, 0.2),
 (0.0, 13.0, 0.2),
 (1.0, 26.0, 0.16666666666666666),
 (1.0, 30.0, 0.16666666666666666),
 (1.0, 33.0, 0.16666666666666666),
 (1.0, 34.0, 0.16666666666666666),
 (5.0, 15.0, 0.6666666666666666),
 (5.0, 46.0, 0.3333333333333333)]

In [61]:
rdf.utils.compute_rmse(X_test, X_pred)

0.24568299993490805

#### 使用 Top@K 來評估

In [62]:
val_user_ids, val_movie_ids, predictions = zip(*X_pred)

predict_mat = sparse.csr_matrix(
    (predictions, (val_user_ids, val_movie_ids)), shape=(n_users, n_items)
)

In [63]:
predict_mat

<12921x300 sparse matrix of type '<class 'numpy.float64'>'
	with 20672 stored elements in Compressed Sparse Row format>

In [64]:
test_sub = test[user_index]
predict_mat_sub = predict_mat[user_index]

In [65]:
top_k = 4

predict_top_k = []
for i in range(len(user_index)):
    # 使用 csr 的 indices 和 data，都只會回傳 non-zero 的 entry 資料
    predict_r = predict_mat_sub[i].indices[predict_mat_sub[i].data.argsort()[::-1]][:top_k]
    true_r = test_sub[i].indices[test_sub[i].data.argsort()[::-1][:top_k]]
    pre = len(set(predict_r) & set(true_r))/ float(top_k)
    predict_top_k.append(pre)

np.mean(predict_top_k)

1.0