In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import scipy
import scipy.sparse as sparse

In [3]:
train = scipy.sparse.load_npz("../Playground-dataset/06-Recsys-Dataset/train.npz")
test = scipy.sparse.load_npz("../Playground-dataset/06-Recsys-Dataset/test.npz")

train_long = pd.read_csv("../Playground-dataset/06-Recsys-Dataset/train_long.csv")
test_long = pd.read_csv("../Playground-dataset/06-Recsys-Dataset/test_long.csv")
user_index = np.load("../Playground-dataset/06-Recsys-Dataset/user_index.npy")

In [47]:
# 經過篩選後的 n_user, n_items
n_users = train.shape[0]
n_items = train.shape[1]

print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))

Number of users: 12921
Number of items: 300


### 建模

In [None]:
# alpha: The rate in which we'll increase our confidence in a preference with more interactions.
# alpha_val = 20
# interaction_sparse = (interaction_sparse * alpha_val).astype('double')
# train = (train * alpha_val).astype('double')
# test = (test * alpha_val).astype('double')

In [4]:
from lightfm import LightFM


model = LightFM(no_components=20, learning_rate=0.005, loss='bpr', random_state=12)



In [5]:
model.get_params()

{'loss': 'bpr',
 'learning_schedule': 'adagrad',
 'no_components': 20,
 'learning_rate': 0.005,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.0,
 'user_alpha': 0.0,
 'random_state': RandomState(MT19937) at 0x11D9B1D10}

In [8]:
model.fit(train, epochs=50, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49


<lightfm.lightfm.LightFM at 0x10dcd9fd0>

### 進行預測與評估

#### RMSE

In [37]:
val_user_ids = np.array(test_long["user_id"].astype(np.int32))
val_item_ids = np.array(test_long["item_id"].astype(np.int32))
val_ratings = np.array(test_long["rating"])

In [42]:
# Ref: https://github.com/lyst/lightfm/blob/9ffeacbdc4688e9b58c6e5edfdeb52b037608a6b/lightfm/lightfm.py#L784
predictions = model.predict(val_user_ids, val_item_ids)  # 請參考原始碼，predict()需要吃numpy.array格式

In [43]:
def compute_rmse(X_test, X_pred):
    # Ref: https://github.com/ncu-dart/rdf/blob/master/rdf/utils.py
    sse = 0.
    for i in range(len(X_test)):
        sse += (X_test[i] - X_pred[i]) ** 2
    return (sse / len(X_test)) ** .5

In [44]:
compute_rmse(val_ratings, predictions)

0.7219049523764558

#### 使用 Top@K 來評估

In [45]:
test

<12921x300 sparse matrix of type '<class 'numpy.float64'>'
	with 20672 stored elements in Compressed Sparse Row format>

In [48]:
predict_mat = sparse.csr_matrix(
    (predictions, (val_user_ids, val_item_ids)), shape=(n_users, n_items)
)

In [49]:
predict_mat

<12921x300 sparse matrix of type '<class 'numpy.float64'>'
	with 20672 stored elements in Compressed Sparse Row format>

In [50]:
test_sub = test[user_index]
predict_mat_sub = predict_mat[user_index]

In [51]:
top_k = 4

predict_top_k = []
for i in range(len(user_index)):
    # 使用 csr 的 indices 和 data，都只會回傳 non-zero 的 entry 資料
    predict_r = predict_mat_sub[i].indices[predict_mat_sub[i].data.argsort()[::-1]][:top_k]
    true_r = test_sub[i].indices[test_sub[i].data.argsort()[::-1][:top_k]]
    pre = len(set(predict_r) & set(true_r))/ float(top_k)
    predict_top_k.append(pre)

np.mean(predict_top_k)

1.0