In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import scipy
import scipy.sparse as sparse

In [3]:
train = scipy.sparse.load_npz("../Playground-dataset/06-Recsys-Dataset/train.npz")
test = scipy.sparse.load_npz("../Playground-dataset/06-Recsys-Dataset/test.npz")

train_long = pd.read_csv("../Playground-dataset/06-Recsys-Dataset/train_long.csv")
test_long = pd.read_csv("../Playground-dataset/06-Recsys-Dataset/test_long.csv")
user_index = np.load("../Playground-dataset/06-Recsys-Dataset/user_index.npy")

### 建置 NCF 模型

In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, concatenate, Multiply, Dense, Dropout, Add
from tensorflow.keras.optimizers import Adam

In [5]:
# 經過篩選後的 n_user, n_items
n_users = train.shape[0]
n_items = train.shape[1]

print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))

Number of users: 12921
Number of items: 300


In [6]:
def get_ncf_model():
    user_inp = Input((1,))
    user_hidden = Embedding(input_dim=n_users, output_dim=64)(user_inp)
    user_hidden = Flatten()(user_hidden)
    
    item_inp = Input((1,))
    item_hidden = Embedding(input_dim=n_items, output_dim=64)(item_inp)
    item_hidden = Flatten()(item_hidden)
    
    # element-wise multiplication
    mf_output = Multiply()([user_hidden, item_hidden])
    
    hidden = concatenate([user_hidden, item_hidden])
    hidden = Dense(32, activation='relu')(hidden)
    # hidden = Dropout(0.2)(hidden)
    hidden = Dense(16, activation='relu')(hidden)
    mlp_output = Dense(8, activation='relu')(hidden)    

    
    output = concatenate([mf_output, mlp_output])
    output = Dense(1, activation='sigmoid')(output)
    
    model = Model(inputs=[user_inp, item_inp], outputs=output)
    model.compile(loss='mse', optimizer='adam')
    return model

model = get_ncf_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 64)        826944      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 64)        19200       input_2[0][0]                    
______________________________________________________________________________________________

In [7]:
from tensorflow.keras.callbacks import EarlyStopping

# early stopping wait for 3 epoch
callbacks = [EarlyStopping(patience=3, restore_best_weights=True)]

In [8]:
train_user_ids = train_long["user_id"]
train_movie_ids = train_long["item_id"]
train_ratings = train_long["rating"]

In [9]:
val_user_ids = test_long["user_id"]
val_movie_ids = test_long["item_id"]
val_ratings = test_long["rating"]

In [10]:
# train for 50 epochs
model.fit([train_user_ids, train_movie_ids],
          train_ratings,
          validation_data=([val_user_ids, val_movie_ids], val_ratings),
          epochs=50,
          batch_size=128,
          callbacks=callbacks)

Train on 91572 samples, validate on 20672 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


<tensorflow.python.keras.callbacks.History at 0x138689490>

### 進行預測與評估

#### RMSE

In [11]:
def compute_rmse(X_test, X_pred):
    # Ref: https://github.com/ncu-dart/rdf/blob/master/rdf/utils.py
    sse = 0.
    for i in range(len(X_test)):
        sse += (X_test[i] - X_pred[i]) ** 2
    return (sse / len(X_test)) ** .5

In [12]:
# prediction & evalutation
predictions = model.predict([val_user_ids, val_movie_ids])
predictions = predictions.reshape(-1)

In [13]:
predictions[:5]

array([0.56957996, 0.42377928, 0.34324116, 0.41935128, 0.31019536],
      dtype=float32)

In [14]:
test_long["rating"]

0        0.400000
1        0.200000
2        0.200000
3        0.200000
4        0.166667
           ...   
20667    0.333333
20668    1.000000
20669    1.000000
20670    1.000000
20671    1.000000
Name: rating, Length: 20672, dtype: float64

In [15]:
compute_rmse(test_long["rating"], predictions)

0.23181205565367774

#### 使用 Top@K 來評估

In [16]:
test

<12921x300 sparse matrix of type '<class 'numpy.float64'>'
	with 20672 stored elements in Compressed Sparse Row format>

In [17]:
predict_mat = sparse.csr_matrix(
    (predictions, (val_user_ids, val_movie_ids)), shape=(n_users, n_items)
)

In [18]:
predict_mat

<12921x300 sparse matrix of type '<class 'numpy.float32'>'
	with 20672 stored elements in Compressed Sparse Row format>

In [19]:
test_sub = test[user_index]
predict_mat_sub = predict_mat[user_index]

In [20]:
top_k = 4

predict_top_k = []
for i in range(len(user_index)):
    # 使用 csr 的 indices 和 data，都只會回傳 non-zero 的 entry 資料
    predict_r = predict_mat_sub[i].indices[predict_mat_sub[i].data.argsort()[::-1]][:top_k]
    true_r = test_sub[i].indices[test_sub[i].data.argsort()[::-1][:top_k]]
    pre = len(set(predict_r) & set(true_r))/ float(top_k)
    predict_top_k.append(pre)

np.mean(predict_top_k)

1.0