In [2]:
from handyrec.examples.utils import DataProcessor

In [52]:
from handyrec.models.match import YouTubeDNN
from handyrec.features import DenseFeature, SparseFeature, SparseSeqFeature
from tensorflow.keras import Model
import numpy as np
import faiss

In [4]:
import warnings

# Install a blanket "ignore" filter so warnings raised by later cells are not
# printed in the cell outputs.
warnings.simplefilter('ignore')

In [29]:
# Hyper-parameters shared by the cells below.
EMBEDDING_DIM = 128   # embedding width; also the last user-DNN layer size and the FAISS index dimension
SEQ_LEN = 40          # passed as seq_max_len and as the 'hist_movie_id' sequence length
BATCH_SIZE = 1 << 12  # 4096, passed to model.fit
NEPOCH = 100          # upper bound on training epochs passed to model.fit

In [6]:
# Create the example data processor over the local './ml-1m/' directory
# (presumably the MovieLens-1M files — confirm the layout DataProcessor expects).
dp = DataProcessor('./ml-1m/')

In [7]:
# Column names handed to gen_data_set below.
features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
# Column names handed to preprocess_data; the progress output labels this step
# "Encode User Sparse Feats", so presumably these get label-encoded — TODO confirm.
lbd_features = ['gender','age','occupation', 'zip']
# `data` is indexed later as data['user'][...] and data['item'][...].
data = dp.preprocess_data(lbd_features)
# Generates the data set from `data` with sequences capped at SEQ_LEN.
# NOTE(review): negnum=0 presumably means no explicit negative samples are
# generated here — confirm against DataProcessor.gen_data_set.
dp.gen_data_set(features, data, seq_max_len=SEQ_LEN, negnum=0)

Encode User Sparse Feats: 100%|██████████| 4/4 [00:00<00:00, 235.30it/s]
Encode Item Sparse Feats: 0it [00:00, ?it/s]
Generate train set: 100%|██████████| 6040/6040 [00:08<00:00, 710.82it/s] 
100%|██████████| 4/4 [00:00<00:00,  8.07it/s]
0it [00:00, ?it/s]


In [8]:
# Column names for the user side and the movie side, passed to load_dataset.
user_features = ['user_id','gender','age','occupation', 'zip']
movie_features = ['movie_id']
# Train/test inputs and their labels; `train`/`train_label` are fed to model.fit
# later and `test` is indexed by feature name (test['user_id']).
train, train_label, test, test_label = dp.load_dataset(user_features, movie_features)

Load user Features: 100%|██████████| 7/7 [00:00<00:00, 79.05it/s]
Load movie Features: 100%|██████████| 1/1 [00:00<00:00, 500.22it/s]


In [9]:
# Size per feature: column maximum + 1, read from the 'user' table for the
# user features and from the 'item' table for the movie features.
feature_dim = {}
for table, names in (('user', user_features), ('item', movie_features)):
    for feat in names:
        feature_dim[feat] = data[table][feat].max() + 1

In [10]:
# User-side column names, split by how they are wrapped into feature objects
# in the next cell (DenseFeature vs. SparseFeature).
user_dense_feats = list()  # no dense user columns in this example
user_sparse_feats = [
    'user_id',
    'gender',
    'age',
    'occupation',
    'zip',
]

In [11]:
# Feature objects later passed as the first argument to YouTubeDNN, in order:
# one SparseFeature per sparse column, one DenseFeature per dense column, then
# the 'hist_movie_id' sequence built on a movie_id SparseFeature.
# NOTE: this rebinds `user_features`, which until here held plain column-name
# strings (that is what the feature_dim cell iterates over).
user_features = [
    *(SparseFeature(name, feature_dim[name], EMBEDDING_DIM) for name in user_sparse_feats),
    *(DenseFeature(name) for name in user_dense_feats),
    SparseSeqFeature(
        SparseFeature('movie_id', feature_dim['movie_id'], EMBEDDING_DIM),
        'hist_movie_id',
        SEQ_LEN,
    ),
]
# Feature object later passed as the second argument to YouTubeDNN.
item_id = SparseFeature('movie_id', feature_dim['movie_id'], EMBEDDING_DIM)

In [12]:
import tensorflow.keras.backend as K
def sampledsoftmaxloss(y_true, y_pred):
    """Keras-style loss that returns the mean of ``y_pred``.

    ``y_true`` is accepted only so the function has the ``(y_true, y_pred)``
    loss signature; it is not used. Presumably the model output already is
    the sampled-softmax loss value — confirm against YouTubeDNN.
    """
    batch_mean = K.mean(y_pred)
    return batch_mean

import tensorflow as tf
# Disable eager execution through the TF1 compatibility API.
# NOTE(review): placed before the cell that builds the model; presumably the
# model/loss relies on graph mode — confirm before moving or removing.
tf.compat.v1.disable_eager_execution()

In [56]:
# Build the YouTubeDNN model from the user feature list and the item id feature.
# The last hidden layer is EMBEDDING_DIM wide, matching the FAISS index
# dimension used for retrieval later.
model = YouTubeDNN(
    user_features,
    item_id,
    num_sampled=5,
    user_dnn_hidden_units=(256, 128, EMBEDDING_DIM),
    dnn_dropout=0.2,
)

In [57]:
# Compile with the pass-through loss defined above.
model.compile(loss=sampledsoftmaxloss, optimizer="Adam")

# Stop once the monitored loss has not improved for 10 consecutive epochs.
# NOTE(review): this monitors the *training* loss even though validation_split
# holds out 10% of the samples; consider monitor='val_loss' once it is
# confirmed that the model's validation output is a comparable loss value.
early_stop = tf.keras.callbacks.EarlyStopping(patience=10, monitor='loss')

history = model.fit(
    train,
    train_label,
    epochs=NEPOCH,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1,
)

# Persist the weights as they are when fit() returns.
model.save_weights('youtubednn.h5')

Train on 845828 samples, validate on 93981 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/

In [58]:
# Reload the weights written by the training cell ('youtubednn.h5').
model.load_weights('youtubednn.h5')

In [91]:
# Produce user embeddings for the test inputs and item embeddings for every
# movie id in the item table, to be used for retrieval below.
all_item_model_input = {"movie_id": data['item']['movie_id'].values}

# Sub-models that map the trained model's user/item inputs to its
# user/item embedding outputs.
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

# Reuse the shared BATCH_SIZE constant (2**12) instead of repeating the literal.
user_embs = user_embedding_model.predict(test, batch_size=BATCH_SIZE)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=BATCH_SIZE)

print(user_embs.shape)
print(item_embs.shape)

(6040, 128)
(3883, 128)


In [92]:
# Flat inner-product FAISS index of dimension EMBEDDING_DIM holding the item
# embeddings; it is searched with the user embeddings in the next cell.
index = faiss.IndexFlatIP(EMBEDDING_DIM)
index.add(item_embs)

In [93]:
# Search the item index with every user embedding, 10 results per user.
# D holds the returned scores, I the row positions of the hits inside the index
# (i.e. positions into the item array that was added to it).
D, I = index.search(np.ascontiguousarray(user_embs), 10)

# Translate index positions back to movie ids with a single integer-array
# lookup. This keeps exactly one row per user, so `s` stays aligned with
# `test_label`; the previous per-row loop used a bare `except:` that would
# have silently dropped a row (and shifted all later ones) on any error.
s = data['item']['movie_id'].values[I]

In [94]:
def apk(actual, predicted, k=10):
    """Average precision at k for a single ranked prediction list.

    Args:
        actual: collection of relevant items (order does not matter).
        predicted: ranked sequence of predicted items, best first.
        k: number of leading predictions to evaluate.

    Returns:
        The sum of precision@i over every rank i (1-based, within the first k
        predictions) that holds a relevant item not already seen earlier in
        the list, divided by ``min(len(actual), k)``. Returns 0.0 when
        ``actual`` is empty or ``k`` is not positive, instead of dividing by
        zero.
    """
    # Use len() rather than truthiness so NumPy arrays are accepted as well.
    if k <= 0 or len(actual) == 0:
        return 0.0

    predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        # A repeated prediction only counts the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Mean average precision at k.

    Pairs ``actual`` and ``predicted`` element-wise (stopping at the shorter
    one), scores each pair with ``apk`` and returns the NumPy mean of the
    scores. Note the default ``k`` here is 12, whereas ``apk`` defaults to 10.
    """
    per_pair_scores = []
    for truth, ranked in zip(actual, predicted):
        per_pair_scores.append(apk(truth, ranked, k))
    return np.mean(per_pair_scores)


def rk(actual, predicted, k=10):
    """Recall at k for a single ranked prediction list.

    Args:
        actual: collection of relevant items.
        predicted: ranked sequence of predicted items, best first.
        k: number of leading predictions to evaluate.

    Returns:
        The fraction of entries in ``actual`` that appear among the first
        ``k`` predictions. Returns 0.0 when ``actual`` is empty instead of
        dividing by zero.
    """
    # Use len() rather than truthiness so NumPy arrays are accepted as well.
    if len(actual) == 0:
        return 0.0

    top_k = predicted[:k]
    hits = sum(1 for r in actual if r in top_k)
    return hits / len(actual)

def recall_at_k(actual, predicted, k=12):
    """Mean recall at k over paired (actual, predicted) lists.

    Pairs ``actual`` and ``predicted`` element-wise and averages ``rk`` over
    the pairs. Pairs whose ``actual`` is empty are ignored, because recall is
    undefined without any relevant item. Returns 0.0 when no pair has a
    non-empty ``actual``.
    """
    # len() rather than truthiness so NumPy arrays are accepted as well.
    scores = [
        rk(a, p, k)
        for a, p in zip(actual, predicted)
        if len(a) > 0
    ]
    if not scores:
        return 0.0
    return np.mean(scores)

In [95]:
# MAP@10 of the retrieved movie ids `s` against the test labels.
mapk(test_label, s, k=10)

0.01860089482812993

In [96]:
# Recall@10 of the retrieved movie ids `s` against the test labels.
recall_at_k(test_label, s, k=10)

0.050711920529801315