[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Wp-Zhang/HandyRec/blob/master/handyrec/examples/YouTubeDNN.ipynb)

> This notebook runs YouTubeDNN including candidate generation and ranking on MovieLens1M dataset.

## Table of Contents:
* [Prepare data for matching](#section-0)
* [Train match model and export embeddings](#section-1)
* [Use Faiss to generate candidates](#section-2)
* [Train rank model and predict](#section-3)

**Download dataset and install packages**

In [None]:
! git clone https://github.com/Wp-Zhang/HandyRec.git
! pip install faiss-cpu

Cloning into 'HandyRec'...
remote: Enumerating objects: 192, done.[K
remote: Counting objects: 100% (192/192), done.[K
remote: Compressing objects: 100% (146/146), done.[K
remote: Total 192 (delta 76), reused 128 (delta 29), pack-reused 0[K
Receiving objects: 100% (192/192), 229.71 KiB | 1.54 MiB/s, done.
Resolving deltas: 100% (76/76), done.
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 13.2 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.2


In [None]:
! wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip
! unzip -o ml-1m.zip

--2022-03-13 14:51:04--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘./ml-1m.zip’


2022-03-13 14:51:05 (6.84 MB/s) - ‘./ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


**Import relative packages**

In [None]:
import sys
sys.path.append('./HandyRec/')

In [None]:
from handyrec.examples.utils import MatchDataHelper, RankDataHelper, sampledsoftmaxloss
from handyrec.models.match import YouTubeMatchDNN
from handyrec.models.rank import YouTubeRankDNN
from handyrec.features import DenseFeature, SparseFeature, SparseSeqFeature
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.losses import binary_crossentropy
import numpy as np
import pandas as pd
import gc
import faiss
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Calculate map@k"""
    return np.mean(
        [apk(a, p, k) for a, p in zip(actual, predicted)]
    )


def rk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = sum([1 for r in actual if r in predicted])/len(actual)

    return score

def recall_at_k(actual, predicted, k=12):
    """Calculate mean recall@k"""
    return np.mean(
        [rk(a, p, k) for a, p in zip(actual, predicted)]
    )

# 0. Prepare data for matching<a name="section-0"></a>

In [None]:
MATCH_EMBEDDING_DIM = 256
RANK_EMBEDDING_DIM = 256
SEQ_LEN = 40
BATCH_SIZE = 2**12
NEPOCH = 100

NEG_NUM = 10
CANDIDATE_NUM = 100

In [None]:
match_dh = MatchDataHelper('./ml-1m/')
match_features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
lbd_features = ['gender','age','occupation', 'zip']
data = match_dh.preprocess_data(lbd_features)

Encode User Sparse Feats: 100%|██████████| 4/4 [00:00<00:00, 55.85it/s]
Encode Item Sparse Feats: 0it [00:00, ?it/s]


In [None]:
match_dh.gen_data_set(match_features, data, seq_max_len=SEQ_LEN)

Generate train set: 100%|██████████| 6040/6040 [00:12<00:00, 477.00it/s] 
100%|██████████| 4/4 [00:00<00:00,  6.62it/s]
0it [00:00, ?it/s]


In [None]:
match_user_features = ['user_id','gender','age','occupation', 'zip']
match_movie_features = ['movie_id']
match_train, match_train_label, match_test, match_test_label = match_dh.load_dataset(match_user_features, match_movie_features)

Load user Features: 100%|██████████| 8/8 [00:00<00:00, 141.50it/s]
Load movie Features: 100%|██████████| 1/1 [00:00<00:00, 389.08it/s]


# 1. Train match model and export embeddings <a name="section-1"></a>

In [None]:
# Standarize dense feature
stds = StandardScaler()
match_train['example_age'] = stds.fit_transform(match_train['example_age'].reshape(-1,1)).reshape(-1)
match_test['example_age'] = stds.transform(match_test['example_age'].reshape(-1,1)).reshape(-1)
# add example_age^2 as showed in the original paper
match_train['example_age_2'] = match_train['example_age']**2
match_test['example_age_2'] = match_test['example_age']**2

In [None]:
match_feature_dim = {}
for feat in match_user_features:
    match_feature_dim[feat] = data['user'][feat].max()+1
for feat in match_movie_features:
    match_feature_dim[feat] = data['item'][feat].max()+1

In [None]:
match_user_dense_feats = ['example_age','example_age_2']
match_user_sparse_feats = ['user_id','gender','age','occupation', 'zip']

In [None]:
match_user_features = [SparseFeature(x, match_feature_dim[x], MATCH_EMBEDDING_DIM) for x in match_user_sparse_feats] +\
                [DenseFeature(x) for x in match_user_dense_feats] +\
                [SparseSeqFeature(SparseFeature('movie_id', match_feature_dim['movie_id'], MATCH_EMBEDDING_DIM), 'hist_movie_id',SEQ_LEN)]
match_item_id = SparseFeature('movie_id', match_feature_dim['movie_id'], MATCH_EMBEDDING_DIM)

In [None]:
tf.compat.v1.disable_eager_execution()

In [None]:
match_model = YouTubeMatchDNN(
    match_user_features, match_item_id, num_sampled=100, 
    user_dnn_hidden_units=(1024,512,MATCH_EMBEDDING_DIM), dnn_dropout=0.2
)

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
match_model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3), loss=sampledsoftmaxloss)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='./match_checkpoint/',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)
history = match_model.fit(match_train, match_train_label,
                            batch_size=BATCH_SIZE, 
                            epochs=NEPOCH,
                            verbose=1,
                            validation_split=0.1,
                            callbacks=[early_stop,checkpoint])
# model.load_weights('youtubematch.h5')

Train on 845828 samples, validate on 93981 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


In [None]:
# model.load_weights('youtubematch.h5')
match_model.load_weights('./match_checkpoint/')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0bc86985d0>

In [None]:
all_item_model_input = {"movie_id": data['item']['movie_id'].values}

user_embedding_model = Model(inputs=match_model.user_input, outputs=match_model.user_embedding)
item_embedding_model = Model(inputs=match_model.item_input, outputs=match_model.item_embedding)

user_embs = user_embedding_model.predict(match_test, batch_size=2 ** 15)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 15)

print(user_embs.shape)
print(item_embs.shape)

(6040, 256)
(3883, 256)


# 2. Use Faiss to generate candidates <a name="section-2"></a>

## Test match model

In [None]:
index = faiss.IndexFlatIP(MATCH_EMBEDDING_DIM)
index.add(item_embs)

In [None]:
D, I = index.search(np.ascontiguousarray(user_embs), CANDIDATE_NUM)
candidates = []
for i, uid in enumerate(match_test['user_id']):
    pred = data['item']['movie_id'].values[I[i]].tolist()
    candidates.append(pred)
candidates = np.array(candidates)

In [None]:
mapk(match_test_label, candidates, k=10)

0.035381523967202774

In [None]:
recall_at_k(match_test_label, candidates, k=10)

0.08612582781456954

In [None]:
recall_at_k(match_test_label, candidates, k=100)

0.4718377483443708

## Prepare data for ranking

In [None]:
test_user_embs = user_embedding_model.predict(match_test, batch_size=2 ** 15)
index = faiss.IndexFlatIP(MATCH_EMBEDDING_DIM)
index.add(item_embs)
D, I = index.search(np.ascontiguousarray(test_user_embs), 100)
test_candidates = {}
for i, uid in tqdm(enumerate(match_test['user_id'])):
    pred = data['item']['movie_id'].values[I[i]].tolist()
    pred = [x for x in pred if x!=match_train_label[i]]
    pred = pred[:NEG_NUM]
    test_candidates[uid] = pred

6040it [00:00, 28015.89it/s]


In [None]:
del user_embs, item_embs, match_train, match_train_label, test_user_embs
gc.collect()

50

In [None]:
rank_dh = RankDataHelper('./ml-1m/')
rank_features = ['user_id', 'gender', 'age', 'occupation', 'zip'] +\
           [f for f in data['item'].columns if f != 'title']

In [None]:
rank_dh.gen_data_set(rank_features, data, test_candidates, seq_max_len=SEQ_LEN, negnum=NEG_NUM)

Generate train set: 100%|██████████| 6040/6040 [00:36<00:00, 167.35it/s]
100%|██████████| 4/4 [00:01<00:00,  2.65it/s]
100%|██████████| 18/18 [00:07<00:00,  2.43it/s]


In [None]:
rank_user_features = ['user_id','gender','age','occupation', 'zip']
rank_movie_features = [f for f in data['item'].columns if f != 'title']
rank_train, rank_train_label, rank_test = rank_dh.load_dataset(rank_user_features, rank_movie_features)

Load user Features: 100%|██████████| 9/9 [00:14<00:00,  1.62s/it]
Load movie Features: 100%|██████████| 19/19 [00:06<00:00,  3.03it/s]


In [None]:
stds = StandardScaler()
rank_train['time_gap'] = stds.fit_transform(rank_train['time_gap'].reshape(-1,1)).reshape(-1)
rank_test['time_gap'] = stds.fit_transform(rank_test['time_gap'].reshape(-1,1)).reshape(-1)
rank_train['time_gap_2'] = rank_train['time_gap']**2
rank_train['time_gap_square'] = np.sqrt(rank_train['time_gap'])
rank_test['time_gap_2'] = rank_test['time_gap']**2
rank_test['time_gap_square'] = np.sqrt(rank_test['time_gap'])

In [None]:
stds2 = StandardScaler()
rank_train['example_age'] = stds2.fit_transform(rank_train['example_age'].reshape(-1,1)).reshape(-1)
rank_test['example_age'] = stds2.fit_transform(rank_test['example_age'].reshape(-1,1)).reshape(-1)
rank_train['example_age_2'] = rank_train['example_age']**2
rank_test['example_age_2'] = rank_test['example_age']**2

In [None]:
rank_feature_dim = {}
for feat in rank_user_features:
    rank_feature_dim[feat] = data['user'][feat].max()+1
for feat in rank_movie_features:
    rank_feature_dim[feat] = data['item'][feat].max()+1

# 3. Train rank model and predict <a name="section-3"></a>

In [None]:
rank_user_dense_feats = ['time_gap','time_gap_2','time_gap_square','example_age','example_age_2']
rank_user_sparse_feats = ['user_id','gender','age','occupation', 'zip']
rank_item_dense_feats = [f for f in rank_movie_features if f !='movie_id']
rank_item_sparse_feats = ['movie_id']

In [None]:
# change the improper feature name so it can be the name of a tf component
for i, k in enumerate(rank_item_dense_feats):
    if k == "Children's":
        rank_item_dense_feats[i] = 'Children'
rank_train['Children'] = rank_train.pop("Children's")
rank_test['Children'] = rank_test.pop("Children's")

In [None]:
rank_user_features = [SparseFeature(x, rank_feature_dim[x], RANK_EMBEDDING_DIM) for x in rank_user_sparse_feats] +\
                [DenseFeature(x) for x in rank_user_dense_feats] +\
                [SparseSeqFeature(SparseFeature('movie_id', rank_feature_dim['movie_id'], RANK_EMBEDDING_DIM), 'hist_movie_id',SEQ_LEN)]
rank_item_feats = [SparseFeature(x, rank_feature_dim[x], RANK_EMBEDDING_DIM) for x in rank_item_sparse_feats] +\
                [DenseFeature(x) for x in rank_item_dense_feats]

In [None]:
tf.compat.v1.disable_eager_execution()

In [None]:
rank_model = YouTubeRankDNN(
    rank_user_features, rank_item_feats, 
    dnn_hidden_units=(512,RANK_EMBEDDING_DIM), dnn_dropout=0.2
)

In [38]:
rank_model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3), loss=binary_crossentropy)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
# checkpoint = tf.keras.callbacks.ModelCheckpoint(
#     filepath='./rank_checkpoint/',
#     save_weights_only=True,
#     monitor='val_loss',
#     mode='min',
#     save_best_only=True)
history = rank_model.fit(rank_train, rank_train_label,
                    batch_size=BATCH_SIZE*16, 
                    epochs=NEPOCH//2,
                    verbose=1,
                    validation_split=0.15,
                    callbacks=[early_stop])
rank_model.save_weights('youtuberank.h5')

Train on 8787214 samples, validate on 1550685 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [39]:
rank_model.load_weights('youtuberank.h5')

In [40]:
del rank_train
gc.collect()

8

In [41]:
pred = rank_model.predict(rank_test)

In [42]:
pred_df = pd.DataFrame(columns=['user_id','movie_id','pred'])
pred_df['user_id'] = rank_test['user_id']
pred_df['movie_id'] = rank_test['movie_id']
pred_df['pred'] = pred

pred_df = pred_df.sort_values(by=['user_id','pred'], ascending=False).reset_index(drop=True)
pred_df = pred_df.groupby('user_id')['movie_id'].apply(list).reset_index()

In [43]:
test_label_df = pd.DataFrame(columns=['user_id','label'])
test_label_df['user_id'] = match_test['user_id']
test_label_df['label'] = match_test_label.tolist()

In [44]:
test_label_df = pd.merge(test_label_df, pred_df, on=['user_id'], how='left')

In [45]:
mapk(test_label_df['label'], test_label_df['movie_id'], k=10)

0.03510277383580364

In [46]:
recall_at_k(test_label_df['label'], test_label_df['movie_id'], k=10)

0.08602649006622518