[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Wp-Zhang/HandyRec/blob/master/examples/DeepFM.ipynb)

> This notebook runs DeepFM on MovieLens1M dataset. We'll use YouTubeDNN for generating candidates and DeepFM for ranking these candidates.

> Only movies with ratings larger than 3 are treated as 'positive' samples for each user. Every last 10 'positive' movies of each user are held out for testing.

## Table of Contents:
* [Prepare data for matching](#section-0)
* [Train match model and export embeddings](#section-1)
* [Use Faiss to generate candidates](#section-2)
* [Train rank model and predict](#section-3)

**Download dataset and install packages**

In [1]:
! git clone https://github.com/Wp-Zhang/HandyRec.git
! pip install faiss-cpu

fatal: destination path 'HandyRec' already exists and is not an empty directory.


In [2]:
! wget https://files.grouplens.org/datasets/movielens/ml-1m.zip -O ./ml-1m.zip
! unzip -o ml-1m.zip

--2022-03-20 01:19:26--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘./ml-1m.zip’


2022-03-20 01:19:28 (4.17 MB/s) - ‘./ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


**Import relative packages**

In [3]:
import sys
sys.path.append('./HandyRec/')

In [4]:
from handyrec.dataset.movielens import MovieMatchDataHelper, MovieRankDataHelper
from handyrec.models.match import YouTubeMatchDNN
from handyrec.models.rank import DeepFM
from handyrec.features import DenseFeature, SparseFeature, SparseSeqFeature
from handyrec.layers.utils import sampledsoftmaxloss
from handyrec.dataset.metrics import map_at_k, recall_at_k
from handyrec.models.utils import search_embedding

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.losses import binary_crossentropy
import numpy as np
import pandas as pd
import gc

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
MATCH_EMBEDDING_DIM = 256
RANK_EMBEDDING_DIM = 256
SEQ_LEN = 40
BATCH_SIZE = 2**12
NEPOCH = 50

NEG_NUM = 10
CANDIDATE_NUM = 100

In [7]:
%load_ext tensorboard

# 0. Prepare data for ranking<a name="section-0"></a>

In [8]:
match_dh = MovieMatchDataHelper('./ml-1m/')
match_user_features = ['user_id','gender','age','occupation', 'zip']
match_movie_features = ['movie_id']

data = match_dh.get_clean_data(sparse_features=['gender','age','occupation', 'zip'])
match_dh.gen_dataset(match_user_features+match_movie_features, data, seq_max_len=SEQ_LEN)

Encode User Sparse Feats: 100%|██████████| 4/4 [00:00<00:00, 187.11it/s]
Encode Item Sparse Feats: 0it [00:00, ?it/s]


In [9]:
match_train, match_train_label, match_test, match_test_label = match_dh.load_dataset(match_user_features, match_movie_features)

Load user Features: 100%|██████████| 7/7 [00:00<00:00, 98.87it/s]
Load movie Features: 100%|██████████| 1/1 [00:00<00:00, 212.96it/s]


In [10]:
match_feature_dim = match_dh.get_feature_dim(data, match_user_features, match_movie_features, [])

# 1. Train match model and export embeddings <a name="section-1"></a>

In [11]:
# * add example_age^2 as showed in the original paper
match_train['example_age_2'] = match_train['example_age']**2
match_test['example_age_2'] = match_test['example_age']**2

In [12]:
match_user_dense_feats = ['example_age','example_age_2']
match_user_sparse_feats = ['user_id','gender','age','occupation', 'zip']

In [13]:
match_user_features = [SparseFeature(x, match_feature_dim[x], MATCH_EMBEDDING_DIM) for x in match_user_sparse_feats] +\
                [DenseFeature(x) for x in match_user_dense_feats] +\
                [SparseSeqFeature(SparseFeature('movie_id', match_feature_dim['movie_id'], MATCH_EMBEDDING_DIM), 'hist_movie_id',SEQ_LEN)]
match_item_id = SparseFeature('movie_id', match_feature_dim['movie_id'], MATCH_EMBEDDING_DIM)

In [14]:
match_model = YouTubeMatchDNN(
    match_user_features, match_item_id, num_sampled=100, 
    user_dnn_hidden_units=(1024,512,MATCH_EMBEDDING_DIM), dnn_dropout=0.2
)

In [15]:
match_model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-3), loss=sampledsoftmaxloss)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='./match_checkpoint/',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)
history = match_model.fit(match_train, match_train_label,
                            batch_size=BATCH_SIZE, 
                            epochs=NEPOCH,
                            verbose=1,
                            validation_split=0.1,
                            callbacks=[early_stop,checkpoint])

In [16]:
match_model.load_weights('./match_checkpoint/')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7feed9114c90>

In [17]:
all_item_model_input = {"movie_id": data['item']['movie_id'].values}

user_embedding_model = Model(inputs=match_model.user_input, outputs=match_model.user_embedding)
item_embedding_model = Model(inputs=match_model.item_input, outputs=match_model.item_embedding)

user_embs = user_embedding_model.predict(match_test, batch_size=2 ** 15)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 15)

print(user_embs.shape)
print(item_embs.shape)

(6040, 256)
(3883, 256)


# 2. Use Faiss to generate candidates <a name="section-2"></a>

## Test match model

In [18]:
candidates = search_embedding(
    MATCH_EMBEDDING_DIM, 
    item_embs, 
    user_embs,
    data['item']['movie_id'].values,
    CANDIDATE_NUM)

In [19]:
map_at_k(match_test_label, candidates, k=10)

0.0354114041311889

In [20]:
recall_at_k(match_test_label, candidates, k=10)

0.08846026490066225

In [21]:
recall_at_k(match_test_label, candidates, k=100)

0.4773344370860928

## Prepare data for ranking

In [22]:
test_user_embs = user_embedding_model.predict(match_test, batch_size=2 ** 15)
test_candidates = search_embedding(
    MATCH_EMBEDDING_DIM, 
    item_embs, 
    test_user_embs,
    data['item']['movie_id'].values,
    CANDIDATE_NUM)

test_candidates = {
    match_test['user_id'][i] : test_candidates[i]
    for i in range(test_candidates.shape[0])
}

In [23]:
del user_embs, item_embs, match_train, match_train_label, test_user_embs
gc.collect()

2159

In [24]:
rank_dh = MovieRankDataHelper('./ml-1m/')
rank_user_features = ['user_id','gender','age','occupation', 'zip']
rank_movie_features = [f for f in data['item'].columns if f != 'title']

rank_dh.gen_dataset(rank_user_features+rank_movie_features, data, test_candidates, seq_max_len=SEQ_LEN, negnum=NEG_NUM)

In [25]:
rank_train, rank_train_label, rank_test = rank_dh.load_dataset(rank_user_features, rank_movie_features)

Load user Features: 100%|██████████| 8/8 [00:01<00:00,  6.11it/s]
Load movie Features: 100%|██████████| 19/19 [00:00<00:00, 32.50it/s]


In [26]:
# * change the improper feature name so it can be the name of a tf component
for i, k in enumerate(rank_movie_features):
    if k == "Children's":
        rank_movie_features[i] = 'Children'
rank_train['Children'] = rank_train.pop("Children's")
rank_test['Children'] = rank_test.pop("Children's")
data['item']['Children'] = data['item'].pop("Children's")

In [27]:
rank_train['example_age_2'] = rank_train['example_age']**2
rank_test['example_age_2'] = rank_test['example_age']**2
rank_train['time_gap_2'] = rank_train['time_gap']**2
rank_train['time_gap_square'] = np.sqrt(rank_train['time_gap'])
rank_test['time_gap_2'] = rank_test['time_gap']**2
rank_test['time_gap_square'] = np.sqrt(rank_test['time_gap'])

In [28]:
rank_feature_dim = rank_dh.get_feature_dim(data, rank_user_features, rank_movie_features, [])

# 3. Train rank model and predict <a name="section-3"></a>

In [29]:
rank_user_dense_feats = ['example_age','example_age_2'] # 'time_gap','time_gap_2','time_gap_square',
rank_user_sparse_feats = ['user_id','gender','age','occupation', 'zip']
rank_item_dense_feats = [f for f in rank_movie_features if f != 'movie_id']
rank_item_sparse_feats = ['movie_id']

In [30]:
rank_dense_feats = rank_user_dense_feats + rank_item_sparse_feats
rank_sparse_feats = rank_user_sparse_feats + rank_item_sparse_feats

In [31]:
rank_fm_features = [SparseFeature(x, rank_feature_dim[x], RANK_EMBEDDING_DIM) for x in rank_sparse_feats] +\
                   [SparseSeqFeature(SparseFeature('movie_id', rank_feature_dim['movie_id'], RANK_EMBEDDING_DIM), 'hist_movie_id', SEQ_LEN)]
rank_dnn_feats = [DenseFeature(x) for x in rank_dense_feats] + rank_fm_features

In [32]:
rank_model = DeepFM(
    rank_fm_features, rank_dnn_feats, 
    dnn_hidden_units=(1024,512,256,1), dnn_dropout=0.2, l2_dnn=0.2, dnn_bn=True
)

In [None]:
# ! mkdir ./logs
# %tensorboard --logdir './logs'

In [34]:
rank_model.compile(optimizer=tf.keras.optimizers.Adam(lr=3e-4), loss=binary_crossentropy)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='./rank_checkpoint/',
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True)
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs/', histogram_freq=1)
history = rank_model.fit(rank_train, rank_train_label,
                    batch_size=BATCH_SIZE*8, 
                    epochs=NEPOCH,
                    verbose=1,
                    validation_split=0.,
                    callbacks=[early_stop,checkpoint])#tensorboard_callback
rank_model.save_weights('youtuberank.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [35]:
rank_model.load_weights('youtuberank.h5')

In [36]:
del rank_train
gc.collect()

3188

In [37]:
pred = rank_model.predict(rank_test, batch_size=BATCH_SIZE*8)

In [38]:
pred_df = pd.DataFrame(columns=['user_id','movie_id','pred'])
pred_df['user_id'] = rank_test['user_id']
pred_df['movie_id'] = rank_test['movie_id']
pred_df['pred'] = pred

pred_df = pred_df.sort_values(by=['user_id','pred'], ascending=False).reset_index(drop=True)
pred_df = pred_df.groupby('user_id')['movie_id'].apply(list).reset_index()

In [39]:
test_label_df = pd.DataFrame(columns=['user_id','label'])
test_label_df['user_id'] = match_test['user_id']
test_label_df['label'] = match_test_label.tolist()

In [40]:
test_label_df = pd.merge(test_label_df, pred_df, on=['user_id'], how='left')

In [41]:
map_at_k(test_label_df['label'], test_label_df['movie_id'], k=10)

0.019221453537264795

In [42]:
recall_at_k(test_label_df['label'], test_label_df['movie_id'], k=10)

0.05612582781456953

In [43]:
recall_at_k(test_label_df['label'], test_label_df['movie_id'], k=100)

0.4773344370860928