In [1]:
# Python built-ins
import sys

# FlexRec
import flexrec
from flexrec.features import QueryFeature
from flexrec.models import QueryModel, QueryReduction, RetrievalModel, RankingModel
from flexrec.trainers import MultitaskTrainer

# Vectorization
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Report the interpreter and TensorFlow versions in use
# (this notebook was developed with Python 3.6.10 and TensorFlow 2.6.0)
print(f'Python {sys.version}')
print(f'TensorFlow {tf.__version__}')

# Report the processing units TensorFlow can see
# Troubleshoot 1: if no GPU is detected, see https://www.tensorflow.org/install/gpu
# Troubleshoot 2: if libcusolver.so.11 is not found, use ln -s to create an alias from libcusolver.so.10
print(f'Physical devices: {tf.config.list_physical_devices()}')

Python 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 21:14:29) 
[GCC 7.3.0]
TensorFlow 2.6.0
Physical devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [2]:
# Load the Movielens 100k datasets (online)
# Only the "train" split is available for Movielens, so both loads request it.

# One record per rating event
ratings = tfds.load('movielens/100k-ratings', split="train")

# One record per available movie
movies = tfds.load('movielens/100k-movies', split="train")

# Print the first record of each dataset to inspect the available keys
for dataset in (ratings, movies):
  for record in dataset.take(1).as_numpy_iterator():
    print(record)

{'bucketized_user_age': 45.0, 'movie_genres': array([7]), 'movie_id': b'357', 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)", 'raw_user_age': 46.0, 'timestamp': 879024327, 'user_gender': True, 'user_id': b'138', 'user_occupation_label': 4, 'user_occupation_text': b'doctor', 'user_rating': 4.0, 'user_zip_code': b'53211'}
{'movie_genres': array([4]), 'movie_id': b'1681', 'movie_title': b'You So Crazy (1994)'}


In [3]:
# Data preprocessing

# Filter unnecessary features & convert types.
# The conversions use tf.cast instead of the Python builtins int()/float(): inside
# Dataset.map the values are symbolic tensors, so the builtins only work when AutoGraph
# manages to rewrite the lambda. tf.int32 / tf.float32 are the dtypes AutoGraph's
# int()/float() conversion produces, so the resulting features are unchanged.
# NOTE: float32 cannot represent epoch-second timestamps of this magnitude exactly
# (e.g. 879024327); the precision is kept as in the original pipeline.
# NOTE: this cell rebinds `ratings`/`movies`; re-running it without re-running the
# loading cell fails because the "user_rating" key has already been renamed to "label".
ratings = ratings.map(lambda x: {
            "bucketized_user_age": tf.cast(x["bucketized_user_age"], tf.int32),
            "movie_id": x["movie_id"],
            "movie_title": x["movie_title"],
            "timestamp": tf.cast(x["timestamp"], tf.float32),
            "user_gender": tf.cast(x["user_gender"], tf.int32),
            "user_id": x["user_id"],
            "user_occupation_label": x["user_occupation_label"],
            "label": x["user_rating"],  # the rating is exposed under the key "label"
            "user_zip_code": x["user_zip_code"],
          })
movies = movies.map(lambda x: {
           "movie_id": x["movie_id"],
           "movie_title": x["movie_title"],
         })

# Shuffle once with a fixed order (reshuffle_each_iteration=False) so that the
# take/skip split below always selects the same records
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# Split to train/test sets (in real applications, this must be done by dividing on time T)
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Cache dataset for efficiency.
# cache() replays exactly the elements it saw on the first pass, so the training set is
# cached BEFORE shuffling/batching; caching after shuffle() would freeze the first
# epoch's batches and every later epoch would see identical batches in identical order.
cached_train = train.cache().shuffle(100_000).batch(8192)
# The test set is never shuffled, so caching the finished batches is fine.
cached_test = test.batch(4096).cache()

In [4]:
# Describe every query feature so that its embedding table can be built

def _column(dataset, name):
  """Return a dataset yielding only the `name` entry of each record in `dataset`."""
  return dataset.map(lambda record: record[name])

# User-side features, all drawn from the ratings dataset.
# 'timestamp' is registered twice: once as CDI_TYPE and once as CNO_TYPE.
user_features = [
  QueryFeature('user_id', _column(ratings, 'user_id'), QueryFeature.STR_TYPE),
  QueryFeature('user_zip_code', _column(ratings, 'user_zip_code'), QueryFeature.STR_TYPE),
  QueryFeature('bucketized_user_age', _column(ratings, 'bucketized_user_age'), QueryFeature.INT_TYPE),
  QueryFeature('user_gender', _column(ratings, 'user_gender'), QueryFeature.INT_TYPE),
  QueryFeature('user_occupation_label', _column(ratings, 'user_occupation_label'), QueryFeature.INT_TYPE),
  QueryFeature('timestamp', _column(ratings, 'timestamp'), QueryFeature.CDI_TYPE),
  QueryFeature('timestamp', _column(ratings, 'timestamp'), QueryFeature.CNO_TYPE),
]

# Movie-side features, drawn from the movies dataset
movie_features = [
  QueryFeature('movie_id', _column(movies, 'movie_id'), QueryFeature.STR_TYPE),
  QueryFeature('movie_title', _column(movies, 'movie_title'), QueryFeature.TXT_TYPE),
]

In [5]:
# Build the user and movie query towers with a shared embedding size
embedding_dim = 32

# The user tower only uses entries 0 ('user_id') and 3 ('user_gender') of user_features
user_model = QueryModel(
  features=[user_features[i] for i in (0, 3)],
  embedding_dim=embedding_dim,
)
# The movie tower uses every movie feature
movie_model = QueryModel(
  features=movie_features,
  embedding_dim=embedding_dim,
)

# Sanity check: push one single-record batch through both towers and show the output shapes
sample_batch = next(iter(ratings.batch(1).take(1)))
print(user_model(sample_batch).shape, movie_model(sample_batch).shape)

(1, 2, 32) (1, 2, 32)


In [6]:
# Build the retrieval model on top of the two query towers and train it

# Model hyperparameters (forwarded unchanged to RetrievalModel)
layer_sizes = None
normalization = 0.9
regularization = 1e-7
gravitation = 1e-8

# Optimisation hyperparameters
learning_rate = 0.2
num_epochs = 50

# `movies` is passed alongside the towers as the third positional argument
retrieval_model = RetrievalModel(
  user_model,
  movie_model,
  movies,
  layer_sizes=layer_sizes,
  normalization=normalization,
  regularization=regularization,
  gravitation=gravitation,
)

retrieval_optimizer = tf.keras.optimizers.Adagrad(learning_rate)
retrieval_model.compile(optimizer=retrieval_optimizer)

# validation_freq=10: validation on cached_test is requested every 10th epoch
retrieval_model.fit(
  cached_train,
  epochs=num_epochs,
  validation_data=cached_test,
  validation_freq=10,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50


Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f2034299320>

In [7]:
# Build the ranking model on top of the same two query towers and train it

# Model hyperparameters (forwarded unchanged to RankingModel)
layer_sizes = [192, 192, 192]
use_interaction = True
projection_dim = None
regularization = 1e-1
label_name = 'label'  # key under which the preprocessing cell stores the user rating

# Optimisation hyperparameters
learning_rate = 0.05
num_epochs = 50

# Show the tower output shapes for one single-record batch before building the ranker
sample_batch = next(iter(ratings.batch(1).take(1)))
print(user_model(sample_batch).shape, movie_model(sample_batch).shape)

# The ranker is given the very same user_model / movie_model objects as the retrieval model
ranking_model = RankingModel(
  user_model,
  movie_model,
  layer_sizes=layer_sizes,
  use_interaction=use_interaction,
  projection_dim=projection_dim,
  regularization=regularization,
  label_name=label_name,
)

ranking_optimizer = tf.keras.optimizers.Adam(learning_rate)
ranking_model.compile(optimizer=ranking_optimizer)

# validation_freq=10: validation on cached_test is requested every 10th epoch
ranking_model.fit(
  cached_train,
  epochs=num_epochs,
  validation_data=cached_test,
  validation_freq=10,
)

(1, 2, 32) (1, 2, 32)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f202de12710>

In [8]:
# Train the retrieval and ranking models jointly with a multitask trainer

# Optimisation hyperparameters
learning_rate = 0.05
num_epochs = 50

# Per-task weights handed to the trainer
retrieval_weight = 1.0
ranking_weight = 0.5

# MultitaskTrainer takes lists of models and lists of weights; here each list has one entry
multitask_trainer = MultitaskTrainer(
  [retrieval_model],
  [ranking_model],
  [retrieval_weight],
  [ranking_weight],
)

multitask_optimizer = tf.keras.optimizers.Adam(learning_rate)
multitask_trainer.compile(optimizer=multitask_optimizer)

# validation_freq=10: validation on cached_test is requested every 10th epoch
multitask_trainer.fit(
  cached_train,
  epochs=num_epochs,
  validation_data=cached_test,
  validation_freq=10,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f202c2ee3c8>