In this tutorial, we build a simple matrix factorization model using the MovieLens 100K dataset with TFRS. We can use this model to recommend movies for a given user.

In [38]:
from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [3]:
import os

In [4]:
# Ratings data.
ratings = tfds.load('movielens/100k-ratings', split="train", data_dir=os.getcwd())
# Features of all the available movies.
movies = tfds.load('movielens/100k-movies', split="train", data_dir=os.getcwd())

In [5]:
# Select the basic features.
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"]
})
movies = movies.map(lambda x: x["movie_title"])

The ratings dataset is likely a tf.data.Dataset object where each element is a dictionary with keys like "user_id", "movie_id", etc.

In [6]:
ratings

<_MapDataset element_spec={'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [7]:
next(iter(ratings.take(1)))

{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'138'>}

In [8]:
movies

<_MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [9]:
next(iter(movies.take(1)))

<tf.Tensor: shape=(), dtype=string, numpy=b'You So Crazy (1994)'>

In [10]:
for movie in movies.take(5):
    print(movie)



tf.Tensor(b'You So Crazy (1994)', shape=(), dtype=string)
tf.Tensor(b'Love Is All There Is (1996)', shape=(), dtype=string)
tf.Tensor(b'Fly Away Home (1996)', shape=(), dtype=string)
tf.Tensor(b'In the Line of Duty 2 (1987)', shape=(), dtype=string)
tf.Tensor(b'Niagara, Niagara (1997)', shape=(), dtype=string)


Build vocabularies to convert user ids and movie titles into integer indices for embedding layers

In [11]:
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)  #'StringLookup' object

In [12]:
user_ids_vocabulary

<StringLookup name=string_lookup, built=False>

The ratings dataset is likely a tf.data.Dataset object where each element is a dictionary with keys like "user_id", "movie_id", etc.
The map function extracts the "user_id" field from each element in the dataset, creating a dataset of user IDs.

In [None]:
# [{"user_id": "user1", "movie_id": "movieA"}, 
#  {"user_id": "user2", "movie_id": "movieB"}]

# to

# ["user1", "user2"]


In [13]:
user_ids_vocabulary.adapt(ratings.map(lambda x: x["user_id"])) #'StringLookup' method

In [14]:
user_ids_vocabulary

<StringLookup name=string_lookup, built=False>

In [None]:
# tf.keras.layers.StringLookup(mask_token=None)
# Purpose:
# This is a Keras preprocessing layer used to map string inputs (e.g., user IDs) to integer indices.
# Parameters:
# mask_token=None: Disables masking, meaning no special "mask" token will be included in the vocabulary for missing values.
# 2. user_ids_vocabulary.adapt(...)
# What it does:
# The adapt method analyzes the input data to build a vocabulary of unique user IDs.
# After calling adapt, the layer will "learn" all the unique user IDs present in the dataset and assign a unique integer index to each of them.
# For example, if the dataset has user IDs ["user1", "user2", "user3"], the layer might assign:
# "user1" → 1
# "user2" → 2
# "user3" → 3
# By default, index 0 is reserved for "out-of-vocabulary" (OOV) values that don't appear in the dataset.


In [None]:
movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None) # which is used to convert string inputs into numerical representations.
movie_titles_vocabulary.adapt(movies)

In [16]:
print(user_ids_vocabulary.get_vocabulary())


['[UNK]', '405', '655', '13', '450', '276', '416', '537', '303', '234', '393', '181', '279', '429', '846', '7', '94', '682', '308', '92', '293', '222', '201', '59', '435', '378', '880', '417', '896', '592', '796', '758', '561', '130', '406', '551', '334', '804', '268', '474', '889', '269', '727', '399', '642', '916', '145', '650', '363', '151', '524', '749', '194', '387', '90', '648', '291', '864', '311', '747', '85', '286', '327', '653', '328', '385', '299', '497', '95', '271', '457', '18', '301', '532', '374', '805', '178', '1', '389', '870', '716', '883', '833', '472', '437', '313', '533', '881', '280', '339', '504', '184', '788', '894', '666', '314', '506', '932', '886', '798', '244', '343', '707', '606', '454', '109', '373', '354', '782', '62', '345', '790', '487', '207', '622', '892', '407', '588', '500', '774', '660', '312', '305', '711', '43', '535', '919', '854', '456', '618', '200', '102', '49', '495', '87', '6', '851', '868', '60', '256', '643', '452', '144', '843', '807', '

In [17]:
print(movie_titles_vocabulary.get_vocabulary())


['[UNK]', "Ulee's Gold (1997)", 'That Darn Cat! (1997)', 'Substance of Fire, The (1996)', 'Sliding Doors (1998)', 'Nightwatch (1997)', 'Money Talks (1997)', 'Kull the Conqueror (1997)', 'Ice Storm, The (1997)', 'Hurricane Streets (1998)', 'Hugo Pool (1997)', 'Fly Away Home (1996)', 'Desperate Measures (1998)', 'Designated Mourner, The (1997)', 'Deceiver (1997)', 'Chasing Amy (1997)', 'Chairman of the Board (1998)', 'Butcher Boy, The (1998)', 'Body Snatchers (1993)', 'Á köldum klaka (Cold Fever) (1994)', 'unknown', 'Zeus and Roxanne (1997)', "Young Poisoner's Handbook, The (1995)", 'Young Guns II (1990)', 'Young Guns (1988)', 'Young Frankenstein (1974)', 'You So Crazy (1994)', 'Year of the Horse (1997)', 'Yankee Zulu (1994)', 'Wyatt Earp (1994)', 'Wrong Trousers, The (1993)', 'World of Apu, The (Apur Sansar) (1959)', "Wooden Man's Bride, The (Wu Kui) (1994)", 'Wonderland (1997)', 'Wonderful, Horrible Life of Leni Riefenstahl, The (1993)', 'Women, The (1939)', 'Woman in Question, The (19

In [18]:
for movie in movies.take(5):
    print(movie)


tf.Tensor(b'You So Crazy (1994)', shape=(), dtype=string)
tf.Tensor(b'Love Is All There Is (1996)', shape=(), dtype=string)
tf.Tensor(b'Fly Away Home (1996)', shape=(), dtype=string)
tf.Tensor(b'In the Line of Duty 2 (1987)', shape=(), dtype=string)
tf.Tensor(b'Niagara, Niagara (1997)', shape=(), dtype=string)


Define a model
We can define a TFRS model by inheriting from tfrs.Model and implementing the compute_loss method:

User Embeddings:

self.user_model(features["user_id"])
The function takes the user_id tensor from the features dictionary and passes it through self.user_model.
self.user_model is likely an embedding layer or neural network that converts user IDs into dense vectors (embeddings).

In [19]:
class MovieLensModel(tfrs.Model):
  # We derive from a custom base class to help reduce boilerplate. Under the hood,
  # these are still plain Keras Models.

  def __init__(
      self,
      user_model: tf.keras.Model,
      movie_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()

    # Set up user and movie representations.
    self.user_model = user_model
    self.movie_model = movie_model

    # Set up a retrieval task.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # Define how the loss is computed.

    user_embeddings = self.user_model(features["user_id"])
    movie_embeddings = self.movie_model(features["movie_title"])

    return self.task(user_embeddings, movie_embeddings)

Keys like "user_id" and "movie_title" point to tensors representing user and movie identifiers, respectively.

self.user_model(features["user_id"])
The function takes the user_id tensor from the features dictionary and passes it through self.user_model.
self.user_model is likely an embedding layer or neural network that converts user IDs into dense vectors (embeddings).

Define the two models and the retrieval task.

In [28]:
user_ids_vocabulary.vocabulary_size()

944

In [29]:
movie_titles_vocabulary.vocabulary_size()

1665

In [31]:
# Define user and movie models.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(input_dim=user_ids_vocabulary.vocabulary_size(), output_dim=64)
])
movie_model = tf.keras.Sequential([
    movie_titles_vocabulary,
    tf.keras.layers.Embedding(input_dim=movie_titles_vocabulary.vocabulary_size(), output_dim=64)
])



In [32]:
user_model.summary()

tfrs.tasks.Retrieval:
This class is used to define a retrieval task in a recommendation system.
The goal of a retrieval task is to select a small set of relevant candidates (e.g., movies) from a large pool based on some relevance measure.
It handles:
Computing the loss (e.g., contrastive loss for user-movie embeddings).
Tracking metrics to evaluate model performance (e.g., top-k accuracy).

metrics=tfrs.metrics.FactorizedTopK(...):
Specifies the evaluation metrics for the retrieval task.
FactorizedTopK computes ranking metrics such as recall and precision based on the top-k most relevant items predicted for a user.

movies.batch(128).map(movie_model):
Purpose: Defines the candidate set (all potential movies) for the top-k evaluation.
Process:
movies: Likely a tf.data.Dataset containing all the movies in the dataset.
.batch(128): Groups the movies into batches of 128 for efficient processing.
.map(movie_model): Converts raw movie data into embeddings using the movie_model.
The result is a batched and embedded representation of all movies, which will be used to compare against user embeddings.


In [33]:
movie_embeddings =movies.batch(128).map(movie_model)
  

In [34]:
movie_embeddings

<_MapDataset element_spec=TensorSpec(shape=(None, 64), dtype=tf.float32, name=None)>

In [35]:
next(iter(movie_embeddings.take(1)))

<tf.Tensor: shape=(128, 64), dtype=float32, numpy=
array([[ 0.02136875,  0.02848491, -0.00921426, ...,  0.02290625,
         0.04572007, -0.01772724],
       [-0.02265123, -0.04321151, -0.03735344, ..., -0.00237501,
        -0.01710071,  0.0100952 ],
       [-0.02893539,  0.03342885,  0.00305834, ..., -0.00753188,
        -0.03864139,  0.0211953 ],
       ...,
       [ 0.04203833, -0.00471202, -0.03317744, ..., -0.04513462,
         0.00371362,  0.04884627],
       [-0.01309742,  0.02871958, -0.01162573, ..., -0.03960372,
         0.03276548,  0.02968359],
       [ 0.04075832, -0.00505923,  0.037194  , ..., -0.04433644,
         0.00563056,  0.03548136]], dtype=float32)>

In [36]:
for movie in movie_embeddings:
    print(movie)
    print()

tf.Tensor(
[[ 0.02136875  0.02848491 -0.00921426 ...  0.02290625  0.04572007
  -0.01772724]
 [-0.02265123 -0.04321151 -0.03735344 ... -0.00237501 -0.01710071
   0.0100952 ]
 [-0.02893539  0.03342885  0.00305834 ... -0.00753188 -0.03864139
   0.0211953 ]
 ...
 [ 0.04203833 -0.00471202 -0.03317744 ... -0.04513462  0.00371362
   0.04884627]
 [-0.01309742  0.02871958 -0.01162573 ... -0.03960372  0.03276548
   0.02968359]
 [ 0.04075832 -0.00505923  0.037194   ... -0.04433644  0.00563056
   0.03548136]], shape=(128, 64), dtype=float32)

tf.Tensor(
[[-4.2969324e-02  2.1216456e-02  3.7999678e-02 ...  4.0883150e-02
  -2.6352717e-02 -3.3959676e-02]
 [-3.7241112e-02  4.3316435e-02  1.5042249e-02 ...  8.4383599e-03
  -3.3847034e-02 -4.1530598e-02]
 [-4.7661450e-02 -9.0553761e-03 -4.8649110e-02 ... -5.5896752e-03
   3.7194382e-02  2.5283445e-02]
 ...
 [ 1.4398802e-02 -1.6190074e-02  9.3578175e-04 ...  2.9264558e-02
  -1.6621254e-02  1.3226535e-02]
 [ 3.5157751e-02 -1.6982187e-02  3.5500590e-02 ... 

In [39]:
# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    candidates=movie_embeddings
))

ValueError: Cannot convert '('c', 'o', 'u', 'n', 't', 'e', 'r')' to a shape. Found invalid entry 'c' of type '<class 'str'>'. 

Fit and evaluate it.
Create the model, train it, and generate predictions:

In [None]:
# Create a retrieval model.
model = MovieLensModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for 3 epochs.
model.fit(ratings.batch(4096), epochs=3)

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    movies.batch(100).map(lambda title: (title, model.movie_model(title))))

# Get some recommendations.
_, titles = index(np.array(["42"]))
print(f"Top 3 recommendations for user 42: {titles[0, :3]}")