In [60]:
import pandas as pd
import numpy as np
from src import configuration as config
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

In [61]:
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "user_rating": x["user_rating"],
})
movies = movies.map(lambda x: x["movie_title"])

unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(
    lambda x: x["user_id"]))))

In [62]:
print(unique_movie_titles)
print(movies)

[b"'Til There Was You (1997)" b'1-900 (1994)' b'101 Dalmatians (1996)' ...
 b'Zeus and Roxanne (1997)' b'unknown'
 b'\xc3\x81 k\xc3\xb6ldum klaka (Cold Fever) (1994)']
<_MapDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>


In [63]:
tf.random.set_seed(42)

# Split between train and tests sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
    train,
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42
)
test = tfrs.examples.movielens.sample_listwise(
    test,
    num_list_per_user=1,
    num_examples_per_list=5,
    seed=42
)

In [64]:
class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids),
      tf.keras.layers.Embedding(len(unique_user_ids) + 2, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 2, embedding_dimension)
    ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    print(features)
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    user_embeddings = self.user_embeddings(features["user_id"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    list_length = features["movie_title"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    labels = features.pop("user_rating")
    print(f"Labels: \n {labels}")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [65]:
epochs = 1

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [66]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [67]:
cached_train

<CacheDataset element_spec={'user_id': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(None, 5), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(None, 5), dtype=tf.float32, name=None)}>

In [70]:
for row in cached_train.take(2):
  print(row)

{'user_id': <tf.Tensor: shape=(8192,), dtype=string, numpy=array([b'60', b'406', b'116', ..., b'821', b'214', b'139'], dtype=object)>, 'movie_title': <tf.Tensor: shape=(8192, 5), dtype=string, numpy=
array([[b'Treasure of the Sierra Madre, The (1948)',
        b'Return of the Pink Panther, The (1974)',
        b'Terminator, The (1984)', b'Mary Poppins (1964)',
        b'Monty Python and the Holy Grail (1974)'],
       [b'Angels and Insects (1995)', b'Sting, The (1973)',
        b'Clockers (1995)', b'Terminator 2: Judgment Day (1991)',
        b'Backbeat (1993)'],
       [b'It Happened One Night (1934)',
        b'Lost World: Jurassic Park, The (1997)', b'Crash (1996)',
        b'Secrets & Lies (1996)', b'Hercules (1997)'],
       ...,
       [b'Apocalypse Now (1979)', b'Independence Day (ID4) (1996)',
        b'Family Thing, A (1996)', b'Eraser (1996)',
        b'That Thing You Do! (1996)'],
       [b'Secrets & Lies (1996)', b'Lost Highway (1997)',
        b'Pulp Fiction (1994)', b'Wel

In [69]:
listwise_model.fit(cached_train, epochs=epochs, verbose=True)

Labels: 
 Tensor("IteratorGetNext:2", shape=(None, 5), dtype=float32)
{'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, 'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None, 5) dtype=string>}
Labels: 
 Tensor("IteratorGetNext:2", shape=(None, 5), dtype=float32)
{'user_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=string>, 'movie_title': <tf.Tensor 'IteratorGetNext:0' shape=(None, 5) dtype=string>}


<keras.src.callbacks.History at 0x26dec32b9d0>