In [None]:
!pip3 install -q tensorflow-recommenders==0.7.0
!pip3 install tensorflow==2.11.0

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

from tabnanny import verbose
from typing import Dict, Text
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

class MovieLensModel(tfrs.Model):
  """Two-tower retrieval model: a user tower, a movie tower and a task.

  Subclassing ``tfrs.Model`` means only ``compute_loss`` has to be written
  here; the resulting object is still used as a regular Keras model.
  """

  def __init__(
      self,
      user_model: tf.keras.Model,
      movie_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval,
      get_batch_users,
      get_batch_movies):
    """Store the towers, the task and the two batch accessors.

    Args:
      user_model: Model called on the user part of a batch.
      movie_model: Model called on the movie part of a batch.
      task: Callable applied to (user embeddings, movie embeddings) whose
        result is returned as the loss.
      get_batch_users: Callable that extracts the user part from a batch.
      get_batch_movies: Callable that extracts the movie part from a batch.
    """
    super().__init__()

    # The two towers that turn raw features into embeddings.
    self.user_model = user_model
    self.movie_model = movie_model

    # The retrieval objective applied to both towers' outputs.
    self.task = task

    # Accessors that pull each tower's input out of a feature batch.
    self.get_user = get_batch_users
    self.get_movie = get_batch_movies

  def compute_loss(self, features, training=False) -> tf.Tensor:
    """Return the task's output for one batch of features.

    ``training`` is accepted for interface compatibility but is not used.
    """
    user_batch = self.get_user(features)
    query_embeddings = self.user_model(user_batch)

    movie_batch = self.get_movie(features)
    candidate_embeddings = self.movie_model(movie_batch)

    return self.task(query_embeddings, candidate_embeddings)

class TFRecModel:
    """Convenience wrapper around a two-tower TensorFlow Recommenders model.

    Builds a user tower and a movie tower (each a lookup layer followed by an
    embedding), wires them into a ``MovieLensModel`` with a
    ``tfrs.tasks.Retrieval`` task, and offers helpers to train the model and
    to print recall at several cutoffs.
    """

    def __init__(
        self,
        user_ids_vocabulary,
        movie_ids_vocabulary,
        movie_ids_dataset,
        get_batch_users,
        get_batch_movies,
        user_embedding_dim=64,
        movie_embedding_dim=64,
    ):
        """Build both towers, the retrieval task and the compiled model.

        Args:
            user_ids_vocabulary: Lookup layer exposing ``vocabulary_size()``;
                used as the first layer of the user tower.
            movie_ids_vocabulary: Lookup layer exposing ``vocabulary_size()``;
                used (after a string input layer) in the movie tower.
            movie_ids_dataset: Dataset of movie ids supporting
                ``.batch(...).map(...)``. It is pushed through the movie tower
                to obtain the candidates for the top-K metric and, later, for
                the index built in ``evaluate``.
            get_batch_users: Callable extracting the user ids from a batch.
            get_batch_movies: Callable extracting the movie ids from a batch.
            user_embedding_dim: Output size of the user embedding.
            movie_embedding_dim: Output size of the movie embedding.
        """
        self._movie_ids_dataset = movie_ids_dataset

        self._user_model = tf.keras.Sequential([
            user_ids_vocabulary,
            tf.keras.layers.Embedding(
                user_ids_vocabulary.vocabulary_size(),
                user_embedding_dim)
        ])

        self._movie_model = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=[], dtype=tf.string),
            movie_ids_vocabulary,
            tf.keras.layers.Embedding(
                movie_ids_vocabulary.vocabulary_size(),
                movie_embedding_dim)
        ])

        # The top-K metric is computed against the embeddings of every movie
        # in the candidate dataset.
        self._task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
            self._movie_ids_dataset.batch(128).map(self._movie_model)
          )
        )

        self._model = MovieLensModel(
            user_model=self._user_model,
            movie_model=self._movie_model,
            task=self._task,
            get_batch_users=get_batch_users,
            get_batch_movies=get_batch_movies)

        self._model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

    def train(
        self,
        batched_dataset,
        epochs,
    ):
        """Fit the wrapped model on ``batched_dataset`` for ``epochs`` epochs.

        Args:
            batched_dataset: Already-batched training dataset passed to
                ``fit`` unchanged.
            epochs: Number of passes over ``batched_dataset``.
        """
        # Keras documents `verbose` as 'auto', 0, 1 or 2. The string '1' is
        # none of those, so pass the integer to request per-epoch progress.
        self._model.fit(batched_dataset, epochs=epochs, verbose=1)

    def evaluate(
        self,
        user_ids,
        actually_watched,
        precision_at,
    ):
        """Print recall at each cutoff in ``precision_at``.

        Args:
            user_ids: Users to query, passed to the retrieval index as-is.
            actually_watched: 2-D array/tensor of ground-truth movie ids with
                one row per entry of ``user_ids``.
            precision_at: Non-empty sequence of cutoffs ``k``. Despite the
                argument name, the value printed for each ``k`` is recall.

        Returns:
            None. One line ``Recall at <k> = <value>`` is printed per cutoff.
        """
        # Retrieve once with the largest cutoff; smaller cutoffs are prefixes
        # of that result.
        index = tfrs.layers.factorized_top_k.BruteForce(self._user_model, k=max(precision_at))
        index.index_from_dataset(
            self._movie_ids_dataset.batch(100).map(lambda movie_id: (movie_id, self._movie_model(movie_id)))
        )

        # The first returned element is discarded; the second is used as the
        # recommended movie ids.
        _, recommendations = index(user_ids)

        for k in precision_at:
            rowwise_match_counts = self.get_rowwise_match_counts(recommendations[:, :k], actually_watched)
            print("Recall at", k, "=", self.recall_at_k(rowwise_match_counts, actually_watched))

    def get_rowwise_match_counts(self, recommendations, actually_watched):
        """Count, row by row, the ids shared by recommendations and ground truth.

        Args:
            recommendations: 2-D array-like of recommended ids, one row per
                user.
            actually_watched: 2-D array-like of ground-truth ids with the same
                number of rows.

        Returns:
            1-D integer array with one entry per row: the number of distinct
            ids occurring both in that row of ``recommendations`` and in the
            same row of ``actually_watched``.

        Raises:
            ValueError: If the two inputs have a different number of rows.
        """
        rec_rows = np.asarray(recommendations)
        actual_rows = np.asarray(actually_watched)
        if rec_rows.shape[0] != actual_rows.shape[0]:
            raise ValueError(
                "recommendations and actually_watched must have the same "
                "number of rows, got %d and %d"
                % (rec_rows.shape[0], actual_rows.shape[0])
            )

        # np.intersect1d de-duplicates, so an id repeated within a row is
        # counted once.
        return np.array(
            [
                len(np.intersect1d(recs, actuals))
                for recs, actuals in zip(rec_rows, actual_rows)
            ],
            dtype=int,
        )

    def precision_at_k(self, rowwise_match_counts, k):
        """Return total matches divided by ``n_rows * k``."""
        n_rows = rowwise_match_counts.shape[0]
        return np.sum(rowwise_match_counts) / (n_rows * k)

    def recall_at_k(self, rowwise_match_counts, actually_watched):
        """Return total matches divided by the number of ground-truth entries.

        The denominator is ``n_rows * n_cols`` of ``actually_watched``.
        """
        [n_rows, n_cols] = actually_watched.shape
        return np.sum(rowwise_match_counts) / (n_rows * n_cols)

    def categorical_accuracy(self, rowwise_match_counts):
        """Return the fraction of rows that have at least one match."""
        return np.sum(rowwise_match_counts > 0) / rowwise_match_counts.shape[0]

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from thirdai.demos import download_movielens

# Tunable settings for this run.
SEED = 42
TRAIN_BATCH_SIZE = 2048
NUM_EPOCHS = 3
INTERACTION_COLUMNS = ["userId", "movieTitle"]

# Seed Python, NumPy and TensorFlow so that the shuffle and the embedding
# initialisation are repeatable across "Restart & Run All".
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.keras.utils.set_random_seed(SEED)

train_filename, test_filename, inference_batch, index_batch = download_movielens()

# Short aliases for the downloaded file paths.
train_file = train_filename
test_file = test_filename

# Shuffle the training interactions once, with a fixed seed; the tf.data
# pipeline built below only batches and does not shuffle again.
train_df = pd.read_csv(train_file)[INTERACTION_COLUMNS].sample(frac=1, random_state=SEED)
train_users = train_df["userId"]
train_movies = train_df["movieTitle"]
train_tfds = tf.data.Dataset.from_tensor_slices((train_users, train_movies))

test_df = pd.read_csv(test_file)[INTERACTION_COLUMNS]
test_user_ids = np.array(test_df["userId"])
# Double brackets keep this 2-D (one column): the evaluation code reads
# `actually_watched.shape[1]`.
test_actually_watched = np.array(test_df[["movieTitle"]])

# Vocabularies cover the union of train and test ids, so ids that occur only
# in the test file are still part of the vocabulary.
all_users = pd.concat([train_users, test_df["userId"]]).unique()
all_movies = pd.concat([train_movies, test_df["movieTitle"]]).unique()
# Wrapped in a 1-tuple: each dataset element is a single-component tuple.
movies_tfds = tf.data.Dataset.from_tensor_slices((all_movies, ))

print("Adapting vocabularies...")
user_ids_vocabulary = tf.keras.layers.IntegerLookup(mask_token=None)
user_ids_vocabulary.adapt(all_users)
movie_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_ids_vocabulary.adapt(all_movies)
print("Finised adapting vocabularies...")

# Each training batch is a (users, movies) pair, hence the positional
# accessors.
model = TFRecModel(
    user_ids_vocabulary=user_ids_vocabulary,
    movie_ids_vocabulary=movie_ids_vocabulary,
    movie_ids_dataset=movies_tfds,
    get_batch_users=lambda batch: batch[0],
    get_batch_movies=lambda batch: batch[1],
)

model.train(
    batched_dataset=train_tfds.batch(TRAIN_BATCH_SIZE),
    epochs=NUM_EPOCHS,
)

Adapting vocabularies...
Finised adapting vocabularies...
Epoch 1/3
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Epoch 2/3
Epoch 3/3


In [3]:
# Cutoffs k at which the evaluation prints "Recall at k".
evaluation_cutoffs = [1, 10, 100]

# Ground truth is handed over as a tensor, one row per test user id.
watched_tensor = tf.convert_to_tensor(test_actually_watched)

model.evaluate(
    user_ids=test_user_ids,
    actually_watched=watched_tensor,
    precision_at=evaluation_cutoffs,
)

Recall at 1 = 2.999370132272223e-05
Recall at 10 = 0.0016696493736315374
Recall at 100 = 0.0398016416552524
