In [94]:
import pandas as pd
import numpy as np
from src import configuration as config
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
from tensorflow.keras import layers

In [110]:
# load the data
df = config.load_traindata_for_pointwise()
df = df.drop(columns=['cv_score'])
df['dataset'] = df['dataset'].astype(str)
print(df.dtypes)
df.head()
features = ['dataset', 'model', 'tuning', 'scoring']

dataset     object
model       object
tuning      object
scoring     object
encoder     object
rank       float64
dtype: object


In [96]:
df_tf = tf.data.Dataset.from_tensor_slices(dict(df))
print(type(df_tf))

<class 'tensorflow.python.data.ops.from_tensor_slices_op._TensorSliceDataset'>


In [97]:
import array
import collections

from typing import Dict, List, Optional, Text, Tuple

def _create_feature_dict() -> Dict[Text, List[tf.Tensor]]:
  """Helper function for creating an empty feature dict for defaultdict."""
  return {"encoder": [], "rank": []}


def _sample_list(
    feature_lists: Dict[Text, List[tf.Tensor]],
    num_examples_per_list: int,
    random_state: Optional[np.random.RandomState] = None,
) -> Tuple[tf.Tensor, tf.Tensor]:
  """Function for sampling a list example from given feature lists."""
  if random_state is None:
    random_state = np.random.RandomState()

  sampled_indices = random_state.choice(
      range(len(feature_lists["encoder"])),
      size=num_examples_per_list,
      replace=False,
  )
  sampled_movie_titles = [
      feature_lists["encoder"][idx] for idx in sampled_indices
  ]
  sampled_ratings = [
      feature_lists["rank"][idx]
      for idx in sampled_indices
  ]

  return (
      tf.stack(sampled_movie_titles, 0),
      tf.stack(sampled_ratings, 0),
  )


def sample_listwise(
    rating_dataset: tf.data.Dataset,
    num_list_per_user: int = 10,
    num_examples_per_list: int = 10,
    seed: Optional[int] = None,
) -> tf.data.Dataset:
  """Function for converting the MovieLens 100K dataset to a listwise dataset.

  Args:
      rating_dataset:
        The MovieLens ratings dataset loaded from TFDS with features
        "movie_title", "user_id", and "user_rating".
      num_list_per_user:
        An integer representing the number of lists that should be sampled for
        each user in the training dataset.
      num_examples_per_list:
        An integer representing the number of movies to be sampled for each list
        from the list of movies rated by the user.
      seed:
        An integer for creating `np.random.RandomState`.

  Returns:
      A tf.data.Dataset containing list examples.

      Each example contains three keys: "user_id", "movie_title", and
      "user_rating". "user_id" maps to a string tensor that represents the user
      id for the example. "movie_title" maps to a tensor of shape
      [sum(num_example_per_list)] with dtype tf.string. It represents the list
      of candidate movie ids. "user_rating" maps to a tensor of shape
      [sum(num_example_per_list)] with dtype tf.float32. It represents the
      rating of each movie in the candidate list.
  """
  random_state = np.random.RandomState(seed)

  example_lists_by_user = collections.defaultdict(_create_feature_dict)

  movie_title_vocab = set()
  for example in rating_dataset:
    user_id = example["dataset"].numpy()
    example_lists_by_user[user_id]["encoder"].append(
        example["encoder"])
    example_lists_by_user[user_id]["rank"].append(
        example["rank"])
    movie_title_vocab.add(example["encoder"].numpy())

  tensor_slices = {"dataset": [], "encoder": [], "rank": []}

  for user_id, feature_lists in example_lists_by_user.items():
    for _ in range(num_list_per_user):

      # Drop the user if they don't have enough ratings.
      if len(feature_lists["encoder"]) < num_examples_per_list:
        continue

      sampled_movie_titles, sampled_ratings = _sample_list(
          feature_lists,
          num_examples_per_list,
          random_state=random_state,
      )
      tensor_slices["dataset"].append(user_id)
      tensor_slices["encoder"].append(sampled_movie_titles)
      tensor_slices["rank"].append(sampled_ratings)

  return tf.data.Dataset.from_tensor_slices(tensor_slices)

df_listwise = sample_listwise(df_tf)

In [98]:
df_listwise

<_TensorSliceDataset element_spec={'dataset': TensorSpec(shape=(), dtype=tf.string, name=None), 'encoder': TensorSpec(shape=(10,), dtype=tf.string, name=None), 'rank': TensorSpec(shape=(10,), dtype=tf.float64, name=None)}>

In [99]:
import pprint
for example in df_listwise.take(1):
  pprint.pprint(example)

{'dataset': <tf.Tensor: shape=(), dtype=string, numpy=b'1169'>,
 'encoder': <tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'CV2RGLMME', b'CV10RGLMME', b'CV5RGLMME', b'DTEM10', b'CV2TE',
       b'CV10RGLMME', b'CV2RGLMME', b'BE', b'CBE', b'OE'], dtype=object)>,
 'rank': <tf.Tensor: shape=(10,), dtype=float64, numpy=array([15., 23., 17., 24.,  6.,  7., 18., 15.,  0., 25.])>}


In [100]:
epochs = 1

cached_train = df_listwise.shuffle(100_000).batch(8192).cache()

In [101]:
cached_train

<CacheDataset element_spec={'dataset': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'encoder': TensorSpec(shape=(None, 10), dtype=tf.string, name=None), 'rank': TensorSpec(shape=(None, 10), dtype=tf.float64, name=None)}>

In [102]:
# convert to a array containing all unique combinations of model, tuning, scoring as byte strings
# unique_factor_combinations = np.unique(df_listwise[['model', 'tuning', 'scoring']])
# unique_factor_combinations = unique_factor_combinations.astype('S')
# print(unique_factor_combinations)

# unique_model_combinations = np.unique(df_listwise['model'])
# unique_model_combinations = unique_factor_combinations.astype('S')

# unique_tuning_combinations = np.unique(df_listwise['tuning'])
# unique_tuning_combinations = unique_factor_combinations.astype('S')

unique_factor_combinations = np.unique(df[['dataset']])
unique_factor_combinations = unique_factor_combinations.astype('S')
print(unique_factor_combinations)

unique_encoder_rankings = np.unique(df[['encoder']])
unique_encoder_rankings = unique_encoder_rankings.astype('S')
print(unique_encoder_rankings)

[b'1037' b'1111' b'1112' b'1114' b'1169' b'1235' b'1461' b'1463' b'1486'
 b'1506' b'1511' b'1590' b'23381' b'29' b'3' b'31' b'333' b'334' b'38'
 b'40536' b'40945' b'40981' b'40999' b'41005' b'41007' b'41162' b'41224'
 b'42178' b'42343' b'42344' b'42738' b'42750' b'43098' b'43607' b'43890'
 b'43892' b'43896' b'43897' b'43900' b'43922' b'451' b'470' b'50' b'51'
 b'56' b'6332' b'881' b'956' b'959' b'981']
[b'BE' b'BUCV10RGLMME' b'BUCV10TE' b'BUCV2RGLMME' b'BUCV2TE'
 b'BUCV5RGLMME' b'BUCV5TE' b'CBE' b'CE' b'CV10RGLMME' b'CV10TE'
 b'CV2RGLMME' b'CV2TE' b'CV5RGLMME' b'CV5TE' b'DE' b'DTEM10' b'DTEM2'
 b'DTEM5' b'ME01E' b'ME10E' b'ME1E' b'MHE' b'OE' b'OHE' b'PBTE0001'
 b'PBTE001' b'PBTE01' b'RGLMME' b'SE' b'TE' b'WOEE']


In [107]:
def stack_dict(inputs, fun=tf.stack):
    values = []
    for key in sorted(inputs.keys()):
      values.append(tf.cast(inputs[key], tf.float32))

    return fun(values, axis=-1)

class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32
    print("STARTING INIT")
    # Compute embeddings for factor combinations.
    self.factors_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_factor_combinations),
      tf.keras.layers.Embedding(len(unique_factor_combinations) + 2, embedding_dimension)
    ])
    
    # Compute embeddings for encoder combinations.
    self.encoder_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_encoder_rankings),
      tf.keras.layers.Embedding(len(unique_encoder_rankings) + 2, embedding_dimension)
    ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )
    print("FINISHED INIT")

  def call(self, features):
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    user_embeddings = self.factors_embeddings(features["dataset"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    movie_embeddings = self.encoder_embeddings(features["encoder"])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    print(features["encoder"].shape)
    print(features["dataset"].shape)
    #list_length = features["encoder"].shape[1]
    # get list length for my shape (10,) tensor
    list_length = features["encoder"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    labels = features.pop("rank")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [108]:
epochs = 30
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

STARTING INIT
FINISHED INIT


In [109]:
listwise_model.fit(cached_train, epochs=2, verbose=True)

Epoch 1/2
(None, 10)


(None,)
(None, 10)
(None,)
Epoch 2/2


<keras.src.callbacks.History at 0x1bdd89232d0>