In [141]:
import pandas as pd
import numpy as np
from src import configuration as config
from src.pipeline.evaluation.evaluation_utils import custom_train_test_split
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
from tensorflow.keras import layers

In [142]:
# Load the pointwise training data; the 'cv_score' column is not used below.
df = config.load_traindata_for_pointwise().drop(columns=['cv_score'])

# Split into train/test partitions with "rank" as the target column.
# NOTE(review): presumably the split keeps each factor combination within one
# partition — confirm against custom_train_test_split.
X_train, X_test, y_train, y_test = custom_train_test_split(
    df,
    factors=["dataset", "model", "tuning", "scoring"],
    target="rank",
)

# Re-attach the target to its inputs so each partition is a single frame again.
df_test = pd.concat([X_test, y_test], axis=1)
df = pd.concat([X_train, y_train], axis=1)

In [143]:
# prepare the data
def _join_factor_columns(frame: pd.DataFrame) -> pd.DataFrame:
  """Merge the four factor columns of `frame` into one 'features' string column.

  The result is a new frame in which 'dataset', 'model', 'tuning' and
  'scoring' are replaced by a single space-separated 'features' column
  appended after the remaining columns. Only 'dataset' is cast to `str`;
  the other three columns are concatenated as they are.

  Args:
      frame: Frame containing the columns 'dataset', 'model', 'tuning' and
        'scoring'.

  Returns:
      A new DataFrame; `frame` itself is left unmodified.
  """
  factor_columns = ['dataset', 'model', 'tuning', 'scoring']
  features = (
      frame['dataset'].astype(str)
      + ' ' + frame['model']
      + ' ' + frame['tuning']
      + ' ' + frame['scoring']
  )
  return frame.assign(features=features).drop(columns=factor_columns)


# Train and test go through the same transformation so the 'features' strings
# are built identically in both partitions.
df = _join_factor_columns(df)
df_test = _join_factor_columns(df_test)
print(df.dtypes)

encoder      object
rank        float64
features     object
dtype: object


In [144]:
# Wrap both frames as tf.data datasets; every element is a dict keyed by
# column name.
df_tf, df_tf_test = (
    tf.data.Dataset.from_tensor_slices(dict(frame)) for frame in (df, df_test)
)
print(type(df_tf))

<class 'tensorflow.python.data.ops.from_tensor_slices_op._TensorSliceDataset'>


In [145]:
import array
import collections

from typing import Dict, List, Optional, Text, Tuple

def _create_feature_dict() -> Dict[Text, List[tf.Tensor]]:
  """Helper function for creating an empty feature dict for defaultdict."""
  return {"encoder": [], "rank": []}


def _sample_list(
    feature_lists: Dict[Text, List[tf.Tensor]],
    num_examples_per_list: int,
    random_state: Optional[np.random.RandomState] = None,
) -> Tuple[tf.Tensor, tf.Tensor]:
  """Draw one list of distinct (encoder, rank) pairs from a group's examples.

  Args:
      feature_lists:
        Mapping with parallel "encoder" and "rank" lists; position i in both
        lists belongs to the same example.
      num_examples_per_list:
        Number of distinct positions to draw (sampling is without
        replacement).
      random_state:
        Source of randomness. A fresh, unseeded `np.random.RandomState` is
        created when omitted.

  Returns:
      A pair `(encoders, ranks)` of tensors stacked along axis 0, both in the
      same sampled order.
  """
  rng = np.random.RandomState() if random_state is None else random_state

  encoders = feature_lists["encoder"]
  ranks = feature_lists["rank"]

  # Positions are drawn once so that encoders and ranks stay aligned.
  positions = rng.choice(
      range(len(encoders)),
      size=num_examples_per_list,
      replace=False,
  )

  picked_encoders = [encoders[pos] for pos in positions]
  picked_ranks = [ranks[pos] for pos in positions]

  return tf.stack(picked_encoders, 0), tf.stack(picked_ranks, 0)


def sample_listwise(
    rating_dataset: tf.data.Dataset,
    num_list_per_user: int = 10,
    num_examples_per_list: int = 10,
    seed: Optional[int] = None,
) -> tf.data.Dataset:
  """Convert a pointwise ranking dataset into a listwise dataset.

  Examples are grouped by their "features" value. For every group that holds
  at least `num_examples_per_list` examples, `num_list_per_user` lists are
  sampled; each list contains `num_examples_per_list` distinct examples of
  that group. Groups with fewer examples are dropped entirely.

  The parameter names `rating_dataset` and `num_list_per_user` are kept for
  backward compatibility with existing callers; here a "user" is one distinct
  "features" value.

  Args:
      rating_dataset:
        Dataset whose elements are dicts with the keys "features", "encoder"
        and "rank". "features" must be a scalar tensor because its `.numpy()`
        value is used as the grouping key.
      num_list_per_user:
        Number of lists to sample for each group.
      num_examples_per_list:
        Number of examples drawn (without replacement) for each list.
      seed:
        Seed for the `np.random.RandomState` that drives the sampling. With
        `None` the sampled lists differ between runs.

  Returns:
      A tf.data.Dataset of list examples with three keys: "features" holds
      the group key, "encoder" holds the `num_examples_per_list` sampled
      encoder values and "rank" holds the rank of each sampled encoder in the
      same order.
  """
  random_state = np.random.RandomState(seed)

  # Collect the encoder/rank tensors of every example under its group key.
  examples_by_group = collections.defaultdict(_create_feature_dict)
  for example in rating_dataset:
    group_key = example["features"].numpy()
    examples_by_group[group_key]["encoder"].append(example["encoder"])
    examples_by_group[group_key]["rank"].append(example["rank"])

  tensor_slices = {"features": [], "encoder": [], "rank": []}

  for group_key, feature_lists in examples_by_group.items():
    # Sampling is without replacement, so a group that cannot fill one list
    # is skipped once instead of being re-checked for every requested list.
    if len(feature_lists["encoder"]) < num_examples_per_list:
      continue

    for _ in range(num_list_per_user):
      sampled_encoders, sampled_ranks = _sample_list(
          feature_lists,
          num_examples_per_list,
          random_state=random_state,
      )
      tensor_slices["features"].append(group_key)
      tensor_slices["encoder"].append(sampled_encoders)
      tensor_slices["rank"].append(sampled_ranks)

  return tf.data.Dataset.from_tensor_slices(tensor_slices)

# A fixed seed makes the list sampling itself reproducible; without it every
# run draws different lists and the results below cannot be regenerated.
LISTWISE_SAMPLING_SEED = 42

df_listwise = sample_listwise(df_tf, seed=LISTWISE_SAMPLING_SEED)
df_listwise_test = sample_listwise(df_tf_test, seed=LISTWISE_SAMPLING_SEED)

In [146]:
# Display the list-wise training dataset to inspect its element spec.
df_listwise

<_TensorSliceDataset element_spec={'features': TensorSpec(shape=(), dtype=tf.string, name=None), 'encoder': TensorSpec(shape=(10,), dtype=tf.string, name=None), 'rank': TensorSpec(shape=(10,), dtype=tf.float64, name=None)}>

In [147]:
import pprint

# Print a single list-wise training example to check its keys, shapes and
# dtypes.
for example in df_listwise.take(count=1):
    pprint.pprint(example)

{'encoder': <tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'MHE', b'OHE', b'BUCV10RGLMME', b'PBTE001', b'BUCV5TE', b'CBE',
       b'CV2TE', b'DTEM5', b'DTEM10', b'BE'], dtype=object)>,
 'features': <tf.Tensor: shape=(), dtype=string, numpy=b'1114 KNC no F1'>,
 'rank': <tf.Tensor: shape=(10,), dtype=float64, numpy=array([25., 15., 19.,  3., 29.,  0.,  8., 23., 22., 21.])>}


In [148]:
# Training pipeline: shuffle, batch, then cache the batches.
# NOTE(review): cache() comes after shuffle(), so the cached batches are
# likely replayed in the same order every epoch — confirm this is intended.
cached_train = (
    df_listwise
    .shuffle(buffer_size=100_000)
    .batch(batch_size=8192)
    .cache()
)

# Evaluation pipeline: batch and cache only, no shuffling.
cached_test = (
    df_listwise_test
    .batch(batch_size=4096)
    .cache()
)

In [149]:
# Display the batched training dataset; batching adds a leading batch dimension.
cached_train

<CacheDataset element_spec={'features': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'encoder': TensorSpec(shape=(None, 10), dtype=tf.string, name=None), 'rank': TensorSpec(shape=(None, 10), dtype=tf.float64, name=None)}>

In [150]:
# Vocabularies for the lookup layers, taken from the training frame only.
# np.unique returns the distinct values in sorted order; astype('S') turns them
# into byte strings, matching the byte-string values held by the datasets.
unique_factor_combinations = np.unique(df[['features']]).astype('S')
print(unique_factor_combinations)

unique_encoder_rankings = np.unique(df[['encoder']]).astype('S')
print(unique_encoder_rankings)

[b'1037 DTC full AUC' b'1037 DTC full F1' b'1037 DTC model ACC'
 b'1037 DTC model F1' b'1037 DTC no F1' b'1037 KNC model AUC'
 b'1037 KNC model F1' b'1037 KNC no F1' b'1037 LGBMC no F1'
 b'1037 LR full ACC' b'1037 LR full F1' b'1037 LR model F1'
 b'1037 LR no ACC' b'1037 LR no AUC' b'1037 SVC full ACC'
 b'1037 SVC full AUC' b'1037 SVC full F1' b'1037 SVC no ACC'
 b'1037 SVC no AUC' b'1037 SVC no F1' b'1111 DTC model ACC'
 b'1111 DTC model AUC' b'1111 DTC no AUC' b'1111 DTC no F1'
 b'1111 KNC model ACC' b'1111 KNC model F1' b'1111 KNC no ACC'
 b'1111 LGBMC no ACC' b'1111 LGBMC no AUC' b'1111 LGBMC no F1'
 b'1111 LR model ACC' b'1111 LR model F1' b'1111 LR no AUC'
 b'1111 SVC no F1' b'1112 DTC model ACC' b'1112 DTC model F1'
 b'1112 DTC no F1' b'1112 KNC model ACC' b'1112 KNC model AUC'
 b'1112 KNC model F1' b'1112 KNC no F1' b'1112 LGBMC no F1'
 b'1112 LR model AUC' b'1112 LR no ACC' b'1112 LR no AUC' b'1112 LR no F1'
 b'1112 SVC no ACC' b'1112 SVC no F1' b'1114 DTC model F1'
 b'1114 DT

In [151]:
def stack_dict(inputs, fun=tf.stack):
    """Cast every value of `inputs` to float32 and combine them along the last axis.

    Values are visited in sorted key order, so the layout of the result does
    not depend on the insertion order of the dict.

    Args:
        inputs: Mapping from names to tensor-like values.
        fun: Combining function called as `fun(values, axis=-1)`; defaults to
            `tf.stack`.

    Returns:
        Whatever `fun` returns for the list of float32 tensors.
    """
    float_values = [
        tf.cast(inputs[name], tf.float32) for name in sorted(inputs.keys())
    ]
    return fun(float_values, axis=-1)

class RankingModel(tfrs.Model):
  """List-wise ranking model that scores encoders for a factor combination.

  Each input example carries one "features" string (the factor combination)
  and a list of "encoder" strings. Both are embedded, the factor embedding is
  repeated for every list position, and a small dense network produces one
  score per encoder in the list.

  The vocabularies are read from the module-level arrays
  `unique_factor_combinations` and `unique_encoder_rankings`, which must be
  defined before the model is constructed.
  """

  def __init__(self, loss):
    """Build the embedding towers, the scoring network and the ranking task.

    Args:
        loss: Loss object handed to `tfrs.tasks.Ranking`.
    """
    super().__init__()
    embedding_dimension = 32

    # Embeddings for factor combinations. The `+ 2` leaves headroom beyond the
    # vocabulary for the index StringLookup assigns to unknown strings.
    self.factors_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_factor_combinations),
      tf.keras.layers.Embedding(len(unique_factor_combinations) + 2, embedding_dimension)
    ])

    # Embeddings for encoders, built the same way.
    self.encoder_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_encoder_rankings),
      tf.keras.layers.Embedding(len(unique_encoder_rankings) + 2, embedding_dimension)
    ])

    # Scoring network: two hidden layers, then one score per list position.
    self.score_model = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
    ])

    # NOTE(review): list-wise losses and NDCG usually treat larger labels as
    # more relevant. If a rank of 0 denotes the best encoder, the labels may
    # need to be inverted before training — confirm.
    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    """Score every encoder in each list.

    Args:
        features: Dict with "features" (one string per example) and "encoder"
          (a list of strings per example). Other keys are ignored.

    Returns:
        Tensor of scores with a trailing dimension of size 1, one score per
        list position.
    """
    # One embedding per example: [batch_size, embedding_dim].
    factor_embeddings = self.factors_embeddings(features["features"])

    # One embedding per list position: [batch_size, list_length, embedding_dim].
    encoder_embeddings = self.encoder_embeddings(features["encoder"])

    # Prefer the static list length; fall back to the dynamic shape when it is
    # unknown, because tf.repeat cannot take None as a repeat count.
    list_length = features["encoder"].shape[1]
    if list_length is None:
      list_length = tf.shape(features["encoder"])[1]

    # Repeat the factor embedding along a new list axis so that it lines up
    # with the encoder embeddings.
    factor_embeddings_repeated = tf.repeat(
        tf.expand_dims(factor_embeddings, 1), [list_length], axis=1)

    # Concatenate along the embedding axis and score each position.
    concatenated_embeddings = tf.concat(
        [factor_embeddings_repeated, encoder_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    """Compute the ranking loss for one batch.

    Args:
        features: Batch dict containing the model inputs plus the "rank"
          labels. The dict is not modified.
        training: Accepted for interface compatibility; not used here.

    Returns:
        The value returned by the ranking task for this batch.

    Raises:
        KeyError: If `features` has no "rank" entry.
    """
    labels = features["rank"]

    # Build a separate input dict instead of popping from `features`, so the
    # caller's batch keeps its labels and can be reused.
    model_inputs = {
        name: value for name, value in features.items() if name != "rank"
    }

    scores = self(model_inputs)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [152]:
# NOTE: `epochs` is not passed to the `fit` call below, which hard-codes
# `epochs=10`.
epochs = 30

# List-wise model trained with the ListMLE loss and Adagrad at learning rate 0.1.
listwise_model = RankingModel(loss=tfr.keras.losses.ListMLELoss())
listwise_model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [153]:
# Train for 10 epochs on the cached list-wise batches; the returned History
# object is the cell output.
listwise_model.fit(x=cached_train, epochs=10, verbose=True)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1bddb7dd6d0>

In [155]:
# Evaluate on the held-out lists; return_dict=True exposes the metrics by name.
listwise_model_result = listwise_model.evaluate(cached_test, return_dict=True)
# The model is trained with ListMLELoss, so the label names that loss rather
# than MSE.
print("NDCG of the ListMLE model: {:.4f}".format(listwise_model_result["ndcg_metric"]))

NDCG of the MSE Model: 0.6106


In [159]:
# Score every list in the test set. The result keeps the trailing size-1 axis
# produced by the final Dense(1) layer.
prediction = listwise_model.predict(x=cached_test)
prediction.shape



(2890, 10, 1)