In [94]:
import pandas as pd
import numpy as np
from src import configuration as config
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

In [95]:
def stack_dict(inputs, fun=tf.stack):
    values = []
    for key in sorted(inputs.keys()):
      values.append(tf.cast(inputs[key], tf.float32))

    return fun(values, axis=-1)

In [96]:
# load the data
df = config.load_traindata_for_pointwise()
df = df.drop(columns=['cv_score'])
print(df.dtypes)
df.head()

dataset      int64
model       object
tuning      object
scoring     object
encoder     object
rank       float64
dtype: object


Unnamed: 0,dataset,model,tuning,scoring,encoder,rank
0,1169,KNC,model,ACC,BUCV2RGLMME,16.0
1,1169,KNC,model,ACC,BUCV2TE,14.0
2,1169,KNC,model,ACC,CBE,22.0
3,1169,KNC,model,ACC,CE,23.0
4,1169,KNC,model,ACC,CV10RGLMME,7.0


In [97]:
def create_encoder_rankings(df):
    # Group the DataFrame by 'dataset', 'model', 'tuning', and 'scoring' columns
    grouped_df = df.groupby(['dataset', 'model', 'tuning', 'scoring'])
    
    # Create a new DataFrame to store the results
    new_df = pd.DataFrame(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder_rankings'])
    
    for group_keys, group_data in grouped_df:
        dataset, model, tuning, scoring = group_keys
        encoder_rankings = group_data.sort_values('rank', ascending=False)['encoder'].tolist()
        rankings = group_data.sort_values('rank', ascending=False)['rank'].tolist()
        new_row = {'dataset': dataset, 'model': model, 'tuning': tuning, 'scoring': scoring,
                   'encoder_rankings': [encoder_rankings], 'ranking': [rankings]}
        new_df = pd.concat([new_df, pd.DataFrame([new_row])], ignore_index=True)
    
    return new_df

# Your original DataFrame
data = {
    'dataset': [1169, 1169, 1169, 1169, 1169],
    'model': ['KNC', 'KNC', 'KNC', 'KNC', 'KNC'],
    'tuning': ['model', 'model', 'model', 'model', 'model'],
    'scoring': ['ACC', 'ACC', 'ACC', 'ACC', 'ACC'],
    'encoder': ['BUCV2RGLMME', 'BUCV2TE', 'CBE', 'CE', 'CV10RGLMME'],
    'rank': [16.0, 14.0, 22.0, 23.0, 7.0]
}

test_df = pd.DataFrame(data)

# Call the method to create the new DataFrame
new_dataframe = create_encoder_rankings(test_df)

# Display the new DataFrame
print(new_dataframe)

  dataset model tuning scoring                               encoder_rankings  \
0    1169   KNC  model     ACC  [[CE, CBE, BUCV2RGLMME, BUCV2TE, CV10RGLMME]]   

                           ranking  
0  [[23.0, 22.0, 16.0, 14.0, 7.0]]  


In [98]:
df_listwise = create_encoder_rankings(df)
print(df_listwise.head())

  dataset model tuning scoring  \
0       3   DTC   full     ACC   
1       3   DTC   full     AUC   
2       3   DTC   full      F1   
3       3   DTC  model     AUC   
4       3   DTC  model      F1   

                                    encoder_rankings  \
0  [[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...   
1  [[DE, CBE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...   
2  [[CBE, DE, PBTE01, BE, OE, ME01E, ME10E, ME1E,...   
3  [[DE, CBE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...   
4  [[CBE, DE, PBTE01, CV2TE, CV2RGLMME, CV5RGLMME...   

                                             ranking  
0  [[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...  
1  [[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...  
2  [[4.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...  
3  [[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18...  
4  [[25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0, 18...  


In [86]:
numerical_feature_names = ['dataset']
categorical_feature_names = ['model', 'tuning', 'scoring', 'encoder_rankings']
target = df.pop('rank')

df = df_listwise

inputs = {}
for name, column in df.items():
  if type(column[0]) == str:
    dtype = tf.string
  elif (name in categorical_feature_names or
        name in numerical_feature_names):
    dtype = tf.int64
  else:
    dtype = tf.float32

  inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype)
  
print(inputs)

{'dataset': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'dataset')>, 'model': <KerasTensor: shape=(None,) dtype=string (created by layer 'model')>, 'tuning': <KerasTensor: shape=(None,) dtype=string (created by layer 'tuning')>, 'scoring': <KerasTensor: shape=(None,) dtype=string (created by layer 'scoring')>, 'encoder_rankings': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'encoder_rankings')>}


In [88]:
preprocessed = []

# preprocessed.append(inputs['dataset'])
# preprocessed.append(inputs['rank'])

numeric_features = df[numerical_feature_names]

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(stack_dict(dict(numeric_features)))

numeric_inputs = {}
for name in numerical_feature_names:
  numeric_inputs[name]=inputs[name]

numeric_inputs = stack_dict(numeric_inputs)
numeric_normalized = normalizer(numeric_inputs)

preprocessed.append(numeric_normalized)

for name in categorical_feature_names:
  vocab = sorted(set(df[name]))
  print(f'name: {name}')
  print(f'vocab: {vocab}\n')

  if type(vocab[0]) is str:
    lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')
  else:
    lookup = tf.keras.layers.IntegerLookup(vocabulary=vocab, output_mode='one_hot')

  x = inputs[name][:, tf.newaxis]
  x = lookup(x)
  preprocessed.append(x)
  
  print(preprocessed)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [55]:
preprocesssed_result = tf.concat(preprocessed, axis=-1)
preprocesssed_result

<KerasTensor: shape=(None, 48) dtype=float32 (created by layer 'tf.concat_3')>

In [56]:
preprocessor = tf.keras.Model(inputs, preprocesssed_result)

In [57]:
body = tf.keras.Sequential([
  tf.keras.layers.Dense(10, activation='relu'),
  tf.keras.layers.Dense(10, activation='relu'),
  tf.keras.layers.Dense(1)
])

In [66]:
x = preprocessor(inputs)
x

<KerasTensor: shape=(None, 48) dtype=float32 (created by layer 'model_6')>

In [59]:
result = body(x)
result

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'sequential_3')>

In [60]:
model = tf.keras.Model(inputs, result)

model.compile(optimizer='adam',
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=['accuracy'])

In [61]:
history = model.fit(dict(df), target, epochs=5, batch_size=2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [89]:
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "user_rating": x["user_rating"],
})
movies = movies.map(lambda x: x["movie_title"])

unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(
    lambda x: x["user_id"]))))

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\Marco\tensorflow_datasets\movielens\100k-movies\0.1.1...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling C:\Users\Marco\tensorflow_datasets\movielens\100k-movies\0.1.1.incomplete84VJJ8\movielens-train.tfre…

[1mDataset movielens downloaded and prepared to C:\Users\Marco\tensorflow_datasets\movielens\100k-movies\0.1.1. Subsequent calls will reuse this data.[0m


In [90]:
tf.random.set_seed(42)

# Split between train and tests sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
    train,
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42
)
test = tfrs.examples.movielens.sample_listwise(
    test,
    num_list_per_user=1,
    num_examples_per_list=5,
    seed=42
)

In [91]:
import pprint
for example in train.take(1):
  pprint.pprint(example)

{'movie_title': <tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'Postman, The (1997)', b'Liar Liar (1997)', b'Contact (1997)',
       b'Welcome To Sarajevo (1997)',
       b'I Know What You Did Last Summer (1997)'], dtype=object)>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'681'>,
 'user_rating': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([4., 5., 1., 4., 1.], dtype=float32)>}


In [99]:
class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids),
      tf.keras.layers.Embedding(len(unique_user_ids) + 2, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 2, embedding_dimension)
    ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    user_embeddings = self.user_embeddings(features["user_id"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    list_length = features["movie_title"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    labels = features.pop("user_rating")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [100]:
epochs = 30

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [103]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
listwise_model.fit(cached_train, epochs=epochs, verbose=True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30

In [67]:

import array
import collections

from typing import Dict, List, Optional, Text, Tuple

def _create_feature_dict() -> Dict[Text, List[tf.Tensor]]:
  """Helper function for creating an empty feature dict for defaultdict."""
  return {"encoder": [], "rank": []}

def _sample_list(
    feature_lists: Dict[Text, List[tf.Tensor]],
    num_examples_per_list: int,
    random_state: Optional[np.random.RandomState] = None,
) -> Tuple[tf.Tensor, tf.Tensor]:
  """Function for sampling a list example from given feature lists."""
  if random_state is None:
    random_state = np.random.RandomState()

  sampled_indices = random_state.choice(
      range(len(feature_lists["encoder"])),
      size=num_examples_per_list,
      replace=False,
  )
  sampled_movie_titles = [
      feature_lists["encoder"][idx] for idx in sampled_indices
  ]
  sampled_ratings = [
      feature_lists["rank"][idx]
      for idx in sampled_indices
  ]

  return (
      tf.stack(sampled_movie_titles, 0),
      tf.stack(sampled_ratings, 0),
  )

def sample_listwise(
    rating_dataset: tf.data.Dataset,
    num_list_per_user: int = 10,
    num_examples_per_list: int = 10,
    seed: Optional[int] = None,
) -> tf.data.Dataset:
  """Function for converting the MovieLens 100K dataset to a listwise dataset.

  Args:
      rating_dataset:
        The MovieLens ratings dataset loaded from TFDS with features
        "movie_title", "user_id", and "user_rating".
      num_list_per_user:
        An integer representing the number of lists that should be sampled for
        each user in the training dataset.
      num_examples_per_list:
        An integer representing the number of movies to be sampled for each list
        from the list of movies rated by the user.
      seed:
        An integer for creating `np.random.RandomState`.

  Returns:
      A tf.data.Dataset containing list examples.

      Each example contains three keys: "user_id", "movie_title", and
      "user_rating". "user_id" maps to a string tensor that represents the user
      id for the example. "movie_title" maps to a tensor of shape
      [sum(num_example_per_list)] with dtype tf.string. It represents the list
      of candidate movie ids. "user_rating" maps to a tensor of shape
      [sum(num_example_per_list)] with dtype tf.float32. It represents the
      rating of each movie in the candidate list.
  """
  random_state = np.random.RandomState(seed)

  example_lists_by_user = collections.defaultdict(_create_feature_dict)

  movie_title_vocab = set()
  for example in rating_dataset:
    user_id = example["dataset"].numpy()
    example_lists_by_user[user_id]["encoder"].append(
        example["encoder"])
    example_lists_by_user[user_id]["rank"].append(
        example["rank"])
    movie_title_vocab.add(example["encoder"].numpy())

  tensor_slices = {"dataset": [], "encoder": [], "rank": []}

  for user_id, feature_lists in example_lists_by_user.items():
    for _ in range(num_list_per_user):

      # Drop the user if they don't have enough ratings.
      if len(feature_lists["encoder"]) < num_examples_per_list:
        continue

      sampled_movie_titles, sampled_ratings = _sample_list(
          feature_lists,
          num_examples_per_list,
          random_state=random_state,
      )
      tensor_slices["dataset"].append(user_id)
      tensor_slices["encoder"].append(sampled_movie_titles)
      tensor_slices["rank"].append(sampled_ratings)

  return tf.data.Dataset.from_tensor_slices(tensor_slices)

test = sample_listwise(x, 1, 5, 42)
print(test)

TypeError: Cannot iterate over a Tensor with unknown first dimension.