# Import Libraries

In [None]:
!pip install -q protobuf==3.19.6
!pip install -q tensorflow==2.11.1 --no-deps
!pip install -q tensorflow-recommenders=='v0.7.3' --no-deps
!pip install -q tensorflow-datasets==3.2.0 --no-deps
!pip install -q tensorflow-metadata==0.22.2 --no-deps
!pip install -q scann
!pip install -q dill==0.3.6

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m17.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.
tensorflow-datasets 4.9.2 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.
tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.

In [None]:
import os
import pprint
import tempfile

from typing import Dict, Text

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Data Preparation

In [None]:
# Read the two CSV inputs: the transactions table and the products table.
df_ecommerce, df_products = (
    pd.read_csv(csv_name) for csv_name in ('ecommerce.csv', 'products.csv')
)

In [None]:
# Turn each dataframe into a tf.data.Dataset whose elements are
# {column name: value} dictionaries, one per row.
ds_ecommerce = tf.data.Dataset.from_tensor_slices(dict(df_ecommerce.items()))
ds_products = tf.data.Dataset.from_tensor_slices(dict(df_products.items()))

In [None]:
# Select fields
def _select_features(row):
  """Keep the features the models use from one transaction row.

  `user_id` and `product_id` are converted to strings; `age` and
  `search_query` are passed through unchanged.
  """
  return {
      'user_id': tf.strings.as_string(row['user_id']),
      'product_id': tf.strings.as_string(row['product_id']),
      'age': row['age'],
      'search_query': row['search_query'],
  }

# Built from the dataframe rather than from the previous `ds_ecommerce`, so
# re-running this cell does not apply `as_string` to already-converted IDs.
ds_ecommerce = tf.data.Dataset.from_tensor_slices(dict(df_ecommerce)).map(_select_features)

# Product IDs as strings, in batches of 32.
ds_products = tf.data.Dataset.from_tensor_slices(dict(df_products))
ds_products = ds_products.batch(32).map(lambda x: tf.strings.as_string(x['product_id']))

# Get all the user IDs (batched so the per-batch vectors can be concatenated)
user_ids = ds_ecommerce.batch(1000000).map(lambda x: x["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_ids))).astype(str)

# Get all the product IDs that occur in the transactions
product_ids = ds_ecommerce.batch(1000000).map(lambda x: x["product_id"])
unique_product_ids = np.unique(np.concatenate(list(product_ids))).astype(str)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [None]:
# Candidate product IDs: string IDs from the products table, 500 per batch.
ds_products = (
    tf.data.Dataset.from_tensor_slices(dict(df_products))
      .batch(500)
      .map(lambda batch: tf.strings.as_string(batch['product_id']))
)

# Module 01 - Two-Towers (IDs Only)

In [None]:
# User and Product models.
class UserModel(tf.keras.Model):
  """User tower: looks up a 32-dimensional embedding for each user ID string."""

  def __init__(self, unique_user_ids):
    """Build the lookup + embedding stack from the known user-ID vocabulary."""
    super().__init__()

    # String -> integer index, using the supplied IDs as the vocabulary.
    id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_ids, mask_token=None)
    # Embedding table with one more row than there are vocabulary entries.
    id_embedding = tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)

    self.user_embedding = tf.keras.Sequential([id_lookup, id_embedding])

  def call(self, inputs):
    """Return the embeddings for a batch of user-ID strings."""
    return self.user_embedding(inputs)

class ProductModel(tf.keras.Model):
  """Product tower: looks up a 32-dimensional embedding for each product ID string."""

  def __init__(self, unique_product_ids):
    """Build the lookup + embedding stack from the known product-ID vocabulary."""
    super().__init__()

    # String -> integer index, using the supplied IDs as the vocabulary.
    id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_product_ids, mask_token=None)
    # Embedding table with one more row than there are vocabulary entries.
    id_embedding = tf.keras.layers.Embedding(len(unique_product_ids) + 1, 32)

    self.product_embedding = tf.keras.Sequential([id_lookup, id_embedding])

  def call(self, inputs):
    """Return the embeddings for a batch of product-ID strings."""
    return self.product_embedding(inputs)

# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):
  """Two-tower retrieval model driven by user and product IDs only."""

  def __init__(self, user_model, product_model, task):
    """Store the two towers and the task object used to compute the loss."""
    super().__init__()
    self.user_model, self.product_model = user_model, product_model
    self.task = task

  def compute_loss(self, features, training=False):
    """Embed the user and product IDs in `features` and hand both to the task."""
    query_vectors = self.user_model(features["user_id"])
    candidate_vectors = self.product_model(features["product_id"])
    return self.task(query_vectors, candidate_vectors)

# Build the two towers from the ID vocabularies.
user_model = UserModel(unique_user_ids)
product_model = ProductModel(unique_product_ids)

# Candidate embeddings for the top-K metric: every unique product ID,
# embedded 128 at a time.
product_embeddings = (
    tf.data.Dataset.from_tensor_slices(unique_product_ids)
      .batch(128)
      .map(product_model)
)

# Retrieval task, evaluated with factorized top-K metrics over those candidates.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings)
)

# Assemble the model and compile it with Adagrad (learning rate 0.1).
model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
# Train for 1 epoch, feeding the transactions in batches of 1000 rows.
model.fit(ds_ecommerce.batch(1000), epochs=1)



<keras.callbacks.History at 0x7f4359b0e8c0>

In [None]:
K = 3

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Populate the index with (product ID, product embedding) pairs, one batch of
# `ds_products` at a time. Supplying the IDs next to the embeddings makes the
# index return product IDs; with embeddings alone it returns the row position
# of each candidate instead.
product_embeddings_to_index = tf.data.Dataset.zip(
    (ds_products, ds_products.map(model.product_model)))
index.index_from_dataset(product_embeddings_to_index)

# Get some recommendations.
user_id = "1"
_, products = index(np.array([user_id]))
print(f"Top 3 recommendations for user {user_id}: {products[0, :K]}")

Top 3 recommendations for user 1: [1116   11 1199]


# Module 02 - Two-Towers w/ Meta Data

In [None]:
# User and Product models.
class ProductModel(tf.keras.Model):
  """Product tower: maps product-ID strings to 32-dimensional embeddings."""

  def __init__(self, unique_product_ids):
    """Create the ID lookup and its embedding table from the ID vocabulary."""
    super().__init__()

    vocabulary_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_product_ids, mask_token=None)
    # One more embedding row than there are vocabulary entries.
    embedding_table = tf.keras.layers.Embedding(len(unique_product_ids) + 1, 32)

    self.product_embedding = tf.keras.Sequential(
        [vocabulary_lookup, embedding_table])

  def call(self, inputs):
    """Embed a batch of product-ID strings."""
    return self.product_embedding(inputs)

class UserModelMetaData(tf.keras.Model):
  """User tower combining a user-ID embedding with the normalized user age."""

  def __init__(self, unique_user_id_list):
    """Create the ID embedding stack and the age normalization layer."""
    super().__init__()

    # User ID -> integer index -> 32-dimensional embedding
    id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_id_list, mask_token=None)
    id_embedding = tf.keras.layers.Embedding(len(unique_user_id_list) + 1, 32)
    self.user_embedding = tf.keras.Sequential([id_lookup, id_embedding])

    # Age is normalized with a single mean/variance (axis=None)
    self.age_normalizer = tf.keras.layers.experimental.preprocessing.Normalization(axis=None)

  def call(self, inputs):
    """Return the ID embedding with the normalized age appended as one column.

    `inputs` is a dictionary with the keys 'user_id' and 'age'.
    """
    id_vector = self.user_embedding(inputs['user_id'])
    age_column = tf.reshape(self.age_normalizer(inputs['age']), (-1, 1))
    return tf.concat([id_vector, age_column], axis=1)

  def adapt(self, data):
    """Fit the age normalizer on the 'age' feature of the dataset `data`."""
    self.age_normalizer.adapt(data.map(lambda row: row['age']))

# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):
  """Two-tower retrieval model whose user tower also consumes the user's age."""

  def __init__(self, user_model, product_model, task):
    """Wrap the user tower with a 32-unit Dense layer and store the rest."""
    super().__init__()
    # The Dense layer maps the user tower's output to 32 units.
    projection = tf.keras.layers.Dense(32)
    self.user_model = tf.keras.Sequential([user_model, projection])
    self.product_model = product_model
    self.task = task

  def compute_loss(self, features, training=False):
    """Embed users (ID + age) and products from `features`, then call the task."""
    user_inputs = {name: features[name] for name in ('user_id', 'age')}
    query_vectors = self.user_model(user_inputs)
    candidate_vectors = self.product_model(features['product_id'])
    return self.task(query_vectors, candidate_vectors)

# You need to gather the unique user ids and product ids to instantiate the models.
user_ids = ds_ecommerce.batch(1000000).map(lambda x: x["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_ids))).astype(str)

# Get all the product IDs
product_ids = ds_ecommerce.batch(1000000).map(lambda x: x["product_id"])
unique_product_ids = np.unique(np.concatenate(list(product_ids))).astype(str)

# Instantiate and compile the model.
user_model = UserModelMetaData(unique_user_ids)
# Fit the age normalizer on the training data before training. If this step is
# skipped, the Normalization layer has no statistics learned from the ages it
# is supposed to standardize.
user_model.adapt(ds_ecommerce)

product_model = ProductModel(unique_product_ids)

# Calculate embeddings for all products.
product_embeddings = ds_products.map(product_model)

# Specify the task.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings)
)

model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [None]:
# Train for 1 epoch, feeding the transactions in batches of 1000 rows.
model.fit(ds_ecommerce.batch(1000), epochs=1)





<keras.callbacks.History at 0x7f42983e9f90>

In [None]:
K = 3

# Get some recommendations.
user_id = tf.constant(["1"])  # user_id should be a string tensor
user_age = tf.constant([25])  # user_age should be a numeric tensor

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Add candidates to the index as (product ID, product embedding) pairs so that
# queries return product IDs; with embeddings alone the index returns the row
# position of each candidate instead.
product_embeddings_to_index = tf.data.Dataset.zip(
    (ds_products, ds_products.map(model.product_model)))
index.index_from_dataset(product_embeddings_to_index)

# You need to pass the user_id and age in a dictionary.
_, products = index({"user_id": user_id, "age": user_age})
print(f"Top 3 recommendations for user {user_id}: {products[0, :K]}")



Top 3 recommendations for user [b'1']: [1518 1256  890]


# Module 03 - Two-Towers w/ Search Query

In [None]:
# User and Product models.
class ProductModel(tf.keras.Model):
  """Product tower: turns product-ID strings into 32-dimensional embeddings."""

  def __init__(self, unique_product_ids):
    """Set up the string lookup and embedding table for the given ID vocabulary."""
    super().__init__()

    layers = [
      # Product-ID string -> integer index
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_product_ids, mask_token=None),
      # Integer index -> embedding (one more row than vocabulary entries)
      tf.keras.layers.Embedding(len(unique_product_ids) + 1, 32),
    ]
    self.product_embedding = tf.keras.Sequential(layers)

  def call(self, inputs):
    """Embed a batch of product-ID strings."""
    return self.product_embedding(inputs)

class UserModelSearchData(tf.keras.Model):
  """User tower combining user ID, normalized age and the search query text."""

  def __init__(self, unique_user_id_list, max_tokens=1000, output_sequence_length=30):
    """Create the feature layers.

    Args:
      unique_user_id_list: vocabulary of user-ID strings for the ID lookup.
      max_tokens: vocabulary size of the search-query vectorizer and number of
        rows in the token embedding table.
      output_sequence_length: length each vectorized search query is padded or
        truncated to.
    """
    super().__init__()

    # User ID Embedding
    self.user_embedding = tf.keras.Sequential([
      tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_id_list, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_id_list) + 1, 32),
    ])

    # User age
    self.age_normalizer = tf.keras.layers.experimental.preprocessing.Normalization(axis=None)

    # Search Query Embedding
    self.search_vectorization = tf.keras.layers.experimental.preprocessing.TextVectorization(
        max_tokens=max_tokens, output_sequence_length=output_sequence_length)
    self.search_embedding = tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True)

    self.dense = tf.keras.layers.Dense(32)

  def call(self, inputs):
    """Return a 32-unit user representation.

    `inputs` is a dictionary with the keys 'user_id', 'age' and 'search_query'.
    """
    user_emb = self.user_embedding(inputs['user_id'])
    age = self.age_normalizer(inputs['age'])

    # Incorporate search history
    search_seq = self.search_vectorization(inputs['search_query'])
    search_emb = self.search_embedding(search_seq)

    # Average over real tokens only. Token index 0 is the padding added by the
    # vectorizer; a plain `tf.reduce_mean` ignores the embedding's mask and
    # would let the padding positions dilute short queries.
    token_mask = tf.cast(tf.not_equal(search_seq, 0), search_emb.dtype)[:, :, tf.newaxis]
    # divide_no_nan yields zeros (not NaN) for a query with no tokens at all.
    search_emb = tf.math.divide_no_nan(
        tf.reduce_sum(search_emb * token_mask, axis=1),
        tf.reduce_sum(token_mask, axis=1))

    concatenated = tf.concat([user_emb, tf.reshape(age, (-1, 1)), search_emb], axis=1)
    return self.dense(concatenated)

  def adapt(self, data):
    """Fit the age normalizer and the query vocabulary on the dataset `data`."""
    age_data = data.map(lambda x: x['age'])
    self.age_normalizer.adapt(age_data)
    search_data = data.map(lambda x: x['search_query'])
    self.search_vectorization.adapt(search_data)


# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):
  """Two-tower retrieval model whose user tower receives the full feature dict."""

  def __init__(self, user_model, product_model, task):
    """Store the two towers and the task object used to compute the loss."""
    super().__init__()
    self.user_model, self.product_model = user_model, product_model
    self.task = task

  def compute_loss(self, features, training=False):
    """Embed the user from all of `features` and the product from its ID."""
    query_vectors = self.user_model(features)
    candidate_vectors = self.product_model(features['product_id'])
    return self.task(query_vectors, candidate_vectors)

# Instantiate and compile the model.
user_model = UserModelSearchData(unique_user_ids)
user_model.adapt(ds_ecommerce)

product_model = ProductModel(unique_product_ids)

# Calculate embeddings for all products.
product_embeddings = ds_products.map(product_model)

# Specify the task.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings)
)

model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [None]:
# Train for 1 epoch, feeding the transactions in batches of 1000 rows.
model.fit(ds_ecommerce.batch(1000), epochs=1)



<keras.callbacks.History at 0x7f4298647190>

In [None]:
# Set how many nearest products to retrieve
K = 3

# Get some recommendations.
user_id = tf.constant(["1"])  # user_id should be a string tensor
user_age = tf.constant([25])  # user_age should be a numeric tensor
user_search = tf.constant(['shirt'])
user_query = {"user_id": user_id, "age": user_age, "search_query": user_search}

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Add candidates to the index as (product ID, product embedding) pairs so that
# queries return product IDs; with embeddings alone the index returns the row
# position of each candidate instead.
product_embeddings_to_index = tf.data.Dataset.zip(
    (ds_products, ds_products.map(model.product_model)))
index.index_from_dataset(product_embeddings_to_index)

# Pass user_id, age and search_query together in one dictionary. Only K
# results are requested, so this also works when the catalogue holds fewer
# than 100 products.
_, products = index(user_query, k=K)
print(f"Top {K} recommendations for user {user_id}: {products[0, :K]}")

Top 5 recommendations for user [b'1']: [2714 1219 1309 2425  377]


# Module 04 - Fast Retrieval

In [None]:
# Synthetic large candidate set: the product batches repeated 100 times,
# embedded, and scaled element-wise by uniform random factors.
lots_of_embeddings = (
    ds_products
      .repeat(100)
      .map(lambda id: model.product_model(id))
      .map(lambda x: x * tf.random.uniform(tf.shape(x)))
      # Cache the result: the random factors are otherwise re-sampled on every
      # pass over the dataset, so each index built from it would be built from
      # different embeddings.
      .cache()
)

In [None]:
# Approximate retrieval with ScaNN: 100 leaves (clusters), of which the
# 10 closest to the query are searched.
scann_index = tfrs.layers.factorized_top_k.ScaNN(
    query_model=model.user_model,
    num_leaves=100,
    num_leaves_to_search=10,
)

# Index the synthetic embedding set. No identifiers are supplied, so the
# returned values are positions within that set.
scann_index.index_from_dataset(lots_of_embeddings)

# Query with the user_query dictionary, ask for 100 neighbours, show the first K.
_, products = scann_index(user_query, k=100)
print(f"Top {K} recommendations for user {user_id}: {products[0, :K]}")

Top 5 recommendations for user [b'1']: [  7219 187309 224714 131714 224425]


In [None]:
# Time a single-user query for the top 3 results (1000 loops per timing run).
%timeit -n 1000 _, products = scann_index(user_query, k=3)

7.36 ms ± 275 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Fine-Tune ScaNN

In [None]:
scann_index_v2 = tfrs.layers.factorized_top_k.ScaNN(
    model.user_model,
    # Number of leaves (clusters)
    num_leaves=100,
    # Search only the 3 clusters whose centroids are closest to the query
    # (the first ScaNN index above searches 10)
    num_leaves_to_search=3,
    # Optional re-ordering step, currently disabled: take the top N candidates
    # by approximate distance (which uses the distance between the centroid
    # and the query to approximate the query-to-neighbor distance), then sort
    # those N by the exact query-to-neighbor distance.
    # NOTE(review): the original note described N=100 but the disabled
    # argument below uses 10 - confirm the intended value before enabling.
    # num_reordering_candidates=10
)

# Add candidates in the index (the same synthetic embedding set as above)
# product_embeddings_to_index = ds_products.map(lambda id: model.product_model(id))
scann_index_v2.index_from_dataset(lots_of_embeddings)

# Query with the user_query dictionary (user_id, age and search_query),
# ask for 100 neighbours and show the first K.
_, products = scann_index_v2(user_query, k=100)
print(f"Top {K} recommendations for user {user_id}: {products[0, :K]}")

Top 5 recommendations for user [b'1']: [ 74714  47466 180490  92714 101466]


In [None]:
# Time the same top-3 query against the tuned index (1000 loops per timing run).
%timeit -n 1000 _, products = scann_index_v2(user_query, k=3)