# Import Libraries

In [None]:
!pip install -Uq tensorflow-recommenders
!pip install -Uq tensorflow-datasets

In [None]:
import os
import pprint
import tempfile

from typing import Dict, Text

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Data Preparation

In [None]:
# Read the two CSV inputs into pandas DataFrames.
df_ecommerce = pd.read_csv(filepath_or_buffer='ecommerce.csv')  # transactions
df_products = pd.read_csv(filepath_or_buffer='products.csv')    # products

In [None]:
# Wrap each DataFrame in a tf.data.Dataset; every element is a dict keyed by
# column name, holding one row's values.
ds_ecommerce = tf.data.Dataset.from_tensor_slices(
    {column: values for column, values in df_ecommerce.items()}
)
ds_products = tf.data.Dataset.from_tensor_slices(
    {column: values for column, values in df_products.items()}
)

In [None]:
# Keep only the features used by the models. Both ID columns are converted to
# strings; age and search_query are passed through unchanged.
# NOTE(review): this rebinds ds_ecommerce / ds_products to their mapped
# versions, so the cell is meant to run once after the conversion cell.
ds_ecommerce = ds_ecommerce.map(
    lambda row: {
        'user_id': tf.strings.as_string(row['user_id']),
        'product_id': tf.strings.as_string(row['product_id']),
        'age': row['age'],
        'search_query': row['search_query'],
    }
)

# Products become batches of 32 stringified product IDs.
ds_products = ds_products.batch(32).map(
    lambda batch: tf.strings.as_string(batch['product_id'])
)

# Vocabulary of user IDs: pull the column out in large batches, then
# concatenate and de-duplicate with NumPy.
user_ids = ds_ecommerce.batch(1_000_000).map(lambda batch: batch["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_ids))).astype(str)

# Vocabulary of product IDs, built the same way. Note that it is derived from
# the transactions (ds_ecommerce), not from ds_products.
product_ids = ds_ecommerce.batch(1_000_000).map(lambda batch: batch["product_id"])
unique_product_ids = np.unique(np.concatenate(list(product_ids))).astype(str)

# Module 01 - Two-Towers (IDs Only)

In [None]:
# User and Product models.
class UserModel(tf.keras.Model):
  """User tower: turns string user IDs into dense embedding vectors.

  Args:
    unique_user_ids: Vocabulary of known user IDs handed to the lookup layer.
    embedding_dimension: Width of each embedding vector. Defaults to 32.
  """

  def __init__(self, unique_user_ids, embedding_dimension=32):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
      # Stable alias of the deprecated `experimental.preprocessing` path.
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      # +1 leaves room for the lookup layer's out-of-vocabulary index.
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
    ])

  def call(self, inputs):
    """Returns the embeddings for the given user ID strings."""
    return self.user_embedding(inputs)

class ProductModel(tf.keras.Model):
  """Product tower: turns string product IDs into dense embedding vectors.

  Args:
    unique_product_ids: Vocabulary of known product IDs handed to the lookup
      layer.
    embedding_dimension: Width of each embedding vector. Defaults to 32.
  """

  def __init__(self, unique_product_ids, embedding_dimension=32):
    super().__init__()

    self.product_embedding = tf.keras.Sequential([
      # Stable alias of the deprecated `experimental.preprocessing` path.
      tf.keras.layers.StringLookup(
        vocabulary=unique_product_ids, mask_token=None),
      # +1 leaves room for the lookup layer's out-of-vocabulary index.
      tf.keras.layers.Embedding(len(unique_product_ids) + 1, embedding_dimension)
    ])

  def call(self, inputs):
    """Returns the embeddings for the given product ID strings."""
    return self.product_embedding(inputs)

# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):
  """Retrieval model that pairs a user tower with a product tower.

  Args:
    user_model: Model applied to the "user_id" feature.
    product_model: Model applied to the "product_id" feature.
    task: Callable that receives (user embeddings, product embeddings) and
      returns the loss.
  """

  def __init__(self, user_model, product_model, task):
    super().__init__()
    self.user_model = user_model
    self.product_model = product_model
    self.task = task

  def compute_loss(self, features, training=False):
    """Embeds both sides of each interaction and returns the task's loss.

    `features` must provide "user_id" and "product_id". The `training` flag
    is accepted but not used by this implementation.
    """
    query_vectors = self.user_model(features["user_id"])
    candidate_vectors = self.product_model(features["product_id"])
    return self.task(query_vectors, candidate_vectors)

# Build the two towers from the ID vocabularies.
user_model = UserModel(unique_user_ids)
product_model = ProductModel(unique_product_ids)

# Candidate set for the top-K metric: every product ID in the vocabulary,
# embedded by the product tower in batches of 128.
product_embeddings = (
    tf.data.Dataset.from_tensor_slices(unique_product_ids)
    .batch(128)
    .map(product_model)
)

# Retrieval task evaluated with a factorized top-K metric over those candidates.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings),
)

# Assemble the two-tower model and compile it with Adagrad.
model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
# Train for 1 epoch on batches of 50 transactions.
model.fit(ds_ecommerce.batch(50), epochs=1)



<keras.callbacks.History at 0x7f442eb06f20>

In [None]:
K = 3

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Populate the index with (product ID, embedding) pairs from the product model.
# Supplying the identifiers makes the index return product IDs; with embeddings
# alone it returns row positions in the candidate dataset instead.
# ds_products is already batched (32 IDs per batch), so this runs batch-wise.
product_embeddings_to_index = ds_products.map(
    lambda id_batch: (id_batch, model.product_model(id_batch)))
index.index_from_dataset(product_embeddings_to_index)

# Get some recommendations.
user_id = "1"
# Avoid binding `_`, which IPython uses for the previous cell output.
_scores, products = index(np.array([user_id]))
print(f"Top {K} recommendations for user {user_id}: {products[0, :K]}")

Top 3 recommendations for user 1: [390  56 217]


# Module 02 - Two-Towers w/ Meta Data

In [None]:
# User and Product models.
class ProductModel(tf.keras.Model):
  """Product tower: turns string product IDs into dense embedding vectors.

  Args:
    unique_product_ids: Vocabulary of known product IDs handed to the lookup
      layer.
    embedding_dimension: Width of each embedding vector. Defaults to 32.
  """

  def __init__(self, unique_product_ids, embedding_dimension=32):
    super().__init__()

    self.product_embedding = tf.keras.Sequential([
      # Stable alias of the deprecated `experimental.preprocessing` path.
      tf.keras.layers.StringLookup(
        vocabulary=unique_product_ids, mask_token=None),
      # +1 leaves room for the lookup layer's out-of-vocabulary index.
      tf.keras.layers.Embedding(len(unique_product_ids) + 1, embedding_dimension)
    ])

  def call(self, inputs):
    """Returns the embeddings for the given product ID strings."""
    return self.product_embedding(inputs)

class UserModelMetaData(tf.keras.Model):
  """User tower that combines a user-ID embedding with the user's age.

  Args:
    unique_user_id_list: Vocabulary of known user IDs handed to the lookup
      layer.
    embedding_dimension: Width of the user-ID embedding. Defaults to 32. The
      output of `call` has one additional column holding the age feature.
  """

  def __init__(self, unique_user_id_list, embedding_dimension=32):
    super().__init__()

    # User ID Embedding
    self.user_embedding = tf.keras.Sequential([
      # Stable alias of the deprecated `experimental.preprocessing` path.
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_id_list, mask_token=None),
      # +1 leaves room for the lookup layer's out-of-vocabulary index.
      tf.keras.layers.Embedding(len(unique_user_id_list) + 1, embedding_dimension),
    ])

    # User age (normalization statistics are fitted via `adapt`).
    self.age_normalizer = tf.keras.layers.Normalization(axis=None)

  def call(self, inputs):
    """Embeds `inputs['user_id']` and appends the normalized `inputs['age']`."""
    user_emb = self.user_embedding(inputs['user_id'])
    age = self.age_normalizer(inputs['age'])
    # Reshape age into a single column so it can be concatenated to the
    # embedding along the feature axis.
    return tf.concat([user_emb, tf.reshape(age, (-1, 1))], axis=1)

  def adapt(self, data):
    """Fits the age normalizer on the 'age' feature of `data`.

    `data` is a dataset whose elements are dicts containing an 'age' entry.
    """
    age_data = data.map(lambda x: x['age'])
    self.age_normalizer.adapt(age_data)

# Define the two-tower model.
class TwoTowerModel(tfrs.models.Model):
  """Two-tower retrieval model whose user tower consumes user ID and age.

  Args:
    user_model: Model called with a dict holding 'user_id' and 'age'; its
      output is passed through an additional 32-unit dense layer.
    product_model: Model applied to the 'product_id' feature.
    task: Callable that receives (user embeddings, product embeddings) and
      returns the loss.
  """

  def __init__(self, user_model, product_model, task):
    super().__init__()
    # Dense projection stacked on top of the supplied user model.
    projection = tf.keras.layers.Dense(32)
    self.user_model = tf.keras.Sequential([user_model, projection])
    self.product_model = product_model
    self.task = task

  def compute_loss(self, features, training=False):
    """Embeds both sides of each interaction and returns the task's loss.

    Only 'user_id' and 'age' are forwarded to the user tower. The `training`
    flag is accepted but not used by this implementation.
    """
    user_inputs = {
        'user_id': features['user_id'],
        'age': features['age'],
    }
    query_vectors = self.user_model(user_inputs)
    candidate_vectors = self.product_model(features['product_id'])
    return self.task(query_vectors, candidate_vectors)

# The models need the ID vocabularies, so collect the unique user IDs ...
user_ids = ds_ecommerce.batch(1_000_000).map(lambda batch: batch["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_ids))).astype(str)

# ... and the unique product IDs seen in the transactions.
product_ids = ds_ecommerce.batch(1_000_000).map(lambda batch: batch["product_id"])
unique_product_ids = np.unique(np.concatenate(list(product_ids))).astype(str)

# Build the towers.
user_model = UserModelMetaData(unique_user_ids)
# NOTE(review): the `user_model.adapt(ds_ecommerce)` call is disabled here, so
# the age normalizer is never fitted to the data before training - confirm
# that this is intentional.
# user_model.adapt(ds_ecommerce)

product_model = ProductModel(unique_product_ids)

# Candidate set for the top-K metric: each batch of ds_products run through
# the product tower.
product_embeddings = ds_products.map(product_model)

# Retrieval task evaluated with a factorized top-K metric over those candidates.
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=product_embeddings),
)

# Assemble the two-tower model and compile it with Adagrad.
model = TwoTowerModel(user_model, product_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
# Train for 1 epoch on batches of 50 transactions.
model.fit(ds_ecommerce.batch(50), epochs=1)





<keras.callbacks.History at 0x7f4427ccdba0>

In [None]:
K = 3

# Get some recommendations.
user_id = tf.constant(["1"])  # user_id should be a string tensor
user_age = tf.constant([25])  # user_age should be a numeric tensor

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=K)

# Add candidates to the index as (product ID, embedding) pairs. Supplying the
# identifiers makes the index return product IDs; with embeddings alone it
# returns row positions in the candidate dataset instead.
product_embeddings_to_index = ds_products.map(
    lambda id_batch: (id_batch, model.product_model(id_batch)))
index.index_from_dataset(product_embeddings_to_index)

# You need to pass the user_id and age in a dictionary.
# Avoid binding `_`, which IPython uses for the previous cell output.
_scores, products = index({"user_id": user_id, "age": user_age})
print(f"Top {K} recommendations for user {user_id}: {products[0, :K]}")



Top 3 recommendations for user [b'1']: [ 50 282 177]
