In [2]:
!pip install tensorflow_recommenders



In [3]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

from google.colab import drive
from pandas.api.types import is_numeric_dtype
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns

Data setup

In [4]:
# Mount Google Drive inside the Colab VM, then copy the HDF5 dataset from
# Drive to /content/.
# NOTE(review): `-d` is placed after the source path; for GNU cp it means
# "no-dereference / preserve links", not "destination" — confirm it is intended.
drive.mount('/content/drive')
!cp '/content/drive/MyDrive/Machine Learning/recommender/KaDo.h5' -d '/content/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Location of the HDF5 dataset on the Colab VM (the copy taken from Google Drive).
DATASET_HDF5_PATH = '/content/KaDo.h5'

In [6]:
# Load the dataset into a DataFrame.
# No `key` is passed — presumably the file stores a single pandas object; TODO confirm.
df = pd.read_hdf(DATASET_HDF5_PATH)

In [7]:
# Preview the first rows to sanity-check the columns and their values.
df.head()

Unnamed: 0,TICKET_ID,MOIS_VENTE,PRIX_NET,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID
0,35592159,10,1.67,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,1490281
1,35592159,10,1.66,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,1490281
2,35592159,10,7.45,SOINS DU VISAGE,VIS_CJOUR Jeunes Specifique,VIS_JEUNE_ET_LEVRE,CR JR PARF BIO.SPE AC.SENT.50ML,1490281
3,35592159,10,5.95,SOINS DU VISAGE,VIS_DEMAQ AAAR,VIS_AAAR_DEMAQLOTION,EAU MICELLAIRE 3 THES FL200ML,1490281
4,35592159,10,1.67,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 TIARE FL 200ML,1490281


In [8]:
# Inspect the column dtypes; the product descriptors (FAMILLE, UNIVERS,
# MAILLE, LIBELLE) are stored as pandas categoricals.
df.dtypes

TICKET_ID       uint32
MOIS_VENTE       uint8
PRIX_NET       float32
FAMILLE       category
UNIVERS       category
MAILLE        category
LIBELLE       category
CLI_ID          uint32
dtype: object

In [9]:
# Count the distinct (family, universe, mesh, label) combinations present
# in the data.
product_columns = ['FAMILLE', 'UNIVERS', 'MAILLE', 'LIBELLE']
unique_products = df[product_columns].drop_duplicates()
unique_products.count()

FAMILLE    1484
UNIVERS    1484
MAILLE     1484
LIBELLE    1484
dtype: int64

In [10]:
# The product descriptors are categoricals; convert each one to plain
# Python strings (they are later fed to StringLookup layers).
for column in ('FAMILLE', 'UNIVERS', 'MAILLE', 'LIBELLE'):
    df[column] = df[column].astype(str)

In [11]:
def _adapted_integer_lookup(values):
    """Return an IntegerLookup layer whose vocabulary is learned from `values`."""
    lookup = tf.keras.layers.IntegerLookup()
    lookup.adapt(values)
    return lookup


# Vocabulary layers for the integer-valued query features.
client_id_lookup = _adapted_integer_lookup(df['CLI_ID'].unique())
month_lookup = _adapted_integer_lookup(df['MOIS_VENTE'].unique())

In [12]:
def _adapted_string_lookup(values):
    """Return a StringLookup layer whose vocabulary is learned from `values`."""
    lookup = tf.keras.layers.StringLookup()
    lookup.adapt(values)
    return lookup


# One vocabulary layer per textual product descriptor.
product_family_lookup = _adapted_string_lookup(df['FAMILLE'].unique())
product_universe_lookup = _adapted_string_lookup(df['UNIVERS'].unique())
product_mesh_lookup = _adapted_string_lookup(df['MAILLE'].unique())
product_label_lookup = _adapted_string_lookup(df['LIBELLE'].unique())



In [13]:
# Show the first few entries of the product-label vocabulary as a sanity check.
first_label_tokens = product_label_lookup.get_vocabulary()[:4]
print(f"Vocabulary: {first_label_tokens}")

Vocabulary: ['[UNK]', 'x99SS GELEE FRUITS VANIL PN2 10ML', 'ZERO TRACE SPRAY SPF 30 150ML', 'ZERO TRACE SPRAY SPF 15 FL150']


In [14]:
class ClientModel(tf.keras.Model):
    """Query tower: embeds the client id and the month of sale.

    Each feature goes through its lookup layer and a 64-dimensional
    embedding table; the two embeddings are concatenated along axis 1,
    giving 128 values per example.
    """

    def __init__(self):
        super().__init__()

        # Embedding tables are sized from the adapted lookup vocabularies.
        client_vocab_size = client_id_lookup.vocabulary_size()
        month_vocab_size = month_lookup.vocabulary_size()

        self.client_embedding = tf.keras.Sequential([
            client_id_lookup,
            tf.keras.layers.Embedding(client_vocab_size, 64),
        ])

        self.month_embedding = tf.keras.Sequential([
            month_lookup,
            tf.keras.layers.Embedding(month_vocab_size, 64)
        ])

    def call(self, inputs):
        """Embed the "CLI_ID" and "MOIS_VENTE" entries of `inputs` and concatenate them."""
        client_vectors = self.client_embedding(inputs["CLI_ID"])
        month_vectors = self.month_embedding(inputs["MOIS_VENTE"])
        return tf.concat([client_vectors, month_vectors], axis=1)

In [15]:
class ProductModel(tf.keras.Model):
    """Candidate tower: embeds a product from its four textual descriptors.

    Label, family, universe and mesh each go through their own lookup layer
    and a 32-dimensional embedding table. The four embeddings are
    concatenated along axis 1, giving 4 x 32 = 128 values per product —
    the same width ClientModel produces (2 x 64).
    """

    def __init__(self):
        super().__init__()

        # Each embedding table is sized from the vocabulary of its adapted
        # lookup layer, so no separate token cap is needed here.
        self.label_embedding = tf.keras.Sequential([
            product_label_lookup,
            tf.keras.layers.Embedding(product_label_lookup.vocabulary_size(), 32)
        ])

        self.family_embedding = tf.keras.Sequential([
            product_family_lookup,
            tf.keras.layers.Embedding(product_family_lookup.vocabulary_size(), 32)
        ])

        self.universe_embedding = tf.keras.Sequential([
            product_universe_lookup,
            tf.keras.layers.Embedding(product_universe_lookup.vocabulary_size(), 32)
        ])

        self.mesh_embedding = tf.keras.Sequential([
            product_mesh_lookup,
            tf.keras.layers.Embedding(product_mesh_lookup.vocabulary_size(), 32)
        ])

    def call(self, inputs):
        """Embed the 'LIBELLE', 'FAMILLE', 'UNIVERS' and 'MAILLE' entries of `inputs`.

        Returns the four embeddings concatenated along axis 1.
        """
        return tf.concat([
            self.label_embedding(inputs['LIBELLE']),
            self.family_embedding(inputs['FAMILLE']),
            self.universe_embedding(inputs['UNIVERS']),
            self.mesh_embedding(inputs['MAILLE']),
        ], axis=1)

In [16]:
def _frame_to_dataset(frame):
    """Build a tf.data.Dataset yielding one {column: value} dict per row of `frame`."""
    return tf.data.Dataset.from_tensor_slices(dict(frame))


# Distinct products, described by their four textual columns.
product_df = df[['FAMILLE', 'UNIVERS', 'MAILLE', 'LIBELLE']].drop_duplicates()
product_ds = _frame_to_dataset(product_df)

# Every row of the full frame. Despite the `_df` suffix, this is a
# tf.data.Dataset, not a DataFrame.
purchase_history_df = _frame_to_dataset(df)

In [17]:
class ProductRecommenderModel(tfrs.Model):
  """Two-tower retrieval model: a client (query) tower and a product (candidate) tower.

  Args:
    client_model: Keras model mapping a feature dict to query embeddings.
    product_model: Keras model mapping a feature dict to candidate embeddings.
  """

  def __init__(self, client_model, product_model):
    super().__init__()
    self.client_model: tf.keras.Model = client_model
    self.product_model: tf.keras.Model = product_model
    # The top-K metrics rank against the embeddings of the notebook-level
    # `product_ds`, computed with this model's product tower.
    self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=product_ds.batch(128).map(self.product_model)
        )
    )

  def compute_loss(self, features, training=False) -> tf.Tensor:
    """Return the retrieval loss for one batch of purchase rows.

    Args:
      features: Feature dict; it is passed unchanged to both towers, and
        each tower picks out the keys it needs.
      training: True when called from the training step. The top-K metrics
        are skipped in that case, because scoring every batch against the
        full candidate set is expensive; they are still computed during
        evaluation.

    Returns:
      The loss tensor produced by the retrieval task.
    """
    # Query embeddings from the client tower.
    client_embeddings = self.client_model(features)
    # Candidate embeddings from the product tower.
    product_embeddings = self.product_model(features)

    # The task computes the loss, and the metrics when not training.
    return self.task(
        client_embeddings,
        product_embeddings,
        compute_metrics=not training,
    )

In [18]:
SEED = 42
TRAIN_SIZE = 6_000_000
TEST_SIZE = 2_000_000

tf.random.set_seed(SEED)

# Shuffle once with a fixed seed. `reshuffle_each_iteration=False` keeps the
# order stable across iterations so the take/skip split below stays disjoint.
shuffled = purchase_history_df.shuffle(
    TRAIN_SIZE + TEST_SIZE,
    seed=SEED,
    reshuffle_each_iteration=False,
)

train = shuffled.take(TRAIN_SIZE)
test = shuffled.skip(TRAIN_SIZE).take(TEST_SIZE)

In [19]:
# Instantiate both towers, wire them into the retrieval model and compile it.
client_model, product_model = ClientModel(), ProductModel()
model = ProductRecommenderModel(
    client_model=client_model,
    product_model=product_model,
)

adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

In [20]:
# Batched, cached input pipelines for training and evaluation.
# NOTE(review): cache() comes after shuffle(), so the order produced on the
# first pass is what gets cached and replayed on later epochs — confirm intended.
cached_train = (
    train
    .shuffle(6_000_000)
    .batch(512)
    .cache()
)
cached_test = (
    test
    .batch(4096)
    .cache()
)

In [21]:
# Train for three epochs. The returned History object is bound to a name so
# the per-epoch values stay available in `history.history`, instead of being
# discarded as a memory-address repr in the cell output.
history = model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f0a2f0baed0>

In [None]:
# Evaluate on the held-out split; `return_dict=True` returns the results
# as a {name: value} dict rather than a list.
model.evaluate(cached_test, return_dict=True)