# Entraînement des modèles

- Ce notebook prépare les données pour l'inférence en enregistrant les features dans le feature store et les embeddings des items dans faiss.
- This notebook prepares the data for inference: features are saved in the feature store and item embeddings are stored in faiss.
- On entraîne également les modèles de retrieval et de scoring pour l'inférence.

In [1]:
import os
import gc
import glob
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"

import nvtabular as nvt
from nvtabular.ops import *
import numpy as np

from merlin.models.utils.example_utils import workflow_fit_transform
from merlin.schema.tags import Tags
from merlin.models.utils.dataset import unique_rows_by_features

import merlin.models.tf as mm
from merlin.io.dataset import Dataset
import tensorflow as tf

2024-09-02 12:37:02.504438: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 12:37:02.563192: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.




  warn(f"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}")


[SOK INFO] Import /usr/local/lib/python3.10/dist-packages/merlin_sok-2.0.0-py3.10-linux-x86_64.egg/sparse_operation_kit/lib/libsparse_operation_kit.so
[SOK INFO] Initialize finished, communication tool: horovod


2024-09-02 12:37:05.968085: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0
2024-09-02 12:37:05.968293: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 40503 MB memory:  -> device: 0, name: NVIDIA H100 80GB HBM3, pci bus id: 0000:e4:00.0, compute capability: 9.0
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Globally suppress log records of severity WARNING and below (DEBUG, INFO and
# WARNING); only ERROR and CRITICAL records are still emitted.
import logging
logging.disable(logging.WARNING)

In [3]:
# Folder configuration. Each value is read from the environment variable named
# in its os.environ.get call and falls back to the literal default next to it.
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/root/Data/Row/")
DATA_FOLDER = os.environ.get("DATA_FOLDER", "/root/Data/")
# Note: the environment variable is named "MODELS", not "MODELS_FOLDER".
MODELS_FOLDER = os.environ.get("MODELS", "/root/Models/")
PROCESSED_FOLDER = os.environ.get("PROCESSED_FOLDER", "/root/Data/Processed/")
# Note: this value is reassigned from DATA_FOLDER in the Feast preparation cell.
feature_repo_path = os.environ.get("FEAST_PATH", "/root/Data/feast_repo/feature_repo")

# Batch size taken from the environment (default 512).
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 512))
# Dataframe module returned by Merlin's dispatcher (cudf in the recorded output);
# the bare expression on the last line displays it.
from merlin.core.dispatch import get_lib
df_lib = get_lib()
df_lib

<module 'cudf' from '/usr/local/lib/python3.10/dist-packages/cudf/__init__.py'>

## Chargement des datasets issus du notebook 01

In [4]:
# Open the train and test parquet files found under PROCESSED_FOLDER as Merlin
# Datasets, using part_size="128MB".
train_numeric = Dataset(os.path.join(PROCESSED_FOLDER, "train_numeric", "*.parquet"), part_size="128MB")
test_numeric = Dataset(os.path.join(PROCESSED_FOLDER, "test_numeric", "*.parquet"), part_size="128MB")
#dataset = Dataset(os.path.join(PROCESSED_FOLDER, "dataset_numeric", "*.parquet"))



In [6]:
# Materialise each split and dump it as CSV under DATA_FOLDER/Plot.
# The target file names carry no ".csv" extension.
csv_exports = (
    ("Plot/train_numeric", train_numeric),
    ("Plot/test_numeric", test_numeric),
)
for csv_relative_path, numeric_split in csv_exports:
    numeric_split.compute().to_csv(os.path.join(DATA_FOLDER, csv_relative_path))

## Export des features clients et produits

### Préparation du Feast feature store

In [2]:
# for running this example on GPU, install the following libraries
#%pip install "feast==0.31" faiss-gpu

# for running this example on CPU, uncomment the following lines
# %pip install tensorflow-cpu "feast==0.31" faiss-cpu
# %pip uninstall cudf

In [20]:
# Optional reset, kept disabled: removes an existing feast_repo folder.
#!rm -rf $DATA_FOLDER/feast_repo
# Create a new Feast repository named "feast_repo" inside DATA_FOLDER
# ($DATA_FOLDER is expanded by IPython before the shell command runs).
!cd $DATA_FOLDER && feast init feast_repo

  from distutils.dir_util import copy_tree

Creating a new Feast repository in [1m[32m/root/Data/feast_repo[0m.



In [21]:
# Point feature_repo_path at the feature_repo folder of the Feast repository,
# then delete example_repo.py and data/driver_stats.parquet when they exist
# (presumably the demo files generated by `feast init`).
feature_repo_path = os.path.join(DATA_FOLDER, "feast_repo/feature_repo")
for scaffold_file in (
    f"{feature_repo_path}/example_repo.py",
    f"{feature_repo_path}/data/driver_stats.parquet",
):
    if os.path.exists(scaffold_file):
        os.remove(scaffold_file)

### Extraction des features

In [22]:
from datetime import datetime


def _export_unique_features(dataset, feature_tag, id_tag, file_name):
    """Extract one row per entity and store it as parquet in the Feast repo.

    Args:
        dataset: Merlin Dataset the rows are taken from.
        feature_tag: tag selecting the feature columns (e.g. ``Tags.USER``).
        id_tag: tag selecting the entity id column (e.g. ``Tags.USER_ID``).
        file_name: parquet file name written under ``feature_repo_path/data``.

    Returns:
        The computed dataframe, including the two timestamp columns.
    """
    features = (
        unique_rows_by_features(dataset, feature_tag, id_tag)
        .compute()
        .reset_index(drop=True)
    )
    # "datetime" and "created" are the timestamp columns referenced by the
    # generated Feast FileSource definitions; each gets its own now() value.
    for timestamp_column in ("datetime", "created"):
        features[timestamp_column] = datetime.now()
        features[timestamp_column] = features[timestamp_column].astype("datetime64[ns]")

    features.to_parquet(os.path.join(feature_repo_path, "data", file_name))
    return features


# Unique rows for users, then for items, both taken from the training split.
user_features = _export_unique_features(
    train_numeric, Tags.USER, Tags.USER_ID, "user_features.parquet"
)
item_features = _export_unique_features(
    train_numeric, Tags.ITEM, Tags.ITEM_ID, "item_features.parquet"
)

item_features.head()

Unnamed: 0,item_id,product_code,prod_name,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,detail_desc,count_30d_purchased,count_7d_purchased,Time_Weighted_Purchased,datetime,created
0,108775044,108775,7015,253,3,1010016,10,3,9,1676,3,1,16,1002,6407,20,8,15.539194,2024-08-30 11:45:29.605883,2024-08-30 11:45:29.606841
1,111565001,111565,2190,304,9,1010016,9,4,5,3608,5,1,62,1021,2116,73,32,18.450956,2024-08-30 11:45:29.605883,2024-08-30 11:45:29.606841
2,111586001,111586,1088,273,4,1010016,9,4,5,3608,5,1,62,1021,1086,125,82,44.397659,2024-08-30 11:45:29.605883,2024-08-30 11:45:29.606841
3,111593001,111593,1157,304,9,1010016,9,4,5,3608,5,1,62,1021,1150,140,97,40.496376,2024-08-30 11:45:29.605883,2024-08-30 11:45:29.606841
4,111609001,111609,3122,304,9,1010016,9,4,5,3608,5,1,62,1021,2971,58,31,10.355453,2024-08-30 11:45:29.605883,2024-08-30 11:45:29.606841


In [23]:
# Preview the first two rows of the exported user features.
user_features.head(2)

Unnamed: 0,user_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,recency,frequency,amount,...,popular_department_no,2nd_popular_department_no,popular_section_no,2nd_popular_section_no,last_product_code,2nd_last_product_code,last_product_type,2nd_last_product_type,datetime,created
0,3,3,3,3,3,21,5,75,0.17,0.018846,...,1313,7854,11,48,618287,794389,265,265,2024-08-30 11:45:29.404242,2024-08-30 11:45:29.440340
1,4,4,4,3,4,24,30,20,1.16,0.019971,...,1322,1343,15,53,829017,764895,265,265,2024-08-30 11:45:29.404242,2024-08-30 11:45:29.440340


### Définition des features pour Feast

In [24]:
# Show the item column dtypes (reference for the Feast field types declared next).
item_features.dtypes

item_id                                int32
product_code                           int32
prod_name                              int32
product_type_no                        int32
product_group_name                     int32
graphical_appearance_no                int32
colour_group_code                      int32
perceived_colour_value_id              int32
perceived_colour_master_id             int32
department_no                          int32
index_code                             int32
index_group_no                         int32
section_no                             int32
garment_group_no                       int32
detail_desc                            int32
count_30d_purchased                    int32
count_7d_purchased                     int32
Time_Weighted_Purchased              float32
datetime                      datetime64[ns]
created                       datetime64[ns]
dtype: object

In [25]:
# Generate item_features.py in the Feast repository. The module declares the
# parquet FileSource, the "item_id" entity and the FeatureView listing every
# item column with its Feast dtype. The "{}" placeholder receives the path of
# data/item_features.parquet.
# The context manager guarantees the file is closed even if write() raises.
with open(os.path.join(feature_repo_path, "item_features.py"), "w") as file:
    file.write(
        """
from datetime import timedelta
from feast import Entity, Field, FeatureView, ValueType
from feast.types import Int32, Float32
from feast.infra.offline_stores.file_source import FileSource

item_features = FileSource(
    path="{}",
    timestamp_field="datetime",
    created_timestamp_column="created",
)

item = Entity(name="item_id", value_type=ValueType.INT32, join_keys=["item_id"],)

item_features_view = FeatureView(
    name="item_features",
    entities=[item],
    ttl=timedelta(0),
    schema=[
        Field(name="product_code", dtype=Int32),
        Field(name="prod_name", dtype=Int32),
        Field(name="product_type_no", dtype=Int32),
        Field(name="product_group_name", dtype=Int32),
        Field(name="graphical_appearance_no", dtype=Int32),
        Field(name="colour_group_code", dtype=Int32),
        Field(name="perceived_colour_value_id", dtype=Int32),
        Field(name="perceived_colour_master_id", dtype=Int32),
        Field(name="department_no", dtype=Int32),
        Field(name="index_code", dtype=Int32),
        Field(name="index_group_no", dtype=Int32),
    	Field(name="section_no", dtype=Int32),
    	Field(name="garment_group_no", dtype=Int32),
        Field(name="detail_desc", dtype=Int32),
        Field(name="count_30d_purchased", dtype=Int32),
        Field(name="count_7d_purchased", dtype=Int32),
        Field(name="Time_Weighted_Purchased", dtype=Float32),
    ],
    online=True,
    source=item_features,
    tags=dict(),
)
""".format(
            os.path.join(feature_repo_path, "data/", "item_features.parquet")
        )
    )

In [26]:
# Show the user column dtypes (reference for the Feast field types declared next).
user_features.dtypes

user_id                               int32
FN                                    int32
Active                                int32
club_member_status                    int32
fashion_news_frequency                int32
age                                   int32
postal_code                           int32
recency                               int32
frequency                           float32
amount                              float32
popular_product_type                  int32
2nd_popular_product_type              int32
popular_department_no                 int32
2nd_popular_department_no             int32
popular_section_no                    int32
2nd_popular_section_no                int32
last_product_code                     int32
2nd_last_product_code                 int32
last_product_type                     int32
2nd_last_product_type                 int32
datetime                     datetime64[ns]
created                      datetime64[ns]
dtype: object

In [27]:
# Generate user_features.py in the Feast repository. The module declares the
# parquet FileSource, the "user_id" entity and the FeatureView listing every
# user column with its Feast dtype. The "{}" placeholder receives the path of
# data/user_features.parquet.
# The context manager guarantees the file is closed even if write() raises.
with open(os.path.join(feature_repo_path, "user_features.py"), "w") as f:
    f.write(
        """
from datetime import timedelta
from feast import Entity, Field, FeatureView, ValueType
from feast.types import Int32, Float32
from feast.infra.offline_stores.file_source import FileSource

user_features = FileSource(
    path="{}",
    timestamp_field="datetime",
    created_timestamp_column="created",
)

user = Entity(name="user_id", value_type=ValueType.INT32, join_keys=["user_id"],)

user_features_view = FeatureView(
    name="user_features",
    entities=[user],
    ttl=timedelta(0),
    schema=[
        Field(name="FN", dtype=Int32),
        Field(name="Active", dtype=Int32),
        Field(name="club_member_status", dtype=Int32),
        Field(name="fashion_news_frequency", dtype=Int32),
        Field(name="age", dtype=Int32),
        Field(name="postal_code", dtype=Int32),
        Field(name="recency", dtype=Int32),
        Field(name="frequency", dtype=Float32),
        Field(name="amount", dtype=Float32),
        Field(name="popular_product_type", dtype=Int32),
        Field(name="2nd_popular_product_type", dtype=Int32),
        Field(name="popular_department_no", dtype=Int32),
        Field(name="2nd_popular_department_no", dtype=Int32),
        Field(name="popular_section_no", dtype=Int32),
        Field(name="2nd_popular_section_no", dtype=Int32),
        Field(name="last_product_code", dtype=Int32),
        Field(name="2nd_last_product_code", dtype=Int32),
        Field(name="last_product_type", dtype=Int32),
        Field(name="2nd_last_product_type", dtype=Int32),
    ],
    online=True,
    source=user_features,
    tags=dict(),
)
""".format(
            os.path.join(feature_repo_path, "data/", "user_features.parquet")
        )
    )

## Pre-processing avec Nvtabular

Définition des features et des workflows

# Item columns for the NVTabular workflow: here the purchase counters are
# treated as categorical and only the time-weighted score is continuous.
cat_item_features = [
    "product_code",
    "prod_name",
    "product_type_no",
    "graphical_appearance_no",
    "colour_group_code",
    "product_group_name",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
    "detail_desc",
    "count_30d_purchased",
    "count_7d_purchased",
]
float_item_features = ["Time_Weighted_Purchased"]

# User columns: here age and recency are treated as categorical.
cat_user_features = [
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "age",
    "postal_code",
    "popular_product_type",
    "2nd_popular_product_type",
    "popular_department_no",
    "2nd_popular_department_no",
    "popular_section_no",
    "2nd_popular_section_no",
    "last_product_code",
    "2nd_last_product_code",
    "last_product_type",
    "2nd_last_product_type",
    "recency",
]
float_user_features = ["frequency", "amount"]

In [4]:
# Item columns for the NVTabular workflow, split into categorical columns
# and continuous columns.
cat_item_features = [
    "product_code",
    "prod_name",
    "product_type_no",
    "graphical_appearance_no",
    "colour_group_code",
    "product_group_name",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
    "detail_desc",
]
float_item_features = [
    "Time_Weighted_Purchased",
    "count_7d_purchased",
    "count_30d_purchased",
]

# User columns, split the same way.
cat_user_features = [
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "postal_code",
    "popular_product_type",
    "2nd_popular_product_type",
    "popular_department_no",
    "2nd_popular_department_no",
    "popular_section_no",
    "2nd_popular_section_no",
    "last_product_code",
    "2nd_last_product_code",
    "last_product_type",
    "2nd_last_product_type",
]
float_user_features = ["age", "frequency", "amount", "recency"]

In [5]:
#train_numeric.schema

In [10]:
%%time
from merlin.dag.ops.subgraph import Subgraph
category_temp_directory = os.path.join(DATA_FOLDER, "Processed/categories")


#Adding tags and categorify
user_id = ["user_id"] >> TagAsUserID() >> AddTags(tags='USER_ID')
user_features_cat = cat_user_features >> Categorify(dtype='int32', out_path=category_temp_directory) >> TagAsUserFeatures() >> AddTags(tags='CATEGORICAL')
user_features_float = float_user_features >> Normalize() >> ReduceDtypeSize() >> TagAsUserFeatures() >> AddTags(tags='CONTINUOUS')

item_id = ["item_id"] >> TagAsItemID() >> AddTags(tags='ITEM_ID')
item_features_cat = cat_item_features  >>  Categorify(dtype='int32', out_path=category_temp_directory) >> TagAsItemFeatures() >> AddTags(tags='CATEGORICAL')  
item_features_float = float_item_features >> Normalize() >> ReduceDtypeSize() >> TagAsItemFeatures() >> AddTags(tags='CONTINUOUS')

subgraph_item = Subgraph(
     "item", 
     Subgraph("item_id_sg", item_id) + 
    Subgraph("item_features_cat_sg", item_features_cat) + 
    Subgraph("item_features_float_sg", item_features_float))

subgraph_user = Subgraph(
     "user", 
     Subgraph("user_id_sg", user_id) + 
    Subgraph("user_features_cat_sg", user_features_cat) +
    Subgraph("user_features_float_sg", user_features_float))

targets = ["Target"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, "target"])

outputs = subgraph_item + subgraph_user + targets
#outputs = outputs >> Dropna()

# Create and run workflow
workflow = nvt.Workflow(outputs)


workflow.fit_transform(train_numeric).to_parquet(DATA_FOLDER+'/Processed/train_processed')
workflow.transform(test_numeric).to_parquet(DATA_FOLDER+'/Processed/test_processed')

CPU times: user 7.01 s, sys: 1.38 s, total: 8.39 s
Wall time: 10.6 s


Enregistrement du workflow pour Triton plus tard

In [11]:
# Persist the fitted workflow under MODELS_FOLDER/general_workflow.
workflow.save(os.path.join(MODELS_FOLDER, "general_workflow"))

On peut supprimer les datasets dont on n'a plus besoin

In [16]:
import shutil

# Delete the intermediate datasets that are no longer needed. Paths are built
# from PROCESSED_FOLDER (the root the datasets were loaded from) instead of a
# hard-coded absolute path, and folders that are already gone are skipped so
# the cell can be re-run without raising FileNotFoundError.
for obsolete_name in ("dataset_numeric", "train_numeric", "test_numeric"):
    obsolete_dir = os.path.join(PROCESSED_FOLDER, obsolete_name)
    if os.path.isdir(obsolete_dir):
        shutil.rmtree(obsolete_dir)

## Entraînement du modèle de Retrieval

In [5]:
# Reload the workflow outputs (train and test) from PROCESSED_FOLDER.
train_processed = Dataset(os.path.join(PROCESSED_FOLDER, "train_processed", "*.parquet"))
test_processed = Dataset(os.path.join(PROCESSED_FOLDER, "test_processed", "*.parquet"))

In [55]:
#train_processed.schema

On ne sélectionne que les interactions positives car pour le two-tower model, le negative sampling se fait en batch

In [14]:
# Keep only the rows whose Target equals 1, for every column of the processed
# schema, and write the filtered train/test splits for the two-tower model.
inputs = train_processed.schema.column_names
outputs = inputs >> Filter(f=lambda frame: frame["Target"] == 1)

nvt_wkflow = nvt.Workflow(outputs)
nvt_wkflow.fit(train_processed)

for processed_split, tt_folder in (
    (train_processed, "train_tt_processed"),
    (test_processed, "test_tt_processed"),
):
    filtered_split = nvt_wkflow.transform(processed_split)
    filtered_split.to_parquet(output_path=os.path.join(PROCESSED_FOLDER, tt_folder))

In [15]:
# Load the filtered (positive-only) splits with part_size="500MB".
train_tt = Dataset(os.path.join(PROCESSED_FOLDER, "train_tt_processed", "*.parquet"), part_size="500MB")
test_tt = Dataset(os.path.join(PROCESSED_FOLDER, "test_tt_processed", "*.parquet"), part_size="500MB")

In [16]:
# Display the column(s) tagged as item id in the two-tower training schema.
train_tt.schema.select_by_tag(Tags.ITEM_ID)

Unnamed: 0,name,tags,dtype,is_list,is_ragged
0,item_id,"(Tags.ITEM, Tags.ID)","DType(name='int32', element_type=<ElementType....",False,False


In [17]:
# Build a replacement ColumnSchema for "item_id" that carries an explicit
# value domain and is flagged as categorical.
#train_tt.schema.excluding_by_tag(Tags.ITEM_ID)

from merlin.schema.schema import Domain, ColumnSchema, Schema

# Fetch the current column definition
current_item_id_col = train_tt.schema['item_id']

# Create a new domain
new_domain = Domain(min=0, max=999999)  # Adjust these bounds as needed

# Create a new ColumnSchema instance that uses the new domain; dtype, list and
# ragged flags and existing properties are copied from the current column.
new_item_id_col = ColumnSchema(
    name="item_id",
    dtype=current_item_id_col.dtype,
    tags=current_item_id_col.tags + [Tags.ITEM_ID],  # Make sure Tags.ITEM_ID is present
    properties={
        **current_item_id_col.properties,
        "domain": new_domain.__dict__,
        "is_categorical": True
    },
    is_list=current_item_id_col.is_list,
    is_ragged=current_item_id_col.is_ragged
)


In [97]:
# Display the rebuilt item_id column schema.
new_item_id_col

ColumnSchema(name='item_id', tags={<Tags.ID: 'id'>, <Tags.ITEM: 'item'>}, properties={'domain': {'min': 0, 'max': 999999, 'name': None}, 'is_categorical': True}, dtype=DType(name='int32', element_type=<ElementType.Int: 'int'>, element_size=32, element_unit=None, signed=True, shape=Shape(dims=None)), is_list=False, is_ragged=False)

# NOTE(review): updated_schema is not referenced by any later cell visible in
# this notebook — confirm whether it should be assigned to train_tt/test_tt.
schema_without_id = train_tt.schema.excluding_by_tag(Tags.ITEM_ID)
# Build a new schema by dropping the old 'item_id' column and adding the new one
updated_schema = (schema_without_id + Schema([new_item_id_col])  # Adds the new 'item_id' column
)
updated_schema

In [19]:
# Restrict the schema to the columns tagged as item id, user id, item or user,
# and apply that same schema to both the train and the test split.
schema = train_tt.schema.select_by_tag(
    [Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]
)
train_tt.schema = test_tt.schema = schema

In [56]:
#schema

In [22]:
# Output size shared by both towers (last layer of each MLP below).
tower_dim = 64 

# create user schema using USER tag
user_schema = schema.select_by_tag(Tags.USER)
# create user (query) tower input block
user_inputs = mm.InputBlockV2(user_schema)

# create user (query) encoder block: MLP with layers [128, tower_dim] and no
# activation on the last layer
query = mm.Encoder(user_inputs, mm.MLPBlock([128, tower_dim], no_activation_last_layer=True))

# create item schema using ITEM tag
item_schema = schema.select_by_tag(Tags.ITEM)
# create item (candidate) tower input block
item_inputs = mm.InputBlockV2(item_schema)
# create item (candidate) encoder block, same architecture as the query tower
candidate = mm.Encoder(item_inputs, mm.MLPBlock([128, tower_dim], no_activation_last_layer=True))

In [42]:
# Assemble the two-tower retrieval model from the query and candidate encoders
# and compile it with Adam, categorical cross-entropy, Recall@10 and NDCG@10.
model_tt = mm.TwoTowerModelV2(query, candidate)
model_tt.compile(optimizer="adam", run_eagerly=False, loss="categorical_crossentropy", metrics=[mm.RecallAt(10), mm.NDCGAt(10)])

In [34]:
# Scratch calculation (the two-tower fit call uses 4096//4, not this value).
4096//2

2048

In [44]:
%%time

# Train the retrieval model for 5 epochs with a batch size of 4096//4 (= 1024),
# validating on the filtered test split.
model_tt.fit(train_tt, validation_data=test_tt, batch_size=4096//4, epochs=5)

Epoch 1/5


2024-08-30 09:57:10.670560: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




2024-08-30 09:57:26.940416: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 3min 22s, sys: 27.4 s, total: 3min 49s
Wall time: 1min 32s


<keras.callbacks.History at 0x7f6eb8296b60>

In [27]:
import shutil

# Delete the filtered two-tower datasets. Paths are built from PROCESSED_FOLDER
# (the root they were written to) instead of a hard-coded absolute path, and
# folders that are already gone are skipped so the cell can be re-run safely.
for stale_name in ("test_tt_processed", "train_tt_processed"):
    stale_dir = os.path.join(PROCESSED_FOLDER, stale_name)
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

Sauvegarde du modèle de retrieval

In [45]:
# Save only the query (user) encoder of the trained two-tower model under
# MODELS_FOLDER/query_tower.
query_tower = model_tt.query_encoder
query_tower.save(os.path.join(MODELS_FOLDER, "query_tower"))

### Sauvegarde des embeddings produits

In [57]:
# Reload the item features written to the Feast repository as a Merlin Dataset.
# Note: this rebinds item_features, which previously held the dataframe
# returned by the feature extraction cell.
item_features = Dataset(os.path.join(feature_repo_path, "data/item_features.parquet"))

In [58]:
# Display the schema inferred for the reloaded item features.
item_features.schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged
0,item_id,(),"DType(name='int32', element_type=<ElementType....",False,False
1,product_code,(),"DType(name='int32', element_type=<ElementType....",False,False
2,prod_name,(),"DType(name='int32', element_type=<ElementType....",False,False
3,product_type_no,(),"DType(name='int32', element_type=<ElementType....",False,False
4,product_group_name,(),"DType(name='int32', element_type=<ElementType....",False,False
5,graphical_appearance_no,(),"DType(name='int32', element_type=<ElementType....",False,False
6,colour_group_code,(),"DType(name='int32', element_type=<ElementType....",False,False
7,perceived_colour_value_id,(),"DType(name='int32', element_type=<ElementType....",False,False
8,perceived_colour_master_id,(),"DType(name='int32', element_type=<ElementType....",False,False
9,department_no,(),"DType(name='int32', element_type=<ElementType....",False,False


In [59]:
# Columns fed to the item branch: the item id followed by every item feature.
List_item_features = [
    "item_id",
    "product_code",
    "prod_name",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
    "detail_desc",
    "count_30d_purchased",
    "count_7d_purchased",
    "Time_Weighted_Purchased",
]
# Print the number of columns as a quick sanity check.
print(len(List_item_features))

18


On fait passer tous les items dans la branche item du two-tower model pour pré-calculer les embeddings

In [60]:
from nvtabular import Workflow

# Take the sub-workflow registered under the name "item" from the in-memory
# fitted workflow; the commented line is the alternative that reloads the
# workflow saved on disk.
#nvt_workflow = Workflow.load(os.path.join(MODELS_FOLDER, 'general_workflow'))
item_subgraph = workflow.get_subworkflow("item")
#user_features = user_attributes >> TransformWorkflow(user_subgraph)

In [61]:
from merlin.systems.dag.ops.tensorflow import PredictTensorflow
from merlin.systems.dag.ops.workflow import TransformWorkflow

# Pipeline: pass the raw item_id through unchanged and, in parallel, run the
# item columns through the item sub-workflow and then through the candidate
# encoder of the two-tower model. The result is computed into a dataframe
# (columns item_id and output_1 in the recorded output).
workflow_item_embedding =  nvt.Workflow(["item_id"] + (List_item_features >> TransformWorkflow(item_subgraph) >> PredictTensorflow(model_tt.candidate_encoder)))
item_embeddings = workflow_item_embedding.fit_transform(item_features).to_ddf().compute()
item_embeddings.tail()

Unnamed: 0,item_id,output_1
23412,952267001,"[2.6257996559143066, 1.4570021629333496, 0.575..."
23413,952938001,"[0.9029256105422974, 0.8305089473724365, 0.147..."
23414,953450001,"[2.418158769607544, 0.5666717290878296, 0.2449..."
23415,953763001,"[0.5645731091499329, 0.3132047653198242, 0.367..."
23416,956217002,"[1.4344825744628906, 0.17341676354408264, -0.0..."


Sauvegarde des embeddings

In [62]:
# Save the item embeddings as PROCESSED_FOLDER/item_embeddings.parquet.
item_embeddings.to_parquet(os.path.join(PROCESSED_FOLDER, "item_embeddings.parquet"))

## Entraînement du modèle de scoring

In [6]:
# Schema used by the scoring models: the full schema of the processed training set.
schema_dlrm = train_processed.schema

In [7]:
# Name of the first column tagged as target ('Target' in the recorded output).
target_column = schema_dlrm.select_by_tag(Tags.TARGET).column_names[0]
target_column

'Target'

In [8]:
# Shared settings for the scoring models: batch size, base learning rate (each
# model scales it in its own compile cell) and the Keras metrics to track.
batch_size = 16 * 1024
LR = 0.03
metrics = [
    tf.keras.metrics.AUC(name="auc"),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]  # mm.RecallAt(10), mm.NDCGAt(10), mm.PrecisionAt(10)

In [71]:
# DLRM scoring model: 64-dimensional embeddings, a bottom MLP [128, 64] and a
# top MLP [128, 64, 32], with a binary output on the target column.
# The target is taken from target_column (derived from the schema) rather than
# a hard-coded name, consistent with the other scoring models in this notebook.
model = mm.DLRMModel(
    schema_dlrm,
    embedding_dim=64,
    bottom_block=mm.MLPBlock([128, 64]),
    top_block=mm.MLPBlock([128, 64, 32]),
    prediction_tasks=mm.BinaryOutput(target_column),
)


In [77]:
%%time
# Train DLRM for 8 epochs with legacy Adagrad at a learning rate of LR*4.
opt = tf.keras.optimizers.legacy.Adagrad(learning_rate=LR*4)
model.compile(optimizer=opt, run_eagerly=False, metrics=metrics)
model.fit(train_processed, validation_data=test_processed, batch_size=batch_size, epochs=8)

Epoch 1/8


2024-08-30 10:19:59.852223: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




2024-08-30 10:20:07.488766: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
CPU times: user 48.4 s, sys: 7.96 s, total: 56.3 s
Wall time: 32.4 s


<keras.callbacks.History at 0x7f6e6a6937f0>

In [31]:
from merlin.models.utils.example_utils import workflow_fit_transform, save_results

# Results file, relative to the current working directory. Any file left over
# from a previous run is removed so results are not appended to stale content.
results_path = os.path.join( "results_models/results.txt")
if os.path.isfile(results_path):
    os.remove(results_path)

def save_results_custom(model_name, model, out_path):
    """Append a model's final validation metrics to a text file.

    Writes ``model_name`` on its own line, followed by one ``key:value`` line
    for every entry of ``model.history.history`` whose key contains
    ``val_auc``, ``val_loss``, ``val_precision`` or ``val_recall``. The value
    written is the one recorded for the last epoch.

    Args:
        model_name: label written before the metrics.
        model: object exposing ``history.history``, a mapping from metric
            name to the list of per-epoch values.
        out_path: text file to append to; its parent folder is created when
            it does not exist.
    """
    tracked_metrics = ("val_auc", "val_loss", "val_precision", "val_recall")

    # open(..., "a") creates the file but not its folder, so make sure the
    # folder exists before the first write.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, "a") as f:
        f.write(model_name)
        f.write("\n")
        for key, value in model.history.history.items():
            if not value:
                # Nothing recorded for this metric.
                continue
            if any(metric in key for metric in tracked_metrics):
                # value holds one entry per epoch; report the final epoch.
                f.write("%s:%s\n" % (key, value[-1]))


In [151]:
# Append the DLRM validation metrics to the results file.
save_results_custom("DLRM", model, results_path)

Enregistrement du modèle

In [78]:
# Save the trained DLRM model under MODELS_FOLDER/dlrm.
model.save(os.path.join(MODELS_FOLDER, "dlrm"))

## Test NCF

In [11]:
# Rebind schema to the full scoring schema (it previously held the two-tower
# schema restricted to user and item columns).
schema = schema_dlrm

Test Wide & Deep

In [80]:
# Wide & Deep model. The wide part receives one-hot encoded categorical columns
# concatenated with hashed crosses of those columns; the deep part is an MLP.
cat_schema = schema.select_by_tag(Tags.CATEGORICAL)

one_hot_encoding = mm.CategoryEncoding(cat_schema, sparse=True, output_mode="one_hot") # One-hot encoding

# NOTE(review): "item_category" and "item_brand" do not appear in the feature
# lists defined in this notebook — confirm these ignore_combinations entries
# are still needed.
features_crossing = mm.HashedCrossAll(
        cat_schema,
        num_bins=100, # The crossed features will be hashed to this number of bins
        max_level=2,
        output_mode="one_hot",
        sparse=True,
        ignore_combinations=[["item_id", "item_category"], 
                             ["item_id", "item_brand"]]
    )

wide_preprocessing_blocks = mm.ParallelBlock([
                                              one_hot_encoding, 
                                              features_crossing
                                             ],
                                             aggregation="concat")

deep_part = mm.MLPBlock([128, 64, 32])

# Note: this rebinds model, replacing the DLRM instance.
model = mm.WideAndDeepModel(
        schema,
        wide_schema=cat_schema,
        deep_schema=schema,
        wide_preprocess=wide_preprocessing_blocks,
        deep_block=deep_part,
        prediction_tasks=mm.BinaryOutput(target_column),
    )

In [82]:
%%time
# Train Wide & Deep for 8 epochs with legacy Adagrad at LR*5 and half the
# shared batch size.
opt = tf.keras.optimizers.legacy.Adagrad(learning_rate=LR*5)
model.compile(optimizer=opt, run_eagerly=False, metrics=metrics)
model.fit(train_processed, validation_data=test_processed, batch_size=batch_size//2, epochs=8)

Epoch 1/8


2024-08-30 10:29:30.567060: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




2024-08-30 10:31:23.123396: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
CPU times: user 47min 19s, sys: 4min 10s, total: 51min 30s
Wall time: 9min 21s


<keras.callbacks.History at 0x7f6e11ab63e0>

val_loss: 0.5028 - val_auc: 0.8365 - val_precision: 0.7053 - val_recall: 0.8006 - val_regularization_loss: 0.0000e+00 - val_loss_batch: 0.4982

In [16]:
#save_results_custom("Wide&Deep", model, results_path)

In [41]:
#model.save(os.path.join(MODELS_FOLDER, "wandd"))

In [12]:
# DCN scoring model with depth=1 and a deep MLP [64, 32]; binary output on
# target_column. This rebinds model.
model = mm.DCNModel(
    schema,
    depth=1,
    deep_block=mm.MLPBlock([64, 32]),
    prediction_tasks=mm.BinaryOutput(target_column),
)

In [13]:
%%time

# Train DCN for 8 epochs with legacy Adagrad at LR*10 and half the shared
# batch size.
opt = tf.keras.optimizers.legacy.Adagrad(learning_rate=LR*10)
model.compile(optimizer=opt, run_eagerly=False, metrics=metrics)
model.fit(train_processed, validation_data=test_processed, batch_size=batch_size//2, epochs=8)

2024-08-30 10:59:55.908968: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:655] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 1/8


2024-08-30 10:59:56.495471: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




2024-08-30 11:00:06.801028: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
CPU times: user 1min 15s, sys: 12 s, total: 1min 27s
Wall time: 45.7 s


<keras.callbacks.History at 0x7f530814c3d0>

In [162]:
# Append the DCN validation metrics to the results file.
save_results_custom("DCN", model, results_path)

In [55]:
# Save the trained DCN model under MODELS_FOLDER/DCN.
model.save(os.path.join(MODELS_FOLDER, "DCN"))

In [14]:
# Plain MLP baseline ([64, 32]) built directly from a block, with a binary
# output on target_column. This rebinds model.
# uses default embedding_dim = 64  (NOTE(review): default not visible here — confirm)
model = mm.Model.from_block(mm.MLPBlock([64, 32]),
    schema, prediction_tasks=mm.BinaryOutput(target_column)
)


In [15]:
%%time

# Train the MLP baseline for 10 epochs with legacy Adagrad at LR*4 and the
# shared batch size.
opt = tf.keras.optimizers.legacy.Adagrad(learning_rate=LR*4)
model.compile(optimizer=opt, run_eagerly=False, metrics=metrics)
model.fit(train_processed, validation_data=test_processed, batch_size=batch_size, epochs=10)



Epoch 1/10


2024-08-30 11:02:33.962389: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




2024-08-30 11:02:40.259799: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 50.1 s, sys: 9.65 s, total: 59.8 s
Wall time: 34.7 s


<keras.callbacks.History at 0x7f54065df1c0>