# Création du dataset supervisé

- Ce notebook a pour objectif de créer un dataset supervisé pour entraîner l'algorithme de scoring
- Les clients qui n'ont jamais achetés de produits sont donc supprimés (cold start customers) et peuvent être traités séparrément (ex: recommandation des produits les plus populaires)
- Les données sont converties au format numeric car l'opérateur QueryFeast de Triton ne supporte pas les String pour le moment

In [1]:
import os
import gc
import glob
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"

import nvtabular as nvt
from nvtabular.ops import *
import numpy as np

from merlin.models.utils.example_utils import workflow_fit_transform
from merlin.schema.tags import Tags
from merlin.models.utils.dataset import unique_rows_by_features

import merlin.models.tf as mm
from merlin.io.dataset import Dataset
import tensorflow as tf

2024-08-30 09:16:35.435969: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-30 09:16:35.483384: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  warn(f"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}")


[SOK INFO] Import /usr/local/lib/python3.10/dist-packages/merlin_sok-2.0.0-py3.10-linux-x86_64.egg/sparse_operation_kit/lib/libsparse_operation_kit.so
[SOK INFO] Initialize finished, communication tool: horovod


2024-08-30 09:16:37.868850: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0
2024-08-30 09:16:37.868989: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 40503 MB memory:  -> device: 0, name: NVIDIA H100 80GB HBM3, pci bus id: 0000:e4:00.0, compute capability: 9.0
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Disable INFO and DEBUG logging everywhere
import logging

logging.disable(logging.WARNING)

In [3]:
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/root/Data/Row/")
DATA_FOLDER = os.environ.get("DATA_FOLDER", "/root/Data/")
MODELS_FOLDER = os.environ.get("MODELS", "/root/Models/")
PROCESSED_FOLDER = os.environ.get("PROCESSED_FOLDER", "/root/Data/Processed/")
feature_repo_path = os.environ.get("FEAST_PATH", "/root/Data/feast_repo/feature_repo")

BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 512))
from merlin.core.dispatch import get_lib
df_lib = get_lib()
df_lib

<module 'cudf' from '/usr/local/lib/python3.10/dist-packages/cudf/__init__.py'>

## Chargement des datasets

In [4]:
# Modifie les dtypes qui peuvent être en 64 par défaut

def change_dtypes(df):
    for col in df.columns:
        if df[col].dtype == np.int64:
            df[col] = df[col].astype(np.int32)
        elif df[col].dtype == np.float64:
            df[col] = df[col].astype(np.float32)
    return df

In [5]:
transactions = df_lib.read_csv(os.path.join(DATA_FOLDER, "Row/transactions_train.csv"))

# Pour rendre le démonstrateur plus léger, on ne garde que les 25% du dataset les plus récents 
split = int(len(transactions)/4)*0
transactions = transactions[split:]
transactions['t_dat'] = df_lib.to_datetime(transactions['t_dat'])


customers = df_lib.read_csv(os.path.join(DATA_FOLDER, "Row/customers.csv"))
articles = df_lib.read_csv(os.path.join(DATA_FOLDER, "Row/articles.csv"))

In [6]:
transactions = change_dtypes(transactions)
customers = change_dtypes(customers)
articles = change_dtypes(articles)

Remplissage des valeurs manquantes

In [7]:
customers['FN'] = customers['FN'].fillna(2)
customers['Active'] = customers['Active'].fillna(2)
customers['club_member_status'] = customers['club_member_status'].fillna('NA')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NA')
customers['age'] = customers['age'].fillna(35) #the mean
articles['detail_desc'] = articles['detail_desc'].fillna('NA')
transactions['article_id'] = transactions['article_id'].fillna(0)
transactions['price'] = transactions['price'].fillna(transactions['price'].mean())
transactions['sales_channel_id'] = transactions['sales_channel_id'].fillna(3)

Suppression des colonnes redondantes

In [8]:
articles = articles.drop(['product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'perceived_colour_master_name'], axis=1)

In [9]:
print('transactions :', transactions.columns)
print('customers :', customers.columns)
print('articles :', articles.columns)

transactions : Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id'], dtype='object')
customers : Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')
articles : Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_group_name', 'graphical_appearance_no', 'colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_code', 'index_group_no', 'section_no',
       'garment_group_no', 'detail_desc'],
      dtype='object')


## Construction de features supplémentaires

Pour les items, on ajoute des features pour capturer la popularité et les tendances récentes comme la date depuis la dernière transaction etc.

In [10]:
## Seule ligne à run 
#Time in days since transactions
Last_day = transactions['t_dat'].max()
transactions['days_diff'] = (Last_day - transactions['t_dat']).dt.days

In [11]:
#In the last 30 days before test period
count_30d_purchased = transactions[(transactions['days_diff'] <= 50) & (transactions['days_diff'] >=20)]['article_id'].value_counts().reset_index()
count_30d_purchased.columns = ['article_id', 'count_30d_purchased']

#In the last 7 days before test period
count_7d_purchased = transactions[(transactions['days_diff'] <= 27) & (transactions['days_diff'] >= 20)]['article_id'].value_counts().reset_index()
count_7d_purchased.columns = ['article_id', 'count_7d_purchased']

#Time weighted purchased
filtered_transactions = transactions[transactions['days_diff'] >= 20]

daily_transactions = filtered_transactions.groupby(['article_id', 'days_diff']).size().reset_index()
daily_transactions.columns = ['article_id', 'days_diff', 'nbr_sales']
daily_transactions['Time_Weighted_sales'] = daily_transactions['nbr_sales']/(daily_transactions['days_diff']+1)
Time_Weighted_Purchased = daily_transactions.groupby('article_id')['Time_Weighted_sales'].sum().reset_index()
Time_Weighted_Purchased.columns = ['article_id', 'Time_Weighted_Purchased']

#Merge
articles = articles.merge(count_30d_purchased, on='article_id', how='left')
articles = articles.merge(count_7d_purchased, on='article_id', how='left')
articles = articles.merge(Time_Weighted_Purchased, on='article_id', how='left')

#Fill missing values
articles['count_30d_purchased'] = articles['count_30d_purchased'].fillna(0)
articles['count_7d_purchased'] = articles['count_7d_purchased'].fillna(0)
articles['Time_Weighted_Purchased'] = articles['Time_Weighted_Purchased'].fillna(0)

articles.head(1)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,detail_desc,count_30d_purchased,count_7d_purchased,Time_Weighted_Purchased
0,176209023,176209,Mr Harrington w/hood,308,Garment Upper body,1010016,9,4,5,5283,F,3,31,1007,"Short, padded jacket with a jersey-lined hood ...",6,5,1.561218


Pour les clients, on ajoute dans un premier temps les features classiques de segmentation marketing : Recency, Frequency et Amount

In [12]:
#Recency
filtered_transactions = transactions[transactions['days_diff'] >= 20]
Recency = filtered_transactions.groupby('customer_id')['days_diff'].min().reset_index()
Recency.columns = ['customer_id', 'recency']

#Frequency
Frequency = transactions[(transactions['days_diff'] <=100) & (transactions['days_diff'] >= 20)].groupby('customer_id').size().reset_index()
Frequency.columns = ['customer_id', 'frequency']
Frequency['frequency'] = Frequency['frequency']/100

#Amount
Amount = transactions[(transactions['days_diff'] <= 100) & (transactions['days_diff'] >= 20)].groupby('customer_id')['price'].mean().reset_index()
Amount.columns = ['customer_id', 'amount']

#Merge
customers = customers.merge(Recency, on='customer_id', how='left')
customers = customers.merge(Frequency, on='customer_id', how='left')
customers = customers.merge(Amount, on='customer_id', how='left')

#Fill missing values
#if some customers have not ordered, we set their recency to more than one year
customers['recency'] = customers['recency'].fillna(500)
customers['frequency'] = customers['frequency'].fillna(0)
customers['amount'] = customers['amount'].fillna(0)

customers.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,recency,frequency,amount
0,043595746860df8bdc4b5a6285a36da351c0c455c08153...,2.0,2.0,ACTIVE,NONE,20,23206f1d2e28904d6d07c6d24fd06215b3af9edaf42e10...,109,0.0,0.0
1,043596a4e23fda09021f3286f48c1f3cbe960bcfc2e2a4...,1.0,2.0,ACTIVE,Regularly,60,2c29ae653a9282cce4151bd87643c907644e09541abc28...,728,0.0,0.0


On ajoute ensuite des features qui capturent les goûts et préférences des clients comme la catégorie la plus fréquente des articles achetés etc.

Pour ce faire on liste tous les derniers achats par clients.

In [13]:
transactions_legit =  transactions[transactions['days_diff'] >= 20]
merged = transactions_legit[['t_dat', 'customer_id', 'article_id']].merge(articles[['article_id', 'product_code', 'product_type_no', 'colour_group_code', 'department_no', 'section_no']], on='article_id')
merged = merged.reset_index(drop=True)
#merged.head(2)

In [14]:
grouped = merged.groupby('customer_id').agg({'article_id': 'collect', 'product_code': 'collect', 'product_type_no': 'collect', 'colour_group_code': 'collect', 'department_no': 'collect',
                                                       'section_no': 'collect'})
grouped.head(2)

Unnamed: 0_level_0,article_id,product_code,product_type_no,colour_group_code,department_no,section_no
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,"[625548001, 176209023, 627759010, 697138006, 5...","[625548, 176209, 627759, 697138, 568601, 56860...","[262, 308, 262, 267, 264, 264, 259, 275, 272, ...","[73, 9, 73, 51, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[8852, 5283, 7812, 7616, 1212, 1212, 1515, 177...","[45, 31, 45, 76, 11, 11, 11, 57, 15, 11, 11, 7..."
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,"[583558001, 639677008, 640244003, 521269001, 6...","[583558, 639677, 640244, 521269, 666448, 58355...","[265, 259, 275, 252, 252, 265, 252, 252, 252, ...","[72, 71, 9, 9, 72, 72, 9, 72, 43, 14, 51, 6, 1...","[1344, 1522, 1444, 1647, 1626, 1344, 1616, 162...","[53, 15, 53, 53, 15, 53, 11, 15, 50, 19, 61, 6..."


Quelques fonctions pour extraire les éléments les plus fréquents ou les plus récents dans les listes d'items achetés par clients

In [15]:
from collections import Counter

def most_frequent(lst):
    if len(lst) > 0:
        return Counter(lst).most_common(1)[0][0]
    return 0

def second_most_frequent(lst):
    most_common = Counter(lst).most_common(2)
    if len(most_common) > 1:
        return most_common[1][0]
    return 0

def last_n_elements(lst, n):
    if len(lst) >= n:
        return lst[-n]
    return 0

In [16]:
grouped = grouped.to_pandas()

In [17]:
%%time
user_additional_features_df = df_lib.DataFrame({
    'customer_id': grouped.index,
    'popular_product_type': grouped['product_type_no'].apply(most_frequent),
    '2nd_popular_product_type': grouped['product_type_no'].apply(second_most_frequent),
    'popular_department_no': grouped['department_no'].apply(most_frequent),
    '2nd_popular_department_no': grouped['department_no'].apply(second_most_frequent),
    'popular_section_no': grouped['section_no'].apply(most_frequent),
    '2nd_popular_section_no': grouped['section_no'].apply(second_most_frequent),
    'last_product_code': grouped['product_code'].apply(lambda x: last_n_elements(x, 1)),
    '2nd_last_product_code': grouped['product_code'].apply(lambda x: last_n_elements(x, 2)),
    #'last_article_id': grouped['article_id'].apply(lambda x: last_n_elements(x, 1)),
    #'2nd_last_article_id': grouped['article_id'].apply(lambda x: last_n_elements(x, 2)),
    'last_product_type': grouped['product_type_no'].apply(lambda x: last_n_elements(x, 1)),
    '2nd_last_product_type': grouped['product_type_no'].apply(lambda x: last_n_elements(x, 2)),
})
user_additional_features_df.head(2)

CPU times: user 1min 27s, sys: 2.63 s, total: 1min 30s
Wall time: 1min 26s


Unnamed: 0_level_0,customer_id,popular_product_type,2nd_popular_product_type,popular_department_no,2nd_popular_department_no,popular_section_no,2nd_popular_section_no,last_product_code,2nd_last_product_code,last_product_type,2nd_last_product_type
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,264,262,1212,1636,11,15,859416,795440,252,253
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,298,59,4242,1647,60,15,826211,351484,252,59


In [18]:
customers = customers.merge(user_additional_features_df, on='customer_id')
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,recency,frequency,amount,popular_product_type,2nd_popular_product_type,popular_department_no,2nd_popular_department_no,popular_section_no,2nd_popular_section_no,last_product_code,2nd_last_product_code,last_product_type,2nd_last_product_type
0,0538ad5c28a869b730aedb71620d2c4044c2b2ae1680f1...,2.0,2.0,ACTIVE,NONE,23,fc60777acf0feb1cacac754e330080dd2b62a10aa1f7d4...,55,0.05,0.015237,252,258,4242,1676,15,60,841434,737800,265,259
1,0538e903df4c8a639e7cbe7ca9976a710d551916c7e0f6...,1.0,1.0,ACTIVE,Regularly,46,9d41b31a33db0b6ba89950b1ed12b816a21f63843b70aa...,34,0.10,0.029814,265,272,4242,1772,15,11,803757,827370,272,252
2,0538f43e1478025b1bda1d3bbce1187bf048073cdc1f50...,1.0,1.0,ACTIVE,Regularly,47,d1b976772eb156a0374ca75083c06e933aa6c535fdebed...,22,0.07,0.053252,263,252,1201,3709,15,19,874704,863583,252,252
3,05392f13d03d2b7c2d3d94300b63d454cd189800e42917...,2.0,2.0,ACTIVE,NONE,33,88b248289e9b83f058638891432e944836335a8859fa41...,52,0.01,0.018627,255,265,1676,3608,16,62,791587,841383,255,253
4,05396049fe558c46bfa9bd60185a713742cf5df554ccea...,2.0,2.0,ACTIVE,NONE,24,530659707a165d6faee772a67e8c931a556bcd7d456b5e...,28,0.07,0.017804,255,272,5828,5883,20,26,785520,888295,264,254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1346261,ff6df2d9c7430b9e1dd4845753d65d22abd2a91bac872b...,2.0,2.0,ACTIVE,NONE,17,82e01ac9e98dc02d9f8ffc21453ff137cdf96b7c41c411...,22,0.09,0.025407,252,308,1643,1640,51,53,914886,914886,252,252
1346262,ff3cc7ffef7ce507f7289ff28d7f666efe36654b0bacca...,2.0,2.0,ACTIVE,NONE,48,456b0acc823fab1e32e8295e3457d387235c8eb4e393ee...,328,0.00,0.000000,252,308,5882,1676,55,16,763270,774953,252,263
1346263,ff3cc132d98d2f6d485bf55485dacb1f0c3e874f7f0229...,2.0,2.0,ACTIVE,NONE,27,e40f12643dd588a5b08df2f6b3eb263710b221b36ccf99...,89,0.01,0.059305,275,267,1414,1722,15,11,851400,851400,275,275
1346264,ff6e0668fd77eb4958f29c78fac925e83e289777ccb184...,2.0,2.0,ACTIVE,NONE,63,6c643c812d260c34758484141d2f12dd0cb7d1d8507944...,45,0.04,0.018203,273,258,1722,1522,15,57,856840,399256,265,272


In [19]:
print('transactions :', transactions.columns)
print('customers :', customers.columns)
print('articles :', articles.columns)

transactions : Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'days_diff'],
      dtype='object')
customers : Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code', 'recency', 'frequency',
       'amount', 'popular_product_type', '2nd_popular_product_type',
       'popular_department_no', '2nd_popular_department_no',
       'popular_section_no', '2nd_popular_section_no', 'last_product_code',
       '2nd_last_product_code', 'last_product_type', '2nd_last_product_type'],
      dtype='object')
articles : Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_group_name', 'graphical_appearance_no', 'colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_code', 'index_group_no', 'section_no',
       'garment_group_no', 'detail_desc', 'count_30d_purchased',
       'count_7d_purchased', 'Time_Weighted_Purchase

In [21]:
articles.isna().sum()

article_id                    0
product_code                  0
prod_name                     0
product_type_no               0
product_group_name            0
graphical_appearance_no       0
colour_group_code             0
perceived_colour_value_id     0
perceived_colour_master_id    0
department_no                 0
index_code                    0
index_group_no                0
section_no                    0
garment_group_no              0
detail_desc                   0
count_30d_purchased           0
count_7d_purchased            0
Time_Weighted_Purchased       0
dtype: int64

## Création du dataset
Création de faux négatifs

In [22]:
transactions = transactions.rename(columns={'article_id': 'item_id', 'customer_id': 'user_id'})
customers = customers.rename(columns={'customer_id': 'user_id'})
articles = articles.rename(columns={'article_id': 'item_id'})

In [23]:
true_set = transactions[transactions['days_diff'] < 20][['user_id', 'item_id']].copy()
true_set['Target'] = 1

all_articles = transactions['item_id'].unique().to_arrow().to_pylist()
all_customers = transactions['user_id'].unique().to_arrow().to_pylist()

# Calculer les poids pour les articles et les utilisateurs
often_purchased = articles[articles['count_30d_purchased'] > 15]
article_weights = often_purchased[often_purchased['item_id'].isin(all_articles)].groupby('item_id')['count_30d_purchased'].max()

# Inverser les poids pour les utilisateurs pour que les recency faibles aient des poids élevés
often_customers = customers[customers['recency'] > 0]
user_weights = often_customers[often_customers['user_id'].isin(all_customers)].groupby('user_id')['recency'].max() + 1.5
user_weights = 1 / (user_weights + 1)

# Normaliser les poids pour obtenir des probabilités
article_weights = article_weights / article_weights.sum()
user_weights = user_weights / user_weights.sum()

# Nouvelle liste de users et d'items
good_articles = often_purchased[often_purchased['item_id'].isin(all_articles)]['item_id'].unique().to_arrow().to_pylist()
good_customers = often_customers[often_customers['user_id'].isin(all_customers)]['user_id'].unique().to_arrow().to_pylist()


In [24]:
false_set = df_lib.DataFrame(columns=['user_id', 'item_id', 'Target'])
batch_size = 10000  # adjust this based on your memory capacity

# Create a unique id for each pair in true_set
true_set['pair_id'] = true_set['item_id'].astype(str).str.cat(true_set['user_id'].astype(str), sep='_')

while len(false_set) < len(true_set): # positive rate
    # Select a batch of random rows from true_set
    random_rows = true_set.sample(n=batch_size).copy()
   
    # Select a batch of random users and items
    random_users = np.random.choice(good_customers, size=batch_size, replace=True, p=user_weights.loc[good_customers].values.get())
    random_items = np.random.choice(good_articles, size=batch_size, replace=True, p=article_weights.loc[good_articles].values.get())
    
    # Create a new DataFrame for the false negatives
    random_rows['user_id'] = random_users
    random_rows['item_id'] = random_items
    random_rows['Target'] = 0

    # Create a unique id for each pair in false_negatives
    random_rows['pair_id'] = random_rows['item_id'].astype(str).str.cat(random_rows['user_id'].astype(str), sep='_')

    # Check that the pairs are not in true_set
    random_rows = random_rows[~random_rows['pair_id'].isin(true_set['pair_id'])]

    false_set = df_lib.concat([false_set, random_rows])

In [25]:
# Concatenate true positives and false negatives to get the final dataset
dataset = df_lib.concat([true_set, false_set])
#Shuffle the dataset
dataset = dataset.sample(frac=1)

# Drop the pair_id column
dataset = dataset.drop(columns='pair_id')

dataset['Target'] = dataset['Target'].astype('int32')

import gc
del true_set
del false_set
gc.collect()

260

In [26]:
dataset

Unnamed: 0,user_id,item_id,Target
31750654,d9e3f0282322eb65d8283337e129948caf66f38df25c3e...,372860002,1
31475585,941fc89f56614f253299799dafb2242a924d008d3a1be5...,749974008,1
31595183,7ef442c81ba07cd8c8784f3330a77ecf26bbd6c1992339...,896152003,1
31711168,9b3c78dca755ee8aed27cb3be0d307a6b0f2da3c84e1ee...,736923010,1
31527340,3554ba568676a3aeb7f8dd523e637e6bd555c45a116b9d...,904584007,1
...,...,...,...
31359744,a31499a932a58b5a133d9297ef6419493092b6a83ab17e...,859743004,1
31332155,f19acac6bcb332891383877388c590d752d24e2ce9d1d5...,892254001,1
31448781,e85ef6e2f3953f7a732b8016724b6ef6033ab3f5c38f97...,926502001,1
31114545,cd7fa5ed5a1b34ca513105fbe5950ebbb7cab88947fc8c...,739819010,0


In [27]:
dataset = dataset.merge(articles, on='item_id', how='left')
#print(dataset.isna().sum())

user_id                       0
item_id                       0
Target                        0
product_code                  0
prod_name                     0
product_type_no               0
product_group_name            0
graphical_appearance_no       0
colour_group_code             0
perceived_colour_value_id     0
perceived_colour_master_id    0
department_no                 0
index_code                    0
index_group_no                0
section_no                    0
garment_group_no              0
detail_desc                   0
count_30d_purchased           0
count_7d_purchased            0
Time_Weighted_Purchased       0
dtype: int64


In [30]:
print(dataset['user_id'].nunique())

valid_user_ids = customers['user_id'].unique()
filtered_dataset = dataset[dataset['user_id'].isin(valid_user_ids)]
print(filtered_dataset['user_id'].nunique())

550140
534125


In [31]:
merged_dataset = customers.merge(filtered_dataset, on='user_id', how='right')

In [32]:
dataset = merged_dataset
merged_dataset.isna().sum()

user_id                       0
item_id                       0
Target                        0
product_code                  0
prod_name                     0
product_type_no               0
product_group_name            0
graphical_appearance_no       0
colour_group_code             0
perceived_colour_value_id     0
perceived_colour_master_id    0
department_no                 0
index_code                    0
index_group_no                0
section_no                    0
garment_group_no              0
detail_desc                   0
count_30d_purchased           0
count_7d_purchased            0
Time_Weighted_Purchased       0
FN                            0
Active                        0
club_member_status            0
fashion_news_frequency        0
age                           0
postal_code                   0
recency                       0
frequency                     0
amount                        0
popular_product_type          0
2nd_popular_product_type      0
popular_

In [33]:
dataset.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 1401960 entries, 0 to 1401959
Data columns (total 39 columns):
 #   Column                      Non-Null Count    Dtype
---  ------                      --------------    -----
 0   user_id                     1401960 non-null  object
 1   item_id                     1401960 non-null  int64
 2   Target                      1401960 non-null  int32
 3   product_code                1401960 non-null  int32
 4   prod_name                   1401960 non-null  object
 5   product_type_no             1401960 non-null  int32
 6   product_group_name          1401960 non-null  object
 7   graphical_appearance_no     1401960 non-null  int32
 8   colour_group_code           1401960 non-null  int32
 9   perceived_colour_value_id   1401960 non-null  int32
 10  perceived_colour_master_id  1401960 non-null  int32
 11  department_no               1401960 non-null  int32
 12  index_code                  1401960 non-null  object
 13  index_group_no       

In [34]:
dataset.head(2)

Unnamed: 0,user_id,item_id,Target,product_code,prod_name,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,...,popular_product_type,2nd_popular_product_type,popular_department_no,2nd_popular_department_no,popular_section_no,2nd_popular_section_no,last_product_code,2nd_last_product_code,last_product_type,2nd_last_product_type
0,42a54fc7b27753430d0b8219a76433e34713abb17529ef...,898694001,0,898694,Jasba jersey shacket,264,Garment Upper body,1010010,12,1,...,272,265,1322,1909,15,2,817361,892051,265,272
1,cb85f198a753d60bdb90c933af4537e7f04661ee999142...,865071001,0,865071,Pumba Heavy utility,274,Garment Lower body,1010016,10,3,...,253,275,1444,1640,53,66,910314,836747,79,230


In [35]:
dataset.columns

Index(['user_id', 'item_id', 'Target', 'product_code', 'prod_name',
       'product_type_no', 'product_group_name', 'graphical_appearance_no',
       'colour_group_code', 'perceived_colour_value_id',
       'perceived_colour_master_id', 'department_no', 'index_code',
       'index_group_no', 'section_no', 'garment_group_no', 'detail_desc',
       'count_30d_purchased', 'count_7d_purchased', 'Time_Weighted_Purchased',
       'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age',
       'postal_code', 'recency', 'frequency', 'amount', 'popular_product_type',
       '2nd_popular_product_type', 'popular_department_no',
       '2nd_popular_department_no', 'popular_section_no',
       '2nd_popular_section_no', 'last_product_code', '2nd_last_product_code',
       'last_product_type', '2nd_last_product_type'],
      dtype='object')

## Conversion en train-test numeric

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Séparer les caractéristiques et la cible
dataset['user_row'] = dataset['user_id'].copy()
dataset = change_dtypes(dataset)
X = dataset.drop('Target', axis=1)
y = dataset['Target']

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [37]:
X_train.columns

Index(['user_id', 'item_id', 'product_code', 'prod_name', 'product_type_no',
       'product_group_name', 'graphical_appearance_no', 'colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_code', 'index_group_no', 'section_no',
       'garment_group_no', 'detail_desc', 'count_30d_purchased',
       'count_7d_purchased', 'Time_Weighted_Purchased', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'postal_code',
       'recency', 'frequency', 'amount', 'popular_product_type',
       '2nd_popular_product_type', 'popular_department_no',
       '2nd_popular_department_no', 'popular_section_no',
       '2nd_popular_section_no', 'last_product_code', '2nd_last_product_code',
       'last_product_type', '2nd_last_product_type', 'user_row'],
      dtype='object')

In [159]:
categorical_columns = ['prod_name', 'product_group_name', 'index_code', 'detail_desc', 'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code']
other_features = ['user_id', 'item_id', 'product_code', 'product_type_no', 'graphical_appearance_no', 'colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_group_no', 'section_no',
       'garment_group_no', 'count_30d_purchased',
       'count_7d_purchased', 'Time_Weighted_Purchased', 'age',
       'recency', 'frequency', 'amount', 'user_row',  'popular_product_type',
       '2nd_popular_product_type', 'popular_department_no',
       '2nd_popular_department_no', 'popular_section_no',
       '2nd_popular_section_no', 'last_product_code', '2nd_last_product_code',
       'last_product_type', '2nd_last_product_type']
print(len(categorical_columns)+len(other_features))
print(len(X_train.columns))

39
39


In [160]:
# categorify
features_cat = categorical_columns >> Categorify(dtype="int32") 
features = features_cat + other_features #>> Dropna()

workflow_encode = nvt.Workflow(features)

train_dataset = nvt.Dataset(X_train)
test_dataset = nvt.Dataset(X_test)

# transform
workflow_encode.fit(train_dataset)

train_transformed = workflow_encode.transform(train_dataset).to_ddf().compute()
test_transformed = workflow_encode.transform(test_dataset).to_ddf().compute()

In [161]:
print(len(train_transformed), len(y_train))

981367 981367


In [162]:
train_transformed.isna().sum()

prod_name                     0
product_group_name            0
index_code                    0
detail_desc                   0
FN                            0
Active                        0
club_member_status            0
fashion_news_frequency        0
postal_code                   0
user_id                       0
item_id                       0
product_code                  0
product_type_no               0
graphical_appearance_no       0
colour_group_code             0
perceived_colour_value_id     0
perceived_colour_master_id    0
department_no                 0
index_group_no                0
section_no                    0
garment_group_no              0
count_30d_purchased           0
count_7d_purchased            0
Time_Weighted_Purchased       0
age                           0
recency                       0
frequency                     0
amount                        0
user_row                      0
popular_product_type          0
2nd_popular_product_type      0
popular_

In [163]:
train_transformed.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 981367 entries, 0 to 981366
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype
---  ------                      --------------   -----
 0   prod_name                   981367 non-null  int32
 1   product_group_name          981367 non-null  int32
 2   index_code                  981367 non-null  int32
 3   detail_desc                 981367 non-null  int32
 4   FN                          981367 non-null  int32
 5   Active                      981367 non-null  int32
 6   club_member_status          981367 non-null  int32
 7   fashion_news_frequency      981367 non-null  int32
 8   postal_code                 981367 non-null  int32
 9   user_id                     981367 non-null  object
 10  item_id                     981367 non-null  int32
 11  product_code                981367 non-null  int32
 12  product_type_no             981367 non-null  int32
 13  graphical_appearance_no     981367 non-nu

In [164]:
train_transformed

Unnamed: 0,prod_name,product_group_name,index_code,detail_desc,FN,Active,club_member_status,fashion_news_frequency,postal_code,user_id,...,popular_product_type,2nd_popular_product_type,popular_department_no,2nd_popular_department_no,popular_section_no,2nd_popular_section_no,last_product_code,2nd_last_product_code,last_product_type,2nd_last_product_type
0,844,3,6,554,3,3,3,3,92209,4d6faa79a218653afd3671cf24cf4cd50091b630a3c178...,...,265,67,3509,1322,15,53,883041,816586,265,308
1,496,3,4,520,3,3,3,3,83934,665af5f782b8a6d5a3978ffce3da3b737d06481ff0151b...,...,512,265,4343,1444,53,66,853467,886225,95,267
2,843,3,3,779,3,3,3,3,19692,681ef9abe85ee536bb3ab3ae28cde13cf184aea3ac36ec...,...,272,308,1747,1640,53,51,880492,885077,70,77
3,198,6,5,204,4,4,3,4,28225,ebf71024a1951c7d12a09c43a5d506f96f582d28d9a7e0...,...,265,253,4242,1338,15,61,746994,746994,274,274
4,4029,10,7,3793,4,4,3,4,55923,cff0943754d0257863700fc714ba452ce765ce9f1eebac...,...,258,253,1522,1926,53,15,595696,832298,274,298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981362,1480,4,3,1450,4,4,3,4,5733,9cadbb306d2cec3b3898ffd2e38993da5e2fe087ceb1e6...,...,265,92,1344,3528,53,64,890386,881933,92,265
981363,232,11,5,239,4,4,3,4,223003,05e9b007134e70f007895edcce21dcea2ff1cbff269175...,...,272,255,5828,1772,15,11,789772,399223,259,272
981364,2055,8,7,1978,3,3,3,3,138502,44904bc1c9c301d178dd5cf40ba584d62378d373449504...,...,306,286,1338,4242,61,66,901955,834217,298,59
981365,2171,8,7,2100,3,3,3,3,125658,41cc56295cb04de4ffa7c98ca5e2643155092a7770c731...,...,265,267,1640,1344,53,15,509091,632784,286,78


### Sauvegarde des datasets pour test sklearn

In [165]:
#dataset.to_parquet(DATA_FOLDER+'/Processed/dataset_numeric')
train_transformed.to_csv(DATA_FOLDER+'/Plot/train_numeric')
test_transformed.to_csv(DATA_FOLDER+'/Plot/test_numeric')
y_train.to_frame(name='Target').to_csv(DATA_FOLDER+'/Plot/ytrain_numeric')
y_test.to_frame(name='Target').to_csv(DATA_FOLDER+'/Plot/ytest_numeric')

In [197]:
y_train

262376    1
39725     1
570416    0
843927    0
513093    0
         ..
110268    0
259178    0
131932    1
671155    0
121958    1
Name: Target, Length: 981367, dtype: int32

### Sauvegarde des datasets Merlin

In [202]:
train_transformed = train_transformed.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
train_to_merlin = df_lib.concat([train_transformed, y_train], axis=1)

test_transformed = test_transformed.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
test_to_merlin = df_lib.concat([test_transformed, y_test], axis=1)

print(train_to_merlin.columns)

prod_name                     0
product_group_name            0
index_code                    0
detail_desc                   0
FN                            0
Active                        0
club_member_status            0
fashion_news_frequency        0
postal_code                   0
user_id                       0
item_id                       0
product_code                  0
product_type_no               0
graphical_appearance_no       0
colour_group_code             0
perceived_colour_value_id     0
perceived_colour_master_id    0
department_no                 0
index_group_no                0
section_no                    0
garment_group_no              0
count_30d_purchased           0
count_7d_purchased            0
Time_Weighted_Purchased       0
age                           0
recency                       0
frequency                     0
amount                        0
user_row                      0
popular_product_type          0
2nd_popular_product_type      0
popular_

In [205]:
train_to_merlin.head(3)

Unnamed: 0,prod_name,product_group_name,index_code,detail_desc,FN,Active,club_member_status,fashion_news_frequency,postal_code,user_id,...,2nd_popular_product_type,popular_department_no,2nd_popular_department_no,popular_section_no,2nd_popular_section_no,last_product_code,2nd_last_product_code,last_product_type,2nd_last_product_type,Target
0,844,3,6,554,3,3,3,3,92209,4d6faa79a218653afd3671cf24cf4cd50091b630a3c178...,...,67,3509,1322,15,53,883041,816586,265,308,1
1,496,3,4,520,3,3,3,3,83934,665af5f782b8a6d5a3978ffce3da3b737d06481ff0151b...,...,265,4343,1444,53,66,853467,886225,95,267,1
2,843,3,3,779,3,3,3,3,19692,681ef9abe85ee536bb3ab3ae28cde13cf184aea3ac36ec...,...,308,1747,1640,53,51,880492,885077,70,77,0


In [169]:
customers_features = ['FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code', 'recency', 'frequency',
       'amount', 'popular_product_type', '2nd_popular_product_type',
       'popular_department_no', '2nd_popular_department_no',
       'popular_section_no', '2nd_popular_section_no', 'last_product_code',
       '2nd_last_product_code', 'last_product_type', '2nd_last_product_type']

articles_features = ['product_code', 'prod_name', 'product_type_no',
       'product_group_name', 'graphical_appearance_no', 'colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_code', 'index_group_no', 'section_no',
       'garment_group_no', 'detail_desc', 'count_30d_purchased',
       'count_7d_purchased', 'Time_Weighted_Purchased']

print(len(customers_features)+ len(articles_features), len(train_transformed.columns))

36 39


In [223]:
tag_user_id = ["user_id"] >> Categorify(dtype='int32') >> TagAsUserID()
tag_user_features = customers_features  >> TagAsUserFeatures() 
tag_item_id = ["item_id"] >> TagAsItemID()
tag_item_features = articles_features   >> TagAsItemFeatures() 
targets = ["Target"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, "target"]) >> Dropna()


tag_to_select_unique_values = nvt.Workflow(tag_user_id + tag_user_features + tag_item_id + tag_item_features + targets)

In [224]:
train_dataset = Dataset(train_to_merlin)
test_dataset = Dataset(test_to_merlin)

train_df = tag_to_select_unique_values.fit_transform(train_dataset)
test_df = tag_to_select_unique_values.transform(test_dataset)

In [226]:
#train_transformed.isna().sum()
train_df.compute()

Unnamed: 0,user_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,recency,frequency,amount,...,department_no,index_code,index_group_no,section_no,garment_group_no,detail_desc,count_30d_purchased,count_7d_purchased,Time_Weighted_Purchased,Target
0,79456,3,3,3,3,33,92209,203,0.00,0.000000,...,5848,6,3,21,1005,554,36,5,3.901209,1
1,52787,3,3,3,3,52,83934,94,0.04,0.041508,...,1640,4,2,53,1005,520,368,110,11.513096,1
2,83998,3,3,3,3,27,19692,23,0.05,0.022373,...,1610,3,1,6,1003,779,62,5,10.164483,0
3,192051,4,4,3,4,26,28225,98,0.17,0.030990,...,1334,5,1,61,1017,204,198,52,15.038668,0
4,41206,4,4,3,4,25,55923,24,0.08,0.012059,...,3040,7,1,64,1020,3793,30,0,4.599783,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981362,164145,4,4,3,4,44,5733,63,0.02,0.045746,...,1723,3,1,15,1025,1450,74,8,4.021278,0
981363,204713,4,4,3,4,50,223003,203,0.00,0.000000,...,1339,5,1,61,1017,239,432,170,33.031651,0
981364,264175,3,3,3,3,22,138502,106,0.00,0.000000,...,4344,7,1,66,1019,1978,89,35,3.013747,1
981365,132291,3,3,3,3,21,125658,49,0.02,0.011000,...,4344,7,1,66,1019,2100,117,34,8.511708,0


In [227]:
# Mettre en dataset 
train_df.to_parquet(DATA_FOLDER+'/Processed/train_numeric')
test_df.to_parquet(DATA_FOLDER+'/Processed/test_numeric')

df_user_id_row_processed = df_customers[['user_id', 'user_id_row']]
df_user_id_row_processed
df_user_id_row_processed.to_parquet(DATA_FOLDER+'/Processed/user_id_row_processed')