# SETUP

In [1]:
# Read libraries
import pandas as pd
import numpy as np
import os
import re
import networkx as nx
from tqdm import tqdm



# ML libraries
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from langdetect import detect, DetectorFactory
import torch
import umap
# Open file in read mode

taxonomy_path = os.path.join("data", "taxonomy.txt")
count_of_products_per_level1_path = os.path.join("data", "count_of_products_per_level1.csv")
data_path = os.path.join("data", "ensae_export_without_l1.parquet")

  from .autonotebook import tqdm as notebook_tqdm


## Read categories files

In [2]:
# --------- Lire la taxonomy depuis un fichier txt ----------
# Assumons que le fichier s'appelle "taxonomy.txt"
# Format attendu : id_path <tab> category_path
df_taxonomy = pd.read_csv(taxonomy_path, sep='\t', header=None, names=['id_path', 'category_path'])

# Nettoyage
df_taxonomy['category_path'] = df_taxonomy['category_path'].str.strip()
df_taxonomy['id_path'] = df_taxonomy['id_path'].str.strip()

# Construire le graphe dirigé de la taxonomie
G = nx.DiGraph()
root = "ROOT"  # racine commune
G.add_node(root)

for path in df_taxonomy['category_path']:
    parts = [p.strip() for p in path.split(">")]
    if parts:  # relier le level_1 à la racine
        G.add_edge(root, parts[0])
    for i in range(len(parts)-1):
        parent = parts[i]
        child = parts[i+1]
        G.add_edge(parent, child)

# Identifier les level_1
level_1_nodes = [p.split(">")[0].strip() for p in df_taxonomy['category_path']]
level_1_nodes = list(set(level_1_nodes))

print("Level 1 categories:", level_1_nodes)

# Créer un DataFrame par level_1 avec la concaténation de tous ses descendants
level1_texts = {}
for lvl1 in level_1_nodes:
    descendants = nx.descendants(G, lvl1)
    all_nodes = list(descendants) + [lvl1]
    # enlever doublons et concaténer
    text = " ".join(sorted(set(all_nodes)))
    level1_texts[lvl1] = text

df_level1 = pd.DataFrame.from_dict(level1_texts, orient='index', columns=['concatenated_text'])
df_level1.reset_index(inplace=True)
df_level1 = df_level1.rename(columns={'index': 'level_1_category'})

display(df_level1.head())

Level 1 categories: ['Dating', 'home & garden', 'food, beverages & tobacco', 'office supplies', 'religious & ceremonial', 'health & beauty', 'business & industrial', 'Employment', 'software', 'mature', 'Ground/Cruises/Packages', 'cameras & optics', 'arts & entertainment', 'Services', 'furniture', 'Airlines', 'Car Rental', 'Gaming/Gambling', 'electronics', 'sporting goods', 'baby & toddler', 'animals & pet supplies', 'Travel', 'Goods', 'media', 'toys & games', 'vehicles & parts', 'luggage & bags', 'Finance Services', 'Communication', 'Hotels/Resorts', 'Real Estate', 'apparel & accessories', 'hardware']


Unnamed: 0,level_1_category,concatenated_text
0,Dating,Dating
1,home & garden,absinthe fountains address signs advent calend...
2,"food, beverages & tobacco",absinthe alcoholic beverages amaranth apple bu...
3,office supplies,address books address labels anti-fatigue mats...
4,religious & ceremonial,aisle runners flower girl baskets memorial cer...


In [3]:
# Extract level 1 category counts

df_categories_count = pd.read_csv(count_of_products_per_level1_path)
df_categories_count

# Keep only level 1 categories present in both dataframes
common_level1 = set(df_level1['level_1_category']).intersection(set(df_categories_count['level_1_name']))
df_level1_clean = df_level1[df_level1['level_1_category'].isin(common_level1)]

print(f"Drop {len(df_level1['level_1_category'].unique()) - len(common_level1)} level 1 categories not in count_of_products_per_level1.csv")

Drop 13 level 1 categories not in count_of_products_per_level1.csv


In [4]:
print("Number of products : ", df_categories_count["count"].sum())

Number of products :  128253


## Read catalog files

In [5]:
df_catalog = pd.read_parquet(data_path, engine='pyarrow')

In [6]:
df_catalog

Unnamed: 0,hashed_external_id,title,description,brand,sale_price
0,-2772291400701920348,The Hoodoo Tarot,A divination deck and guidebook rooted in the ...,,35.0
1,-4184851053829790189,Disney Villains Tarot Deck and Guidebook Movie...,"Let Maleficent, Captain Hook, and other classi...",,24.99
2,-8778697834751578524,Easy Tarot,"Created especially for beginners, the Easy Tar...",,19.95
3,-3541475158234224984,The Proudest Blue: A Story of Hijab and Family...,The Instant New York Times Bestseller! A power...,,17.99
4,-2529310467283008815,The Crystal Magic Tarot: Understand and Contro...,The Crystal Tarot deck and guidebook expands t...,,24.95
...,...,...,...,...,...
128248,-731841779633851509,Columbia Natural Feather and Down Side Sleeper...,Experience cloud-like comfort with the Columbi...,Columbia,77.7
128249,-8372033233842851509,Campania International Marlton Planter,Campania's Lite material convincingly replicat...,Campania International,229.6
128250,-5594512849126851501,Beatrice Home Grand Hotel Waffle Knit Cotton B...,If you're on the quest for the perfect blanket...,beatrice home fashions,38.08
128251,-402032100371851498,Slickblue Snow Shovel with Wheels with 30 Inch...,"Measuring 30"" x 10"" (L x W), the wide snow bla...",Hot Wheels,96.99


## Preprocessing df_catalog

In [7]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder

def preprocess_for_nlp(df, text_cols=['title', 'description'], brand_col='brand', id_col='hashed_external_id' , price_col = 'sale_price'):
    """
    Preprocess products DataFrame for NLP tasks:
    - Concatenate text columns into a single 'text' column
    - Encode brand as integer
    - Clean text: lowercasing, remove punctuation, multiple spaces
    - Keep hashed_external_id for final output

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe
    text_cols : list of str
        Columns to concatenate for text
    brand_col : str
        Column to encode as integer
    id_col : str
        Column to keep for external mapping
    price_col : str
        Column for price information

    Returns
    -------
    df_preprocessed : pd.DataFrame
        DataFrame with columns: 'hashed_external_id', 'text', brand_col (encoded)
    label_enc : LabelEncoder
        Fitted LabelEncoder for the brand column
    """
    df = df.copy()

    # Fill missing values for text columns
    for col in text_cols:
        df[col] = df[col].fillna('')

    # Concatenate text columns
    df['text'] = df[text_cols].agg(' '.join, axis=1)

    # Clean text
    def clean_text(s):
        s = s.lower()
        s = re.sub(r'\s+', ' ', s)      # multiple spaces -> single space
        s = re.sub(r'[^\w\s]', '', s)   # remove punctuation
        return s.strip()

    df['text'] = df['text'].apply(clean_text)

    # Encode brand as integer
    if brand_col in df.columns:
        df[brand_col] = df[brand_col].fillna('Unknown')
        label_enc = LabelEncoder()
        df[brand_col + '_encoded'] = label_enc.fit_transform(df[brand_col])
    else:
        label_enc = None

    # Fill na for price column
    if price_col in df.columns:
        df[price_col] = df[price_col].astype(float)
        df[price_col] = df[price_col].fillna(df[price_col].median())

    # Keep hashed_external_id
    columns_to_keep = [id_col, 'text' , price_col]
    if label_enc:
        columns_to_keep.append(brand_col + '_encoded')

    return df[columns_to_keep], label_enc

# --------- Usage ---------
df_nlp, brand_encoder = preprocess_for_nlp(
    df_catalog,
    text_cols=['title','description','brand'],
    brand_col='brand',
    id_col='hashed_external_id',
    price_col = 'sale_price'
)

df_nlp.head()

Unnamed: 0,hashed_external_id,text,sale_price,brand_encoded
0,-2772291400701920348,the hoodoo tarot a divination deck and guidebo...,35.0,0
1,-4184851053829790189,disney villains tarot deck and guidebook movie...,24.99,0
2,-8778697834751578524,easy tarot created especially for beginners th...,19.95,0
3,-3541475158234224984,the proudest blue a story of hijab and family ...,17.99,0
4,-2529310467283008815,the crystal magic tarot understand and control...,24.95,0


### check language distribution

In [None]:
# Pour rendre les résultats reproductibles
DetectorFactory.seed = 0

def detect_language(text):
    """
    Detect the language of a given text.
    
    Parameters
    ----------
    text : str
        Input text
    
    Returns
    -------
    lang_code : str
        ISO 639-1 language code (e.g., 'en' for English)
    """
    try:
        return detect(text)
    except:
        return "unknown"

language_df_level1_clean = df_level1_clean["concatenated_text"].apply(detect_language)
language_df_nlp = df_nlp["text"].apply(detect_language)

In [60]:
language_df_level1_clean.value_counts()
language_df_nlp.value_counts()

text
en    128107
es        37
fr        30
it        20
ca        16
no        10
af         8
nl         7
da         5
ro         4
tl         2
sv         2
lt         2
cy         2
et         1
Name: count, dtype: int64

the vast majority of the text seems to be in English, with some other languages mixed in.

Decide to drop non-English entries for simplicity.


In [64]:
language_df_level1_clean.value_counts().to_csv("language_distribution_level1_clean.csv", index=True)
language_df_nlp.value_counts().to_csv("language_distribution_nlp.csv", index=True)

In [65]:
## drop non-english entries

df_nlp = df_nlp[language_df_nlp == 'en']

In [69]:
df_level1

Unnamed: 0,level_1_category,concatenated_text
0,media,books camera & optics manuals carpentry & wood...
1,religious & ceremonial,aisle runners flower girl baskets memorial cer...
2,Airlines,Airlines
3,hardware,abrasive blaster accessories abrasive blasters...
4,health & beauty,accessibility equipment accessibility equipmen...
5,electronics,3d glasses 3d printer accessories 3d printers ...
6,sporting goods,ab wheels & rollers aerobic steps air hockey a...
7,software,3d modeling software animation editing softwar...
8,Finance Services,Finance Services
9,Travel,Travel


# Generate embeddings with sentence transformers

In [13]:
df_nlp

Unnamed: 0,hashed_external_id,text,sale_price,brand_encoded
0,-2772291400701920348,the hoodoo tarot a divination deck and guidebo...,35.00,0
1,-4184851053829790189,disney villains tarot deck and guidebook movie...,24.99,0
2,-8778697834751578524,easy tarot created especially for beginners th...,19.95,0
3,-3541475158234224984,the proudest blue a story of hijab and family ...,17.99,0
4,-2529310467283008815,the crystal magic tarot understand and control...,24.95,0
...,...,...,...,...
128248,-731841779633851509,columbia natural feather and down side sleeper...,77.70,551
128249,-8372033233842851509,campania international marlton planter campani...,229.60,433
128250,-5594512849126851501,beatrice home grand hotel waffle knit cotton b...,38.08,2694
128251,-402032100371851498,slickblue snow shovel with wheels with 30 inch...,96.99,1093


In [15]:
# ---------- 1. Choisir le device ----------
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print("Using device:", device)

# ---------- 2. Charger le modèle ----------
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# ---------- 3. Préparer les textes ----------
texts_catalog = df_nlp['text'].tolist()
texts_taxonomy = [
    lvl1 + " " + text
    for lvl1, text in zip(df_level1['level_1_category'], df_level1['concatenated_text'])
]

# ---------- 4. Encode avec batch et tqdm ----------
def encode_texts(texts, batch_size=64):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        batch_emb = model.encode(
            batch_texts,
            convert_to_numpy=True,
            device=device,
            show_progress_bar=False
        )
        embeddings.append(batch_emb)
    return np.vstack(embeddings)

emb_catalog = encode_texts(texts_catalog, batch_size=64)
emb_taxonomy = encode_texts(texts_taxonomy, batch_size=64)

print("Catalogue embeddings shape:", emb_catalog.shape)
print("Taxonomy embeddings shape:", emb_taxonomy.shape)

# ---------- 5. Sauvegarde compressée ----------
np.savez_compressed("emb_catalog.npz", emb_catalog)
np.savez_compressed("emb_taxonomy.npz", emb_taxonomy)

Using device: mps


100%|██████████| 2004/2004 [31:46<00:00,  1.05it/s]
100%|██████████| 1/1 [00:01<00:00,  1.75s/it]


Catalogue embeddings shape: (128253, 384)
Taxonomy embeddings shape: (34, 384)


## Save embeddings 

In [78]:
np.save("embeddings_catalog", arr=emb_catalog)
np.save("embeddings_taxonomy", arr=emb_taxonomy)

In [85]:
df_nlp

Unnamed: 0,hashed_external_id,text,sale_price,brand_encoded
0,-2772291400701920348,the hoodoo tarot a divination deck and guidebo...,35.0,0
1,-4184851053829790189,disney villains tarot deck and guidebook movie...,24.99,0
2,-8778697834751578524,easy tarot created especially for beginners th...,19.95,0
3,-3541475158234224984,the proudest blue a story of hijab and family ...,17.99,0
4,-2529310467283008815,the crystal magic tarot understand and control...,24.95,0
...,...,...,...,...
128248,-731841779633851509,columbia natural feather and down side sleeper...,77.7,551
128249,-8372033233842851509,campania international marlton planter campani...,229.6,433
128250,-5594512849126851501,beatrice home grand hotel waffle knit cotton b...,38.08,2694
128251,-402032100371851498,slickblue snow shovel with wheels with 30 inch...,96.99,1093


In [10]:
df_nlp

Unnamed: 0,hashed_external_id,text,sale_price,brand_encoded
0,-2772291400701920348,the hoodoo tarot a divination deck and guidebo...,35.00,0
1,-4184851053829790189,disney villains tarot deck and guidebook movie...,24.99,0
2,-8778697834751578524,easy tarot created especially for beginners th...,19.95,0
3,-3541475158234224984,the proudest blue a story of hijab and family ...,17.99,0
4,-2529310467283008815,the crystal magic tarot understand and control...,24.95,0
...,...,...,...,...
128248,-731841779633851509,columbia natural feather and down side sleeper...,77.70,551
128249,-8372033233842851509,campania international marlton planter campani...,229.60,433
128250,-5594512849126851501,beatrice home grand hotel waffle knit cotton b...,38.08,2694
128251,-402032100371851498,slickblue snow shovel with wheels with 30 inch...,96.99,1093


### KNN

In [16]:
# --------------------------
# 1. Scale price + brand
# --------------------------
price_scaler = MinMaxScaler()
price_scaled = price_scaler.fit_transform(df_nlp[['sale_price']])

brand_scaler = StandardScaler()
brand_scaled = brand_scaler.fit_transform(df_nlp[['brand_encoded']])

# --------------------------
# 2. UMAP reduction
# --------------------------
umap_dim = 50  # typical values: 50, 100
umap_model = umap.UMAP(
    n_components=umap_dim,
    n_neighbors=50,       # higher for global structure
    min_dist=0.1,
    metric="cosine"
)

emb_catalog_umap = umap_model.fit_transform(emb_catalog)
emb_taxonomy_umap = umap_model.transform(emb_taxonomy)

# --------------------------
# 3. Combine reduced embedding + numeric features
# --------------------------
X_products = np.hstack([emb_catalog_umap, price_scaled, brand_scaled])

zeros_tax = np.zeros((emb_taxonomy.shape[0], 2))
X_tax = np.hstack([emb_taxonomy_umap, zeros_tax])

# --------------------------
# 4. KMeans with category prototypes
# --------------------------
k = X_tax.shape[0]
kmeans = KMeans(
    n_clusters=k,
    init=X_tax,
    n_init=1,
    max_iter=300
)

kmeans.fit(X_products)
product_cluster_idx = kmeans.predict(X_products)

# --------------------------
# 5. Assign category to products
# --------------------------
categories = df_level1['level_1_category'].tolist()
cluster_to_category = {i: cat for i, cat in enumerate(categories)}
df_nlp['predicted_category'] = [cluster_to_category[i] for i in product_cluster_idx]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
