# Articles Feature Engineering

Define constants and configurations

In [48]:
from typing import List

import torch
from loguru import logger
from tqdm.auto import tqdm

# Identifier of the SentenceTransformer model used to embed the article descriptions
FEATURES_EMBEDDING_MODEL_ID: str = 'all-MiniLM-L6-v2'

Unzip the datasets

In [None]:
import zipfile
from pathlib import Path

# Setup path to data folder
data_path = Path('../data/')
unzipped_data_path = data_path / 'unzipped'

# If the extraction folder does not exist yet, create it
if not unzipped_data_path.is_dir():
    print(f'{unzipped_data_path} does not exist, creating one...')
    unzipped_data_path.mkdir(parents=True, exist_ok=True)

# Unzip the raw articles data into the extraction folder
with zipfile.ZipFile(data_path / 'raw' / 'articles.csv.zip', 'r') as zip_ref:
    print('Unzipping articles dataset...')
    zip_ref.extractall(unzipped_data_path)

Unzipping articles dataset...
Unzipping customers dataset...
Unzipping transactions training dataset...


In [28]:
import pandas as pd

# Load the extracted articles CSV into a dataframe and show its (rows, columns)
articles_df = pd.read_csv(unzipped_data_path / 'articles.csv')
articles_df.shape

(105542, 25)

In [29]:
articles_df.head(5)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [30]:
import random

# See a sample article data.
# random.randrange excludes the upper bound, so the position is always valid for
# .iloc; random.randint(0, len(...)) includes it and can raise IndexError.
articles_df.iloc[random.randrange(len(articles_df))]

article_id                                                              481111011
product_code                                                               481111
prod_name                                                     Trinity Push Mirny^
product_type_no                                                               306
product_type_name                                                             Bra
product_group_name                                                      Underwear
graphical_appearance_no                                                   1010016
graphical_appearance_name                                                   Solid
colour_group_code                                                              51
colour_group_name                                                      Light Pink
perceived_colour_value_id                                                       1
perceived_colour_value_name                                           Dusty Light
perceived_colour

In [31]:
# Let's examine the columns' data types
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

In [32]:
# And also check for NaN values
articles_df.isna().sum()

article_id                        0
product_code                      0
prod_name                         0
product_type_no                   0
product_type_name                 0
product_group_name                0
graphical_appearance_no           0
graphical_appearance_name         0
colour_group_code                 0
colour_group_name                 0
perceived_colour_value_id         0
perceived_colour_value_name       0
perceived_colour_master_id        0
perceived_colour_master_name      0
department_no                     0
department_name                   0
index_code                        0
index_name                        0
index_group_no                    0
index_group_name                  0
section_no                        0
section_name                      0
garment_group_no                  0
garment_group_name                0
detail_desc                     416
dtype: int64

As we can see, `article_id` is stored as a number. We will convert it to a string to make it clear that it is an identifier with no numerical significance. Additionally, the `detail_desc` column has some missing values. To handle this, we will create our own product description that summarises the appearance, color, and category, and appends `detail_desc` when it is available.

In [33]:
# A utility function to create an article description.
def create_article_description(row):
    """Build a multi-line text description for a single article row.

    The description always contains the name/type/group headline, the
    appearance, the colour and the category. The value of ``detail_desc`` is
    appended as a final line only when it is neither NaN nor empty.

    Args:
        row: Mapping-like object (e.g. a dataframe row) indexed by column name.

    Returns:
        The assembled description string.
    """
    headline = f"{row['prod_name']} - {row['product_type_name']} in {row['product_group_name']}"
    appearance = f"\nAppearance: {row['graphical_appearance_name']}"
    colour = f"\nColor: {row['perceived_colour_value_name']} {row['perceived_colour_master_name']} ({row['colour_group_name']})"
    category = f"\nCategory: {row['index_group_name']} = {row['section_name']} - {row['garment_group_name']}"

    # Only rows with a present, non-empty detail_desc get the extra details line
    detail_text = row['detail_desc']
    if pd.notna(detail_text) and detail_text:
        details = f"\nDetails {detail_text}"
    else:
        details = ''

    return headline + appearance + colour + category + details

In [34]:
# A utility function to compute the articles features and update the dataframe
def compute_article_features(df):
    """Return a new dataframe with the engineered article features.

    The result has ``article_id`` cast to string and a new
    ``article_description`` column built row by row with
    ``create_article_description``. Every column containing at least one null
    is then dropped, followed by ``detail_desc`` / ``detail_desc_length`` if
    either is still present.

    Args:
        df: Articles dataframe to derive the features from.

    Returns:
        The transformed dataframe.
    """
    # Compute both derived columns from the incoming frame before assigning them
    string_ids = df['article_id'].astype(str)
    descriptions = df.apply(create_article_description, axis=1)
    enriched = df.assign(article_id=string_ids, article_description=descriptions)

    # Drop columns with any nulls
    enriched = enriched.dropna(axis=1, how='any')

    # detail_desc is already folded into article_description, so remove it
    # (and its length column) when they survived the null filter
    redundant_columns = []
    for column in ['detail_desc', 'detail_desc_length']:
        if column in enriched.columns:
            redundant_columns.append(column)

    return enriched.drop(columns=redundant_columns)

In [35]:
# Build the article features; this rebinds articles_df to the processed frame
articles_df = compute_article_features(articles_df)
articles_df.shape

(105542, 25)

In [38]:
articles_df.head(3)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,article_description
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Strap top - Vest top in Garment Upper body\nAp...
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Strap top - Vest top in Garment Upper body\nAp...
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Strap top (1) - Vest top in Garment Upper body...


Next, we will turn the article descriptions into their embedding counterparts. For this purpose, we will use HuggingFace's [SentenceTransformer](https://huggingface.co/sentence-transformers); specifically, we will use the "all-MiniLM-L6-v2" embedding model. (There isn't a particular reason why I chose this model; I use it because it is the one shown in HF's docs.)

This embedding is the one that will be fed to the recommender model (the candidate tower).

In [41]:
from sentence_transformers import SentenceTransformer

In [44]:
# Log the first three generated descriptions as a quick sanity check
for i, desc in enumerate(articles_df['article_description'].head(n=3)):
    logger.info(f'Item {i+1}:\n{desc}')

[32m2025-11-29 12:47:41.904[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
Strap top - Vest top in Garment Upper body
Appearance: Solid
Color: Dark Black (Black)
Category: Ladieswear = Womens Everyday Basics - Jersey Basic
Details Jersey top with narrow shoulder straps.[0m
[32m2025-11-29 12:47:41.905[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 2:
Strap top - Vest top in Garment Upper body
Appearance: Solid
Color: Light White (White)
Category: Ladieswear = Womens Everyday Basics - Jersey Basic
Details Jersey top with narrow shoulder straps.[0m
[32m2025-11-29 12:47:41.906[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 3:
Strap top (1) - Vest top in Garment Upper body
Appearance: Stripe
Color: Dusty Light White (Off White)
Category: Ladieswear = Womens Everyday Basics - Jersey Basic
Details Jersey top with narrow shoulder straps.[0m


In [46]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise fall back to CPU
device = (
    'cuda'
    if torch.cuda.is_available()
    else 'mps'
    if torch.backends.mps.is_available()
    else 'cpu'
)
# f-string so the model id and device are actually interpolated into the message;
# without the prefix the placeholders are logged literally.
logger.info(f"Loading '{FEATURES_EMBEDDING_MODEL_ID}' model to {device}...")

model = SentenceTransformer(FEATURES_EMBEDDING_MODEL_ID, device=device)

[32m2025-11-29 12:51:15.230[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading '{FEATURES_EMBEDDING_MODEL_ID}' model to {device}...[0m


In [49]:
# Utility function to generate the embeddings and add it to the dataframe
def generate_embeddings_for_dataframe(
    df, text_column: str, model: SentenceTransformer, batch_size: int = 32
):
    """Embed a text column and return a copy of the dataframe with the vectors.

    Args:
        df: Dataframe holding the texts to embed; it is not modified.
        text_column: Name of the column in ``df`` that contains the texts.
        model: Model whose ``encode`` method turns a list of texts into
            embeddings. Encoding runs on whatever device the model was
            loaded on.
        batch_size: Number of texts handed to ``model.encode`` per call. It is
            also forwarded as the encoder's own batch size so the requested
            value is the one actually used.

    Returns:
        A copy of ``df`` with an additional ``embeddings`` column holding one
        embedding per row.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    texts: List[str] = df[text_column].tolist()

    text_embeddings = []
    # The context manager closes the progress bar even if encoding fails midway
    with tqdm(total=len(texts), desc='Generating embeddings') as pbar:
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start : start + batch_size]
            # No explicit device: rely on the model's own device rather than a
            # notebook-level global, so the function works with any loaded model.
            batch_embeddings = model.encode(
                batch_texts,
                batch_size=batch_size,
                show_progress_bar=False,
            )
            text_embeddings.extend(batch_embeddings)
            pbar.update(len(batch_texts))

    df_with_embeddings = df.copy()
    df_with_embeddings['embeddings'] = text_embeddings

    return df_with_embeddings

In [50]:
# Embed every article description; this rebinds articles_df to the frame that
# carries the extra 'embeddings' column
articles_df = generate_embeddings_for_dataframe(
    articles_df, text_column='article_description', model=model, batch_size=128
)

Generating embeddings: 100%|██████████| 105542/105542 [01:54<00:00, 922.85it/s]


Now, for each article description, we have the vector embeddings which we can feed into the model.

In [55]:
articles_df[['article_description', 'embeddings']].head(3)

Unnamed: 0,article_description,embeddings
0,Strap top - Vest top in Garment Upper body\nAp...,"[-0.028278721, 0.06428172, -0.01930175, 0.0155..."
1,Strap top - Vest top in Garment Upper body\nAp...,"[-0.00960519, 0.07461483, -0.00093427574, 0.01..."
2,Strap top (1) - Vest top in Garment Upper body...,"[-0.0335716, 0.084523894, 0.0074015465, -0.007..."


Next, we pickle it to save the processed articles data.

In [56]:
# Make sure the output folder exists before writing; to_pickle does not create
# missing parent directories.
processed_data_path = data_path / 'processed'
processed_data_path.mkdir(parents=True, exist_ok=True)
articles_df.to_pickle(processed_data_path / 'articles.pkl')