## <span style="color:#ff5f27">👩🏻‍🔬 Feature Engineering </span>


In [2]:
import time

# Wall-clock timestamp taken when the notebook starts; the final cell subtracts
# it from its own timestamp to report the total notebook run time.
notebook_start_time = time.time()

## <span style="color:#ff5f27">📝 Imports </span>

In [3]:
#!pip install -r requirements.txt --quiet

In [4]:
import random
import polars as pl
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

from features.articles import (
    compute_features_articles,
    generate_embeddings_for_dataframe,
)
from features.customers import CustomerDatasetSize, DatasetSampler, compute_features_customers
from features.transactions import compute_features_transactions, month_cos, month_sin
from features.interaction import generate_interaction_data
from features.ranking import compute_ranking_dataset  

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Data size configuration: passed to DatasetSampler in the sampling cell to
# decide how many customers are kept (SMALL sampled 1000 customers in the
# recorded run).
CUSTOMER_DATA_SIZE = CustomerDatasetSize.SMALL

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [6]:
import os

import hopsworks

# SECURITY: never commit an API key to a notebook. The key is read from the
# HOPSWORKS_API_KEY environment variable instead.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# and should be revoked in the Hopsworks UI.
# When the variable is unset, None is passed and hopsworks.login() falls back to
# its own credential resolution (presumably a cached key file or an interactive
# prompt — confirm against the installed hopsworks version).
project = hopsworks.login(api_key_value=os.environ.get("HOPSWORKS_API_KEY"))

# Handle to the project's feature store; every feature-group cell below uses it.
fs = project.get_feature_store()



2025-03-15 12:48:34,999 INFO: Initializing external client
2025-03-15 12:48:34,999 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-15 12:48:43,473 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1218722


## <span style="color:#ff5f27">🗄️ Read Articles Data</span>

The **article_id** and **product_code** serve different purposes in the context of H&M's product database:

- **Article ID**: This is a unique identifier assigned to each individual article within the database. It is typically used for internal tracking and management purposes. Each distinct item or variant of a product (e.g., different sizes or colors) would have its own unique article_id.

- **Product Code**: This identifies a specific product or style rather than an individual article, so it is not unique per row. It represents a broader category or type of product within H&M's inventory: multiple articles share the same product code when they belong to the same product line or style.

In short, the article_id uniquely identifies an individual item, whereas the product_code groups all variants of the same product or style.

Here is an example:

**Product: Basic T-Shirt**

- **Product Code:** TS001

- **Article IDs:**
    - Article ID: 1001 (Size: Small, Color: White)
    - Article ID: 1002 (Size: Medium, Color: White)
    - Article ID: 1003 (Size: Large, Color: White)
    - Article ID: 1004 (Size: Small, Color: Black)
    - Article ID: 1005 (Size: Medium, Color: Black)

In this example, "TS001" is the product code for the basic t-shirt style. Each variant of this t-shirt (e.g., different sizes and colors) has its own unique article_id.



In [7]:
# Timestamp for the load step; the timing cell that follows reads start_time.
start_time = time.time()

# Download the H&M articles catalogue straight from the public repo and ask
# polars to attempt date parsing while reading.
articles_df = pl.read_csv(
    'https://repo.hops.works/dev/jdowling/h-and-m/articles.csv',
    try_parse_dates=True,
)
print(articles_df.shape)
articles_df.head(3)


(105542, 25)


article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
i64,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,str
108775015,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775044,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775051,108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"


In [8]:
# End the timer for the articles download (start_time was set at the top of
# the load cell).
end_time = time.time()

# Calculate and print the elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 1.28 seconds


In [9]:
# Count missing (null) values per column. In the recorded run only
# detail_desc has any (416 rows).
articles_df.null_count()

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,416


## <span style="color:#ff5f27">👨🏻‍🏭 Articles Feature Engineering</span>


In [10]:
# Timestamp for the feature-engineering step; read by the next timing cell.
start_time = time.time()

# NOTE: articles_df is re-bound to the processed frame, so re-running this cell
# feeds already-processed data back into compute_features_articles. Re-run the
# load cell first if you need to repeat this step.
articles_df = compute_features_articles(articles_df)
articles_df.head(3)


article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length,article_description,image_url
str,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,u32,str,str
"""108775015""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""https://repo.hops.works/dev/jd…"
"""108775044""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""https://repo.hops.works/dev/jd…"
"""108775051""",108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",13,"""Strap top (1) - Vest top in Ga…","""https://repo.hops.works/dev/jd…"


In [11]:
# End the timer for the articles feature-engineering step.
end_time = time.time()

# Calculate and print the elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 0.69 seconds


In [12]:
# Print (rather than display) the first generated description so its embedded
# newlines render as separate lines.
print(articles_df['article_description'][0])

Strap top - Vest top in Garment Upper body
Appearance: Solid
Color: Dark Black (Black)
Category: Ladieswear - Womens Everyday Basics - Jersey Basic
Details: Jersey top with narrow shoulder straps.


## <span style="color:#ff5f27">🧬 Embeddings Creation</span>

In [13]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Sentence-transformer used to embed the article descriptions, placed on the
# backend chosen above.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

2025-03-15 12:48:47,232 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [14]:
# Compute embeddings from the "article_description" column with the model
# loaded above. This is the slowest cell of the notebook (about 46 minutes in
# the recorded run). Reduce batch_size if you hit out-of-memory errors.
articles_df = generate_embeddings_for_dataframe(
    articles_df,
    "article_description",
    model,
    batch_size=128,
)

Generating embeddings: 100%|██████████| 105542/105542 [46:22<00:00, 37.94it/s] 


In [15]:
# Show the first three descriptions next to their embeddings.
articles_df[['article_description', 'embeddings']].head(3)

article_description,embeddings
str,list[f64]
"""Strap top - Vest top in Garmen…","[-0.026782, 0.082344, … 0.022782]"
"""Strap top - Vest top in Garmen…","[-0.010396, 0.089874, … 0.022564]"
"""Strap top (1) - Vest top in Ga…","[-0.032753, 0.091124, … 0.022804]"


## <span style="color:#ff5f27">🔗 Image Links</span>

In [16]:
# Inspect the image URL of the first article.
articles_df["image_url"][0]

'https://repo.hops.works/dev/jdowling/h-and-m/images/010/0108775015.jpg'

In [17]:
from IPython.display import HTML, display

# Image URLs of the last 12 articles in the frame.
image_urls = articles_df["image_url"].tail(12).to_list()

# One <img> tag per URL, concatenated in order.
image_tags = "".join(
    f'<img src="{url}" style="width: 100%; height: auto;">' for url in image_urls
)

# Wrap the tags in a 6-column CSS grid and render it inline.
grid_html = (
    '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'
    + image_tags
    + "</div>"
)

display(HTML(grid_html))

---
## <span style="color:#ff5f27">🗄️ Read Customers Data</span>

In [18]:
# Timestamp for the load step; the timing cell that follows reads start_time.
start_time = time.time()

# Download the customers table and ask polars to attempt date parsing.
customers_df = pl.read_csv(
    'https://repo.hops.works/dev/jdowling/h-and-m/customers.csv',
    try_parse_dates=True,
)
print(customers_df.shape)
customers_df.head(3)

(1371980, 7)


customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
str,f64,f64,str,str,i64,str
"""00000dbacae5abe5e23885899a1fa4…",,,"""ACTIVE""","""NONE""",49,"""52043ee2162cf5aa7ee79974281641…"
"""0000423b00ade91418cceaf3b26c6a…",,,"""ACTIVE""","""NONE""",25,"""2973abc54daa8a5f8ccfe9362140c6…"
"""000058a12d5b43e67d225668fa1f8d…",,,"""ACTIVE""","""NONE""",24,"""64f17e6a330a85798e4998f62d0930…"


In [19]:
# End the timer for the customers download.
end_time = time.time()

# Calculate and print the elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 5.91 seconds


## <span style="color:#ff5f27">👨🏻‍🏭 Customers Feature Engineering</span>


In [20]:
# Timestamp for the feature-engineering step; read by the next timing cell.
start_time = time.time()

# NOTE: customers_df is re-bound to the processed frame, so this cell is not
# idempotent; re-run the load cell before repeating it.
# drop_null_age=True presumably removes customers without an age — confirm in
# features/customers.py.
customers_df = compute_features_customers(
    customers_df,
    drop_null_age=True,
)
customers_df.head(3)

customer_id,club_member_status,age,postal_code,age_group
str,str,f64,str,str
"""00000dbacae5abe5e23885899a1fa4…","""ACTIVE""",49.0,"""52043ee2162cf5aa7ee79974281641…","""46-55"""
"""0000423b00ade91418cceaf3b26c6a…","""ACTIVE""",25.0,"""2973abc54daa8a5f8ccfe9362140c6…","""19-25"""
"""000058a12d5b43e67d225668fa1f8d…","""ACTIVE""",24.0,"""64f17e6a330a85798e4998f62d0930…","""19-25"""


In [21]:
# End the timer for the customers feature-engineering step.
end_time = time.time()

# Calculate and print the elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 0.43 seconds


---
## <span style="color:#ff5f27">🗄️ Read Transactions Data</span>

In [22]:
# Timestamp for the load step; the timing cell that follows reads start_time.
start_time = time.time()

# Download the full transactions history (about 31.8M rows in the recorded
# run, so this is the slowest read) and ask polars to attempt date parsing.
transactions_df = pl.read_csv(
    'https://repo.hops.works/dev/jdowling/h-and-m/transactions_train.csv',
    try_parse_dates=True,
)
print(transactions_df.shape)
transactions_df.head(3)

(31788324, 5)


t_dat,customer_id,article_id,price,sales_channel_id
date,str,i64,f64,i64
2018-09-20,"""000058a12d5b43e67d225668fa1f8d…",663713001,0.050831,2
2018-09-20,"""000058a12d5b43e67d225668fa1f8d…",541518023,0.030492,2
2018-09-20,"""00007d2de826758b65a93dd24ce629…",505221004,0.015237,2


In [23]:
# End the timer for the transactions download.
end_time = time.time()

# Calculate and print the elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 109.24 seconds


## <span style="color:#ff5f27">👨🏻‍🏭 Transactions Feature Engineering</span>

The time of year a purchase was made should be a strong predictor, as seasonality plays a big role in fashion purchases. Here, you will use the month of the purchase as a feature. Since this is a cyclical feature (January is as close to December as it is to February), you'll map each month to the unit circle using sine and cosine.

In [24]:
# Timestamp for the feature-engineering step; read by the next timing cell.
start_time = time.time()

# NOTE: transactions_df is re-bound to the processed frame, so this cell is
# not idempotent; re-run the load cell before repeating it.
transactions_df = compute_features_transactions(transactions_df)
transactions_df.head(3)

t_dat,customer_id,article_id,price,sales_channel_id,year,month,day,day_of_week
i64,str,str,f64,i64,i32,i8,i8,i8
1537401600000,"""000058a12d5b43e67d225668fa1f8d…","""663713001""",0.050831,2,2018,9,20,4
1537401600000,"""000058a12d5b43e67d225668fa1f8d…","""541518023""",0.030492,2,2018,9,20,4
1537401600000,"""00007d2de826758b65a93dd24ce629…","""505221004""",0.015237,2,2018,9,20,4


In [25]:
# End the timer for the transactions feature-engineering step.
end_time = time.time()

# Calculate and print the elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 2.32 seconds


## <span style="color:#ff5f27">✂️ Data Sampling</span>


In [26]:
# Down-sample customers (and their transactions) to the configured size.
sampler = DatasetSampler(size=CUSTOMER_DATA_SIZE)

# `transations_df` (sic) is the keyword name this call uses; keep the spelling
# unless DatasetSampler.sample is renamed too.
# NOTE(review): confirm against features/customers.py.
dataset_subset = sampler.sample(
    customers_df=customers_df,
    transations_df=transactions_df,
)

# From here on both names refer to the sampled subsets, not the full tables.
customers_df = dataset_subset["customers"]
transactions_df = dataset_subset["transactions"]

✂️ Sampling 1000 customers.
⛳️ Number of transactions for all the customers: 31788324
⛳️ Number of transactions for the 1000 sampled customers: 23799


---

## <span style="color:#ff5f27">🤳🏻 Interaction Data</span>


In [27]:
# Build the interaction dataset from the sampled transactions.
interaction_df = generate_interaction_data(transactions_df)

print(interaction_df.shape)
interaction_df.head()

Processing customer chunks: 100%|██████████| 1/1 [00:14<00:00, 14.81s/it]


(135046, 5)


t_dat,customer_id,article_id,interaction_score,prev_article_id
i64,str,str,i64,str
1539316800000,"""00b203a32faa3d007dba198ef27c15…","""617919021""",1,"""START"""
1539320400000,"""00b203a32faa3d007dba198ef27c15…","""617919021""",1,"""617919021"""
1539475200000,"""00b203a32faa3d007dba198ef27c15…","""617919021""",2,"""617919021"""
1539676800000,"""00b203a32faa3d007dba198ef27c15…","""639576002""",1,"""617919021"""
1539694800000,"""00b203a32faa3d007dba198ef27c15…","""675508001""",1,"""639576002"""


In [28]:
# Count how many interactions fall under each interaction_score value.
interaction_df.group_by('interaction_score').agg(pl.count('interaction_score').alias('total_interactions'))

interaction_score,total_interactions
i64,u32
0,72998
1,38249
2,23799


Here is what each score means:

- `0` : No interaction between a customer and an item
- `1` : A customer clicked an item
- `2` : A customer bought an item

---

## <span style="color:#ff5f27">🪄 Feature Group Creation </span>

A [feature group](https://docs.hopsworks.ai/feature-store-api/latest/generated/feature_group/) can be seen as a collection of conceptually related features.

Before you can create a feature group you need to connect to your feature store.

To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group.

### <span style="color:#ff5f27">⛳️ Customers </span>


In [29]:
# Register (or fetch) the customers feature group. Only metadata is defined
# here; the data is written by the insert() call in the next cell.
customers_fg = fs.get_or_create_feature_group(
    name="customers",
    version=1,
    primary_key=["customer_id"],
    online_enabled=True,
    description="Customers data including age and postal code",
)

Here you have also set `online_enabled=True`, which enables low latency access to the data. A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).

At this point, you have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent you populate it with its associated data using the `insert` method.

In [30]:
# Write the sampled customers to the feature group. In the recorded run this
# uploads the rows and launches an offline materialization job.
customers_fg.insert(customers_df)
print('✅ Done!')

Uploading Dataframe: 100.00% |██████████| Rows 1000/1000 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: customers_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1218722/jobs/named/customers_1_offline_fg_materialization/executions
✅ Done!


In [31]:
# Human-readable descriptions for each customers feature, built from
# (feature name, description) pairs.
feature_descriptions = [
    {"name": feature_name, "description": feature_doc}
    for feature_name, feature_doc in (
        ("customer_id", "Unique identifier for each customer."),
        ("club_member_status", "Membership status of the customer in the club."),
        ("age", "Age of the customer."),
        ("postal_code", "Postal code associated with the customer's address."),
        ("age_group", "Categorized age group of the customer."),
    )
]

# Attach each description to its feature in the customers feature group.
for feature in feature_descriptions:
    customers_fg.update_feature_description(
        feature["name"],
        feature["description"],
    )

Let's do the same thing for the rest of the data frames.

### <span style="color:#ff5f27">⛳️ Transactions </span>


In [32]:
# Register (or fetch) the transactions feature group. The month_sin/month_cos
# transformation functions imported from features.transactions are attached
# here, and t_dat is declared as the event-time column.
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    description="Transactions data including customer, item, price, sales channel and transaction date",
    version=1,
    primary_key=["customer_id", "article_id"],
    event_time="t_dat",
    online_enabled=True,
    transformation_functions=[month_sin, month_cos],
)

# Write the sampled transactions.
trans_fg.insert(transactions_df)
print('✅ Done!')

Uploading Dataframe: 100.00% |██████████| Rows 23799/23799 | Elapsed Time: 00:04 | Remaining Time: 00:00


Launching job: transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1218722/jobs/named/transactions_1_offline_fg_materialization/executions
✅ Done!


In [33]:
# Human-readable descriptions for each transactions feature, built from
# (feature name, description) pairs.
feature_descriptions = [
    {"name": feature_name, "description": feature_doc}
    for feature_name, feature_doc in (
        ("t_dat", "Timestamp of the data record."),
        ("customer_id", "Unique identifier for each customer."),
        ("article_id", "Identifier for the purchased article."),
        ("price", "Price of the purchased article."),
        ("sales_channel_id", "Identifier for the sales channel."),
        ("year", "Year of the transaction."),
        ("month", "Month of the transaction."),
        ("day", "Day of the transaction."),
        ("day_of_week", "Day of the week of the transaction."),
        ("month_sin", "Sine of the month used for seasonal patterns."),
        ("month_cos", "Cosine of the month used for seasonal patterns."),
    )
]

# Attach each description to its feature in the transactions feature group.
for feature in feature_descriptions:
    trans_fg.update_feature_description(
        feature["name"],
        feature["description"],
    )

### <span style="color:#ff5f27">⛳️ Interactions </span>


In [34]:
# Register (or fetch) the interactions feature group, keyed by customer and
# article with t_dat as the event-time column.
interactions_fg = fs.get_or_create_feature_group(
    name="interactions",
    description="Customer interactions with articles including purchases, clicks, and ignores. Used for building recommendation systems and analyzing user behavior.",
    version=1,
    primary_key=["customer_id", "article_id"],
    event_time="t_dat",
    online_enabled=True,
)

# Write the interaction data built in the Interaction Data section.
interactions_fg.insert(interaction_df)
print('✅ Done!')

Uploading Dataframe: 100.00% |██████████| Rows 135046/135046 | Elapsed Time: 00:10 | Remaining Time: 00:00


Launching job: interactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1218722/jobs/named/interactions_1_offline_fg_materialization/executions
✅ Done!


In [35]:
# Human-readable descriptions for each interactions feature, built from
# (feature name, description) pairs.
feature_descriptions = [
    {"name": feature_name, "description": feature_doc}
    for feature_name, feature_doc in (
        ("t_dat", "Timestamp of the interaction."),
        ("customer_id", "Unique identifier for each customer."),
        ("article_id", "Identifier for the article that was interacted with."),
        ("interaction_score", "Type of interaction: 0 = ignore, 1 = click, 2 = purchase."),
        ("prev_article_id", "Previous article that the customer interacted with, useful for sequential recommendation patterns."),
    )
]

# Attach each description to its feature in the interactions feature group.
for feature in feature_descriptions:
    interactions_fg.update_feature_description(
        feature["name"],
        feature["description"],
    )

### <span style="color:#ff5f27">⛳️ Articles </span>


In [36]:
from hsfs.feature import Feature

# Explicit schema for the articles feature group, one row per feature:
# (name, offline type, description). Order is preserved in `features`.
_ARTICLE_FEATURE_SPECS = [
    ('article_id', 'string', "Identifier for the article."),
    ('product_code', 'bigint', "Code associated with the product."),
    ('prod_name', 'string', "Name of the product."),
    ('product_type_no', 'bigint', "Number associated with the product type."),
    ('product_type_name', 'string', "Name of the product type."),
    ('product_group_name', 'string', "Name of the product group."),
    ('graphical_appearance_no', 'bigint', "Number associated with graphical appearance."),
    ('graphical_appearance_name', 'string', "Name of the graphical appearance."),
    ('colour_group_code', 'bigint', "Code associated with the colour group."),
    ('colour_group_name', 'string', "Name of the colour group."),
    ('perceived_colour_value_id', 'bigint', "ID associated with perceived colour value."),
    ('perceived_colour_value_name', 'string', "Name of the perceived colour value."),
    ('perceived_colour_master_id', 'bigint', "ID associated with perceived colour master."),
    ('perceived_colour_master_name', 'string', "Name of the perceived colour master."),
    ('department_no', 'bigint', "Number associated with the department."),
    ('department_name', 'string', "Name of the department."),
    ('index_code', 'string', "Code associated with the index."),
    ('index_name', 'string', "Name of the index."),
    ('index_group_no', 'bigint', "Number associated with the index group."),
    ('index_group_name', 'string', "Name of the index group."),
    ('section_no', 'bigint', "Number associated with the section."),
    ('section_name', 'string', "Name of the section."),
    ('garment_group_no', 'bigint', "Number associated with the garment group."),
    ('garment_group_name', 'string', "Name of the garment group."),
    ('prod_name_length', 'bigint', "Length of the product name."),
    ('article_description', 'string', "Description of the article."),
    ('embeddings', 'array<double>', "Vector embeddings of the article description."),
    ('image_url', 'string', "URL of the product image."),
]

# Extra Feature keyword arguments for specific features. The description text
# gets an explicit online-store column type (presumably because it can exceed
# the default string width — confirm against Hopsworks defaults).
_ARTICLE_FEATURE_EXTRAS = {
    'article_description': {"online_type": "VARCHAR(5800)"},
}

features = [
    Feature(
        name=feature_name,
        type=feature_type,
        description=feature_doc,
        **_ARTICLE_FEATURE_EXTRAS.get(feature_name, {}),
    )
    for feature_name, feature_type, feature_doc in _ARTICLE_FEATURE_SPECS
]

In [37]:
from hsfs import embedding

# Embedding index for the articles feature group: register the "embeddings"
# column with the dimensionality reported by the sentence-transformer model.
emb = embedding.EmbeddingIndex()
emb.add_embedding("embeddings", model.get_sentence_embedding_dimension())

In [38]:
# Register (or fetch) the articles feature group with the explicit schema and
# the embedding index defined in the two cells above.
articles_fg = fs.get_or_create_feature_group(
    name="articles",
    description="Fashion items data including type of item, visual description and category",
    version=1,
    primary_key=["article_id"],
    features=features,
    embedding_index=emb,
    online_enabled=True,
)

# wait_for_job=True blocks until the materialization job finishes (see the
# "Waiting for execution to finish" log lines in the recorded output).
articles_fg.insert(articles_df, write_options={"wait_for_job": True})
print('✅ Done!')

Uploading Dataframe: 100.00% |██████████| Rows 105542/105542 | Elapsed Time: 02:32 | Remaining Time: 00:00


Launching job: articles_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1218722/jobs/named/articles_1_offline_fg_materialization/executions
2025-03-15 13:41:56,272 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2025-03-15 13:41:59,459 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-03-15 13:42:56,435 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-03-15 13:52:17,181 INFO: Waiting for log aggregation to finish.
2025-03-15 13:53:15,416 INFO: Execution finished successfully.
✅ Done!


## <span style="color:#ff5f27">📊 Ranking Dataset </span>


In [39]:
# Start the timer for building the ranking dataset (stopped two cells below).
start_time = time.time()

In [40]:
# Build the ranking dataset from the transactions, articles and customers
# feature groups. The arguments are positional, so their order matters.
ranking_df = compute_ranking_dataset(trans_fg, articles_fg, customers_fg)
ranking_df.head(3)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.84s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (7.06s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.60s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (44.36s) 


customer_id,age,article_id,label,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name
str,f64,str,i32,str,str,str,str,str,str,str,str,str,str,str
"""d63bcb721eda840aef84cd36c391a6…",22.0,"""865624003""",1,"""Shorts""","""Garment Lower body""","""All over pattern""","""Other Orange""","""Light""","""Orange""","""Shorts""","""Divided""","""Divided""","""Divided Collection""","""Shorts"""
"""104ee8cfc19a55dc2389b677de3b3b…",33.0,"""554450027""",1,"""Trousers""","""Garment Lower body""","""Solid""","""Dark Grey""","""Dark""","""Grey""","""Trousers""","""Divided""","""Divided""","""Divided Collection""","""Trousers"""
"""4314540251fce1cb1dbbecae592f95…",26.0,"""651369001""",1,"""Trousers""","""Garment Lower body""","""Solid""","""Black""","""Dark""","""Black""","""Jersey""","""Ladieswear""","""Ladieswear""","""Womens Casual""","""Jersey Fancy"""


In [41]:
# End the timer for building the ranking dataset.
end_time = time.time()

# Calculate and print the elapsed wall-clock time in seconds.
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 60.02 seconds


In [42]:
# Class balance of the ranking labels. The recorded run has ten negatives
# (label 0) for every positive (label 1).
ranking_df.get_column("label").value_counts()

label,count
i32,u32
1,21425
0,214250


In [43]:
# Register (or fetch) the ranking feature group. `parents` lists the feature
# groups it was derived from (presumably recorded for lineage — confirm in the
# Hopsworks docs). online_enabled is not set for this group.
rank_fg = fs.get_or_create_feature_group(
    name="ranking",
    description="Derived feature group for ranking",
    version=1,
    primary_key=["customer_id", "article_id"],
    parents=[articles_fg, customers_fg, trans_fg],
)

# Write the ranking dataset.
rank_fg.insert(ranking_df)
print('✅ Done!')

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1218722/fs/1206352/fg/1418653


Uploading Dataframe: 100.00% |██████████| Rows 235675/235675 | Elapsed Time: 00:37 | Remaining Time: 00:00


Launching job: ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1218722/jobs/named/ranking_1_offline_fg_materialization/executions
✅ Done!


In [44]:
# Human-readable descriptions for every column of the ranking feature group.
ranking_feature_descriptions = [
    {"name": "customer_id", "description": "Unique identifier for each customer."},
    {"name": "article_id", "description": "Identifier for the purchased article."},
    {"name": "age", "description": "Age of the customer."},
    {"name": "product_type_name", "description": "Name of the product type."},
    {"name": "product_group_name", "description": "Name of the product group."},
    {"name": "graphical_appearance_name", "description": "Name of the graphical appearance."},
    {"name": "colour_group_name", "description": "Name of the colour group."},
    {"name": "perceived_colour_value_name", "description": "Name of the perceived colour value."},
    {"name": "perceived_colour_master_name", "description": "Name of the perceived colour master."},
    {"name": "department_name", "description": "Name of the department."},
    {"name": "index_name", "description": "Name of the index."},
    {"name": "index_group_name", "description": "Name of the index group."},
    {"name": "section_name", "description": "Name of the section."},
    {"name": "garment_group_name", "description": "Name of the garment group."},
    {"name": "label", "description": "Label indicating whether the article was purchased (1) or not (0)."},
]

# Attach the descriptions to the ranking feature group, as is done for the
# customers, transactions and interactions groups. Without this loop the list
# above is never used and the ranking features stay undocumented in Hopsworks.
for desc in ranking_feature_descriptions:
    rank_fg.update_feature_description(desc["name"], desc["description"])

You should now be able to inspect the feature groups in the Hopsworks UI.

---

In [45]:
# End the notebook-wide timer (notebook_start_time was set in the first cell).
notebook_end_time = time.time()

# Calculate and print the total wall-clock time of the notebook in seconds.
notebook_execution_time = notebook_end_time - notebook_start_time
print(f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds")

⌛️ Notebook Execution time: 4037.47 seconds


---
## <span style="color:#ff5f27">⏩️ Next Steps </span>
In the next notebook you'll train a retrieval model.