In [2]:
import time

# Record the wall-clock time (seconds since the epoch) at which this cell ran.
# NOTE(review): presumably read later to report the notebook's total run time — confirm.
notebook_start_time = time.time()

In [11]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    """Report whether this notebook is executing inside Google Colab.

    The check looks for the substring ``google.colab`` in the string form of
    the active IPython shell object returned by ``get_ipython()``.

    Returns:
        True when the substring is present, otherwise False.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description


def clone_repository() -> None:
    !git clone https://github.com/decodingml/hands-on-recommender-system.git
    %cd hands-on-recommender-system/


def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml


# On Google Colab, fetch the repository and install its dependencies first;
# a local run needs no extra setup.
running_in_colab = is_google_colab()
if running_in_colab:
    clone_repository()
    install_dependencies()

# In Colab the working directory itself is used as the root; locally the root
# is taken to be the parent of the current working directory.
current_dir = Path().absolute()
root_dir = str(current_dir if running_in_colab else current_dir.parent)
print("⛳️ Google Colab environment" if running_in_colab else "⛳️ Local environment")

# Make the `recsys` Python module importable from the notebook by putting the
# root directory on `sys.path` (only once, so re-running the cell is harmless).
if root_dir not in sys.path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

⛳️ Local environment
Adding the following directory to the PYTHONPATH: /Users/xayhanmonty/Desktop/smart-fit-recs


In [13]:
# Load IPython's `autoreload` extension and select mode 2, which reloads all
# modules (except those excluded via %aimport) before executing each cell, so
# edits to imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

# Install a blanket filter that ignores all warnings raised from this point on.
# NOTE(review): this also hides deprecation/runtime warnings from library code — confirm that is intended.
warnings.filterwarnings("ignore")

from recsys.config import settings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from recsys.raw_data_sources import h_m as h_and_m_raw_data

# Extract the raw articles table, then log its shape and a preview of the first rows.
# NOTE(review): the returned object is presumably a Polars DataFrame (the notebook imports `polars`) — confirm.
articles_df = h_and_m_raw_data.extract_articles_df()
logger.info(f"Articles shape: {articles_df.shape}")
logger.info(articles_df.head())


[32m2025-07-24 16:38:54.550[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mArticles shape: (105542, 25)[0m
[32m2025-07-24 16:38:54.551[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mshape: (5, 25)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ article_i ┆ product_c ┆ prod_name ┆ product_t ┆ … ┆ section_n ┆ garment_g ┆ garment_g ┆ detail_d │
│ d         ┆ ode       ┆ ---       ┆ ype_no    ┆   ┆ ame       ┆ roup_no   ┆ roup_name ┆ esc      │
│ ---       ┆ ---       ┆ str       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│ i64       ┆ i64       ┆           ┆ i64       ┆   ┆ str       ┆ i64       ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 108775015 ┆ 108775    ┆ Strap top ┆ 253       ┆ … ┆ Womens    ┆ 1002      ┆ Jersey    ┆ Jersey   │
│           ┆           ┆      