In [1]:
import numpy as np
import polars as pl

# Data exploration for RecSys 2025

For the purpose of the efficiency and processing multiple datasets with millions of rows, we decided to use **Polars** instead of Pandas, because of its Lazy API, which enables us to better manage memory.


In [2]:
data_folder = "../data"

data = {
    "add_to_cart": pl.scan_parquet(f"{data_folder}/add_to_cart.parquet", try_parse_hive_dates=True),
    "page_visit": pl.scan_parquet(f"{data_folder}/page_visit.parquet"),
    "product_buy": pl.scan_parquet(f"{data_folder}/product_buy.parquet"),
    "product_properties": pl.scan_parquet(f"{data_folder}/product_properties.parquet"),
    "remove_from_cart": pl.scan_parquet(f"{data_folder}/remove_from_cart.parquet"),
    "search_query": pl.scan_parquet(f"{data_folder}/search_query.parquet"),
}

inputs = {
    "relevant_clients": np.load(f"{data_folder}/input/relevant_clients.npy", mmap_mode="r")
}

targets = {
    "active_clients": np.load(f"{data_folder}/target/active_clients.npy", mmap_mode="r"),
    "popularity_propensity_category": np.load(f"{data_folder}/target/popularity_propensity_category.npy", mmap_mode="r"),
    "popularity_propensity_sku": np.load(f"{data_folder}/target/popularity_propensity_sku.npy", mmap_mode="r"),
    "propensity_category": np.load(f"{data_folder}/target/propensity_category.npy", mmap_mode="r"),
    "propensity_sku": np.load(f"{data_folder}/target/propensity_sku.npy", mmap_mode="r"),
}


In [3]:
for df in data:
    print(f"{df}:")
    print(data[df].head())
    print(data[df].describe())

add_to_cart:
naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
  Parquet SCAN [../data/add_to_cart.parquet]
  PROJECT */3 COLUMNS
shape: (9, 4)
┌────────────┬─────────────┬─────────────────────┬───────────────┐
│ statistic  ┆ client_id   ┆ timestamp           ┆ sku           │
│ ---        ┆ ---         ┆ ---                 ┆ ---           │
│ str        ┆ f64         ┆ str                 ┆ f64           │
╞════════════╪═════════════╪═════════════════════╪═══════════════╡
│ count      ┆ 5.235882e6  ┆ 5235882             ┆ 5.235882e6    │
│ null_count ┆ 0.0         ┆ 0                   ┆ 0.0           │
│ mean       ┆ 1.1941e7    ┆ null                ┆ 748255.043147 │
│ std        ┆ 6.8914e6    ┆ null                ┆ 433663.464805 │
│ min        ┆ 5.0         ┆ 2022-05-23 00:10:15 ┆ 0.0           │
│ 25%        ┆ 5.964675e6  ┆ null                ┆ 370909.0      │
│ 50%        ┆ 1.194213e7  ┆ null                ┆ 748335.0      

In [None]:
clients = pl.concat([data[df].select("client_id") for df in data if df != "product_properties"])
unique_clients = clients.select(pl.col("client_id").n_unique()).collect().item()
print(f"Unique clients: {unique_clients}")

Unique clients: 18889063
