In [1]:
from pathlib import Path
import json
import re
import pandas as pd
import numpy as np

# All three paths used by this notebook are derived from the interim data folder.
DATA_DIR = Path("../data")
INTERIM_DIR = DATA_DIR.joinpath("interim")

# Inputs
ORDERS_PATH = INTERIM_DIR.joinpath("orders_landing.parquet")
SKU_LOOKUP_PATH = INTERIM_DIR.joinpath("sku_lookup.json")

# Output
OUT_LINES_PATH = INTERIM_DIR.joinpath("order_lines_canonical.parquet")

# Echo the resolved locations so a wrong working directory is obvious at a glance.
for _label, _path in (
    ("Orders:", ORDERS_PATH),
    ("SKU lookup:", SKU_LOOKUP_PATH),
    ("Output:", OUT_LINES_PATH),
):
    print(_label, _path)


Orders: ../data/interim/orders_landing.parquet
SKU lookup: ../data/interim/sku_lookup.json
Output: ../data/interim/order_lines_canonical.parquet


In [2]:
# Orders landing table, read as-is from the interim folder.
orders = pd.read_parquet(ORDERS_PATH)

# Lookup loaded from JSON; its keys are cleaned (lowercased) product names.
sku_lookup = json.loads(SKU_LOOKUP_PATH.read_text(encoding="utf-8"))

# Quick size check: (rows, columns) of the orders frame and number of lookup keys.
orders.shape, len(sku_lookup)


((416082, 10), 1112)

In [5]:
# Show full cell contents: product strings are long and would otherwise be truncated.
pd.set_option("display.max_colwidth", None)

# Keep the column listing as the cell's last expression so it is actually rendered;
# a bare expression in the middle of a cell produces no output.
orders.columns.tolist()

In [8]:
# Two seeded (reproducible) sample orders, transposed so each column reads as a row.
orders.sample(2, random_state=42).T


Unnamed: 0,265873,291138
kolejności,250551533,259959582
Kwota,99.0,47.72
Produkty,Szklany dzbanek filtrujący Dafi CRYSTAL LED 2 l stalowy wkład Classic (x1),"Butelka filtrująca Dafi SOFT 0,5 l czarna + filtr węglowy (x1)ZESTAW 3 filtry do butelki filtrującej Dafi SOFT i SOLID czarny (x1)"
Kod,97-352,97-400
Miejscowość,Dąbrowa,Bełchatów
Anon,ANON_0221524,ANON_0243426
Ilość zakupów,2,1
Data zakupu,1/31/25,3/27/25
Źródło,ALLEGRO,ALLEGRO
produkty_clean,szklany dzbanek filtrujący dafi crystal led 2 l stalowy wkład classic (x1),"butelka filtrująca dafi soft 0,5 l czarna + filtr węglowy (x1)zestaw 3 filtry do butelki filtrującej dafi soft i solid czarny (x1)"


In [9]:
# Column names of the orders frame, bound to constants so later cells do not
# repeat the raw (Polish) header strings.
COL_ORDER_ID = "kolejności"
COL_PRODUCTS_RAW = "Produkty"
# Appears to be a lowercased copy of "Produkty" (see the sample above) — TODO confirm how it is produced.
COL_PRODUCTS_CLEAN = "produkty_clean"
COL_CUSTOMER_ID = "Anon"
COL_DATE = "Data zakupu"
COL_AMOUNT = "Kwota"
COL_SOURCE = "Źródło"


In [10]:
# Peek at the first five lookup keys to see the key format that item names must match.
list(sku_lookup.keys())[:5]


['barbie 3 filtry do butelki filtrującej dafi soft i solid różowy z naklejkami',
 'outlet zestaw 2 filtry + nakrętka do butelki filtrującej dafi soft cytrynowy',
 'outlet zestaw 2 filtry + nakrętka do butelki filtrującej dafi soft czarny',
 'outlet zestaw 2 filtry + nakrętka do butelki filtrującej dafi soft flamingowy',
 'outlet zestaw 2 filtry + nakrętka do butelki filtrującej dafi soft jagodowy']

In [11]:
# Cleaned product string of the first order, used as a worked example below.
example = orders.loc[orders.index[0], COL_PRODUCTS_CLEAN]
example


'butelka filtrująca dafi solid 0,7 l szafirowa + filtr węglowy (x1)rurka na filtr do butelki filtrującej dafi solid 0,7 l szafirowym (x1)zestaw 3 filtry do butelki filtrującej dafi soft i solid szafirowy (x1)'

In [12]:
# A whole multi-item order string is checked against the lookup keys directly;
# this motivates splitting the string into individual items first.
example in sku_lookup


False

In [13]:
QTY_RE = re.compile(r"\(x\s*(\d+)\)", flags=re.IGNORECASE)

def split_by_qty_markers(s: str):
    """
    Break a cleaned order string into ``(item_text, qty)`` pairs.

    Each ``(xN)`` marker closes one item: the text between the previous marker
    (or the start of the string) and this marker is the item name, and ``N`` is
    its quantity. Separator characters (`` -+|,;/``) are trimmed from both ends
    of the name.

    Returns an empty list for ``None``, blank input, or input without any
    marker. Text after the last marker is dropped.
    """
    text = "" if s is None else str(s).strip()
    if not text:
        return []

    items = []
    cursor = 0
    for marker in QTY_RE.finditer(text):
        # Everything since the previous marker, up to (not including) this one.
        label = text[cursor:marker.start()].lstrip().strip(" -+|,;/")
        items.append((label, int(marker.group(1))))
        cursor = marker.end()

    # Whatever follows the final marker is intentionally left out.
    return items


In [15]:
# Draw one seeded sample order and run the splitter on its cleaned product string,
# printing the raw and cleaned text for side-by-side comparison.
r = orders.sample(1, random_state=42).iloc[0]
clean = r[COL_PRODUCTS_CLEAN]
raw = r[COL_PRODUCTS_RAW]

print("RAW:", raw)
print("CLEAN:", clean)
print("----")
split_by_qty_markers(clean)


RAW: Szklany dzbanek filtrujący Dafi CRYSTAL LED 2 l stalowy wkład Classic (x1)
CLEAN: szklany dzbanek filtrujący dafi crystal led 2 l stalowy wkład classic (x1)
----


[('szklany dzbanek filtrujący dafi crystal led 2 l stalowy wkład classic', 1)]

In [16]:
# `clean` is the cleaned product string of the sampled order drawn above.
parts = split_by_qty_markers(clean)

# Report, per split item, whether its name is an exact key in the lookup
# (names are cut to 120 characters for display only).
for name, qty in parts:
    key_exists = name in sku_lookup
    print(f"qty={qty} | key_exists={key_exists} | name='{name[:120]}'")


qty=1 | key_exists=True | name='szklany dzbanek filtrujący dafi crystal led 2 l stalowy wkład classic'


In [17]:
def row_split_match_rate(clean_str: str, lookup: dict) -> tuple[int, int]:
    """
    Count how many split items of one order are exact lookup keys.

    Returns ``(matched_items, total_items)``. An order without any ``(xN)``
    marker splits into nothing and therefore yields ``(0, 0)``, which callers
    use as the 'no split found' signal.
    """
    names = [name for name, _ in split_by_qty_markers(clean_str)]
    hits = sum(1 for name in names if name in lookup)
    return (hits, len(names))

# Item-level tallies (only orders that produced at least one split item contribute).
matched_items = total_items = 0
# Order-level tallies.
orders_all_matched = orders_some_unmatched = orders_no_markers = 0

for s in orders[COL_PRODUCTS_CLEAN].astype(str):
    m, t = row_split_match_rate(s, sku_lookup)
    if t:
        matched_items += m
        total_items += t
        if m != t:
            orders_some_unmatched += 1
        else:
            orders_all_matched += 1
    else:
        # (0, 0) means the splitter found no (xN) marker in this order.
        orders_no_markers += 1

# Every order with markers is either fully matched or partially unmatched.
orders_with_markers = orders_all_matched + orders_some_unmatched

print("Orders total:", len(orders))
print("Orders with (xN) markers:", orders_with_markers)
print("Orders with NO (xN) markers:", orders_no_markers)
print("Orders where ALL split items matched:", orders_all_matched)
print("Orders where SOME split items unmatched:", orders_some_unmatched)
print("Item-level match rate:", matched_items / total_items if total_items else None)


Orders total: 416082
Orders with (xN) markers: 416082
Orders with NO (xN) markers: 0
Orders where ALL split items matched: 410626
Orders where SOME split items unmatched: 5456
Item-level match rate: 0.9933164115888917


In [18]:
# Collect example items whose name is not a lookup key, scanning orders in frame order.
unmatched_examples = []

for idx, s in orders[COL_PRODUCTS_CLEAN].astype(str).items():
    parts = split_by_qty_markers(s)
    unmatched_examples.extend(
        {
            "order_index": idx,
            "clean_products": s,
            "unmatched_name": name,
            "qty": qty,
        }
        for name, qty in parts
        if name not in sku_lookup
    )
    # The cap is checked once per order, so the final order may push the list past 20.
    if len(unmatched_examples) >= 20:
        break

pd.DataFrame(unmatched_examples)


Unnamed: 0,order_index,clean_products,unmatched_name,qty
0,12,zestaw szklany dzbanek dafi crystal led 2 l makowy + 10 wkładów classic (x1),zestaw szklany dzbanek dafi crystal led 2 l makowy + 10 wkładów classic,1
1,22,"rurka na filtr do butelki filtrującej dafi solid 0,5 l turkusowa (x1)zestaw 2 butelek dafi solid: 0,5 l turkusowa i 0,7 l jeansowa (x1)kubek termiczny self dafi 430 ml dla dzieci przezroczysty termos na kawę (x1)rurka na filtr do butelki filtrującej dafi solid 0,5 l jeansowa (x1)rurka na filtr do butelki filtrującej dafi solid 0,5 l bursztynowa (x2)zestaw 3 filtry do butelki filtrującej dafi soft i solid biały (x1)","zestaw 2 butelek dafi solid: 0,5 l turkusowa i 0,7 l jeansowa",1
2,23,"zestaw 2 butelek dafi: soft 0,7 l miętowa i solid 0,7 l szafirowa (x1)","zestaw 2 butelek dafi: soft 0,7 l miętowa i solid 0,7 l szafirowa",1
3,24,"zestaw butelek filtrujących dafi 3x solid + soft 0,5 l (x1)","zestaw butelek filtrujących dafi 3x solid + soft 0,5 l",1
4,27,"zestaw butelek filtrujących dafi 2x solid 0,7 l + 2x soft 0,5 l (x1)","zestaw butelek filtrujących dafi 2x solid 0,7 l + 2x soft 0,5 l",1
5,28,"zestaw 2 butelek dafi solid: 0,5 l turkusowa i 0,7 l jeansowa (x1)zestaw 8 filtrów do dzbanka filtrującego dafi wkład unimax (x1)zestaw 3 filtry do butelki filtrującej dafi soft i solid turkusowy (x1)","zestaw 2 butelek dafi solid: 0,5 l turkusowa i 0,7 l jeansowa",1
6,32,"zestaw 2 butelek dafi solid: 0,5 l turkusowa i 0,7 l jeansowa (x1)rurka na filtr do butelki filtrującej dafi solid 0,5 l bursztynowa (x1)zakrętka do butelki filtrującej dafi solid uchwyt bursztynowy (x2)zakrętka do butelki filtrującej dafi solid uchwyt jeansowy (x1)","zestaw 2 butelek dafi solid: 0,5 l turkusowa i 0,7 l jeansowa",1
7,34,"butelka filtrująca dafi soft 0,3 l paw patrol skye + filtr węglowy (x1)zestaw 2 filtry + nakrętka do butelki filtrującej dafi soft flamingowy (x1)","butelka filtrująca dafi soft 0,3 l paw patrol skye + filtr węglowy",1
8,44,"zestaw szklany dzbanek dafi crystal led 2 l grafitowy + 10 wkładów classic (x1)zestaw butelek filtrujących dafi 2x solid 0,7 l + 2x soft 0,5 l (x1)zestaw 3 filtry do butelki filtrującej dafi soft i solid biały (x2)","zestaw butelek filtrujących dafi 2x solid 0,7 l + 2x soft 0,5 l",1
9,48,"zestaw butelek filtrujących dafi 3x solid + soft 0,5 l (x1)","zestaw butelek filtrujących dafi 3x solid + soft 0,5 l",1


In [40]:
import re

# Vocabulary that places a "filtr" mention in a pitcher or a bottle context.
# Stems carry \w* so inflected Polish forms (dzbanka, butelki, butelek, ...) match;
# a bare stem wrapped in \b...\b would only match the stem as a complete word.
_PITCHER_CTX_RE = re.compile(
    r"\b(?:dzbanek|dzbank\w*|classic|unimax|aqua|crystal|astra|omega|standard)\b",
    re.IGNORECASE,
)
_BOTTLE_CTX_RE = re.compile(r"\b(?:butel\w*|soft|solid)\b", re.IGNORECASE)

# Explicit numeric counts directly in front of a product noun, e.g. "2 butelek", "2x dzbanek".
_BOTTLE_COUNT_RE = re.compile(r"(\d+)\s*(?:x\s*)?(?:butelk|butele)")
_PITCHER_COUNT_RE = re.compile(r"(\d+)\s*(?:x\s*)?(?:dzbanek|dzbank)")

# "filtry do butelki" style phrases: the bottle is only named as the filter's target.
_FILTERS_FOR_BOTTLE_RE = re.compile(r"(?:filtr|wkład)[^\n]{0,30}do\s+butelk")

# Cartridge ("wkład") and filter ("filtr") nouns, with and without a leading count.
_WKLAD_COUNT_RE = re.compile(r"(\d+)\s*(?:x\s*)?wkład(?:y|ów)?\b", re.IGNORECASE)
_WKLAD_NOUN_RE = re.compile(r"\bwkład(?:y|ów)?\b", re.IGNORECASE)
_FILTR_COUNT_RE = re.compile(r"(\d+)\s*(?:x\s*)?filtr(?:y|ów)?\b", re.IGNORECASE)
_FILTR_NOUN_RE = re.compile(r"\bfiltr(?:y|ów)?\b", re.IGNORECASE)


def _filter_context_key(s: str) -> str:
    """Return the counts key a 'filtr' mention belongs to, based on context words in ``s``."""
    in_bottle_ctx = _BOTTLE_CTX_RE.search(s) is not None
    in_pitcher_ctx = _PITCHER_CTX_RE.search(s) is not None
    if in_bottle_ctx and not in_pitcher_ctx:
        return "filters_bottle"
    if in_pitcher_ctx and not in_bottle_ctx:
        return "filters_pitcher"
    # Both contexts or neither: the mention cannot be attributed.
    return "filters_unknown"


def extract_category_counts(item_name: str) -> dict:
    """
    Infer product-category counts from a single item name using regex heuristics.

    Parameters
    ----------
    item_name : str
        One item name (without its ``(xN)`` quantity marker). ``None``, an
        empty string, or any non-string value is treated as an empty name.

    Returns
    -------
    dict
        Keys, in this order: ``bottles``, ``pitchers``, ``filters_pitcher``,
        ``filters_bottle``, ``filters_unknown``, ``filters_bottle_included``,
        ``filters_pitcher_included``, ``uncertain``, ``total_bottle_filters``,
        ``total_pitcher_filters``. All values are ints except ``uncertain``
        (bool). Counts are per item name and are not multiplied by quantity.
    """
    s = item_name.lower() if isinstance(item_name, str) else ""

    counts = {
        "bottles": 0,
        "pitchers": 0,
        "filters_pitcher": 0,           # "wkład" cartridges + "filtr" mentions in pitcher context
        "filters_bottle": 0,            # "filtr" mentions in bottle context
        "filters_unknown": 0,           # "filtr" mentions with ambiguous or missing context
        "filters_bottle_included": 0,   # one per counted bottle
        "filters_pitcher_included": 0,  # one per counted pitcher
        "uncertain": False,
    }

    # =========================
    # Bottles
    # =========================

    # Special bundle rule: a filtering-bottle set naming both "solid" and "soft"
    # joined by "+" counts as 2 bottles.
    # NOTE(review): names such as "2x solid ... + 2x soft" also land here and are
    # counted as 2 — confirm whether the multipliers should be summed instead.
    if all(token in s for token in ("zestaw", "butelek", "filtruj", "solid", "soft", "+")):
        counts["bottles"] = 2

    bottle_count = _BOTTLE_COUNT_RE.search(s)
    if bottle_count:
        counts["bottles"] = max(counts["bottles"], int(bottle_count.group(1)))
    elif "butelk" in s and not _FILTERS_FOR_BOTTLE_RE.search(s):
        # A bottle is mentioned, and not merely as the target of "filtry do butelki".
        counts["bottles"] = max(counts["bottles"], 1)

    # Each counted bottle is credited with one included filter.
    # NOTE(review): a name like "butelka ... + filtr węglowy" is also counted
    # below as an explicit filter, so its total becomes 2 — confirm intended.
    if counts["bottles"] > 0:
        counts["filters_bottle_included"] += counts["bottles"]

    # =========================
    # Filters / cartridges
    # =========================

    # "wkład" cartridges are always booked as pitcher filters.
    wklad_count = _WKLAD_COUNT_RE.search(s)
    if wklad_count:
        counts["filters_pitcher"] += int(wklad_count.group(1))
    elif _WKLAD_NOUN_RE.search(s):
        counts["filters_pitcher"] += 1

    # "filtr" mentions are booked according to the surrounding context words;
    # without an explicit count a single mention counts as one filter.
    filtr_count = _FILTR_COUNT_RE.search(s)
    if filtr_count:
        counts[_filter_context_key(s)] += int(filtr_count.group(1))
    elif _FILTR_NOUN_RE.search(s):
        counts[_filter_context_key(s)] += 1

    # =========================
    # Pitchers / jugs
    # =========================
    # NOTE(review): unlike bottles there is no "filtry do dzbanka" guard, so a
    # filter pack that names its pitcher also counts one pitcher — confirm intended.
    pitcher_count = _PITCHER_COUNT_RE.search(s)
    if pitcher_count:
        counts["pitchers"] += int(pitcher_count.group(1))
    elif "dzbanek" in s or "dzbank" in s:
        counts["pitchers"] += 1

    # =========================
    # Uncertainty flag
    # =========================
    # A "zestaw" (set) for which nothing attributable was recognised.
    if (
        "zestaw" in s
        and counts["bottles"] == 0
        and counts["pitchers"] == 0
        and counts["filters_pitcher"] == 0
        and counts["filters_bottle"] == 0
    ):
        counts["uncertain"] = True

    # Each counted pitcher is credited with one included filter.
    if counts["pitchers"] > 0:
        counts["filters_pitcher_included"] += counts["pitchers"]

    counts["total_bottle_filters"] = (
        counts["filters_bottle"] + counts["filters_bottle_included"]
    )
    counts["total_pitcher_filters"] = (
        counts["filters_pitcher"] + counts["filters_pitcher_included"]
    )

    return counts


In [42]:
def decode_one_order(row, sku_lookup: dict):
    """
    Decode a single order row into a DataFrame with one line per split item.

    Items whose name is a key of ``sku_lookup`` are tagged
    ``decode_method="sku_lookup"``; all others are tagged ``"regex_infer"``
    and additionally carry the counts from ``extract_category_counts``.
    Lookup-matched lines do not get the inferred-count columns here.
    """
    # Order-level fields repeated on every line of this order.
    shared = {
        "order_id": row[COL_ORDER_ID],
        "anon": row[COL_CUSTOMER_ID],
        "date": row[COL_DATE],
        "source": row[COL_SOURCE],
        "amount": row[COL_AMOUNT],
        "raw_products": row[COL_PRODUCTS_RAW],
        "clean_products": row[COL_PRODUCTS_CLEAN],
    }

    decoded = []
    for item_name, qty in split_by_qty_markers(shared["clean_products"]):
        record = dict(shared, item_name=item_name, qty=qty)

        if item_name not in sku_lookup:
            # No exact key: fall back to regex-based category inference.
            guessed = extract_category_counts(item_name)
            record["sku"] = None
            record["decode_method"] = "regex_infer"
            record["matched_key"] = None
            record.update(guessed)
        else:
            record["sku"] = sku_lookup[item_name].get("sku")
            record["decode_method"] = "sku_lookup"
            record["matched_key"] = item_name

        decoded.append(record)

    return pd.DataFrame(decoded)


In [43]:
# Decode one seeded sample order to eyeball the per-line output columns.
row = orders.sample(1, random_state=42).iloc[0]
df_one = decode_one_order(row, sku_lookup)
df_one


Unnamed: 0,order_id,anon,date,source,amount,raw_products,clean_products,item_name,qty,sku,decode_method,matched_key
0,250551533,ANON_0221524,1/31/25,ALLEGRO,99.0,Szklany dzbanek filtrujący Dafi CRYSTAL LED 2 l stalowy wkład Classic (x1),szklany dzbanek filtrujący dafi crystal led 2 l stalowy wkład classic (x1),szklany dzbanek filtrujący dafi crystal led 2 l stalowy wkład classic,1,,sku_lookup,szklany dzbanek filtrujący dafi crystal led 2 l stalowy wkład classic


In [45]:
def decode_all_orders(orders: pd.DataFrame, sku_lookup: dict, limit=None):
    """
    Decode many orders into one DataFrame of order lines.

    Applies the same per-item rules as ``decode_one_order`` but accumulates
    plain dicts and builds the DataFrame once at the end. ``limit`` (optional)
    restricts decoding to the first ``limit`` records of ``orders``.
    """
    all_records = orders.to_dict("records")
    selected = all_records if limit is None else all_records[:limit]

    decoded = []
    for rec in selected:
        # Order-level fields repeated on every line of this order.
        order_fields = {
            "order_id": rec[COL_ORDER_ID],
            "anon": rec[COL_CUSTOMER_ID],
            "date": rec[COL_DATE],
            "source": rec[COL_SOURCE],
            "amount": rec[COL_AMOUNT],
            "raw_products": rec[COL_PRODUCTS_RAW],
            "clean_products": rec[COL_PRODUCTS_CLEAN],
        }

        for item_name, qty in split_by_qty_markers(order_fields["clean_products"]):
            if item_name in sku_lookup:
                decode_fields = {
                    "sku": sku_lookup[item_name].get("sku"),
                    "decode_method": "sku_lookup",
                    "matched_key": item_name,
                }
            else:
                # No exact key: fall back to regex-based category inference.
                guessed = extract_category_counts(item_name)
                decode_fields = {
                    "sku": None,
                    "decode_method": "regex_infer",
                    "matched_key": None,
                    **guessed,
                }

            decoded.append(
                {**order_fields, "item_name": item_name, "qty": qty, **decode_fields}
            )

    return pd.DataFrame(decoded)


In [46]:
# Trial run on the first 2000 orders before committing to the full decode.
order_lines_small = decode_all_orders(orders, sku_lookup, limit=2000)
order_lines_small.shape


(3760, 22)

In [47]:
# How many trial lines were resolved by exact lookup vs. regex inference.
order_lines_small["decode_method"].value_counts()


decode_method
sku_lookup     3646
regex_infer     114
Name: count, dtype: int64

In [48]:
import pyarrow as pa
import pyarrow.parquet as pq

# Baseline inferred-count record for an empty item name; it is spread into
# lookup-matched lines so every output row carries the same set of columns.
DEFAULT_INFER = extract_category_counts("")  # should return all keys with zeros/False
INFER_KEYS = list(DEFAULT_INFER.keys())
INFER_KEYS


['bottles',
 'pitchers',
 'filters_pitcher',
 'filters_bottle',
 'filters_unknown',
 'filters_bottle_included',
 'filters_pitcher_included',
 'uncertain',
 'total_bottle_filters',
 'total_pitcher_filters']

In [49]:
def decode_orders_chunk(df_chunk: pd.DataFrame, sku_lookup: dict) -> list[dict]:
    """
    Decode one slice of the orders frame into a list of order-line dicts.

    Every returned dict has the same keys: lookup-matched lines are padded
    with ``DEFAULT_INFER`` so they expose the same inferred-count fields as
    regex-inferred lines.
    """
    lines: list[dict] = []

    for rec in df_chunk.to_dict("records"):
        # Order-level fields repeated on every line of this order.
        head = {
            "order_id": rec[COL_ORDER_ID],
            "anon": rec[COL_CUSTOMER_ID],
            "date": rec[COL_DATE],
            "source": rec[COL_SOURCE],
            "amount": rec[COL_AMOUNT],
            "raw_products": rec[COL_PRODUCTS_RAW],
            "clean_products": rec[COL_PRODUCTS_CLEAN],
        }

        for item_name, qty in split_by_qty_markers(head["clean_products"]):
            line = dict(head)
            line["item_name"] = item_name
            line["qty"] = qty

            if item_name in sku_lookup:
                line["sku"] = sku_lookup[item_name].get("sku")
                line["decode_method"] = "sku_lookup"
                line["matched_key"] = item_name
                line.update(DEFAULT_INFER)  # keep schema stable
            else:
                # No exact key: fall back to regex-based category inference.
                guessed = extract_category_counts(item_name)
                line["sku"] = None
                line["decode_method"] = "regex_infer"
                line["matched_key"] = None
                line.update(guessed)

            lines.append(line)

    return lines


In [50]:
# Number of orders decoded and written per batch.
CHUNK_SIZE = 20_000

OUT_LINES_PATH.parent.mkdir(parents=True, exist_ok=True)

writer = None
total_orders = len(orders)
total_lines = 0

try:
    for start in range(0, total_orders, CHUNK_SIZE):
        end = min(start + CHUNK_SIZE, total_orders)
        chunk = orders.iloc[start:end]

        rows = decode_orders_chunk(chunk, sku_lookup)
        batch_df = pd.DataFrame(rows)

        # A chunk that produced no lines yields a column-less frame whose schema
        # cannot match the writer's, so there is nothing to write for it.
        if not batch_df.empty:
            table = pa.Table.from_pandas(batch_df, preserve_index=False)

            if writer is None:
                # The first non-empty batch fixes the file schema.
                writer = pq.ParquetWriter(str(OUT_LINES_PATH), table.schema)
            elif not table.schema.equals(writer.schema):
                # Per-batch type inference can drift (e.g. an all-null column);
                # align the batch with the schema the file was opened with.
                table = table.cast(writer.schema)

            writer.write_table(table)
            total_lines += len(batch_df)

        print(f"Processed orders {start}:{end} -> lines {len(batch_df)} | total lines so far: {total_lines}")
finally:
    # Close the writer even if a chunk fails, so the file handle is released.
    if writer is not None:
        writer.close()

print("Saved:", OUT_LINES_PATH)
print("Total orders:", total_orders, "| Total canonical lines:", total_lines)


Processed orders 0:20000 -> lines 40017 | total lines so far: 40017
Processed orders 20000:40000 -> lines 42452 | total lines so far: 82469
Processed orders 40000:60000 -> lines 43088 | total lines so far: 125557
Processed orders 60000:80000 -> lines 40387 | total lines so far: 165944
Processed orders 80000:100000 -> lines 36259 | total lines so far: 202203
Processed orders 100000:120000 -> lines 43291 | total lines so far: 245494
Processed orders 120000:140000 -> lines 43672 | total lines so far: 289166
Processed orders 140000:160000 -> lines 41927 | total lines so far: 331093
Processed orders 160000:180000 -> lines 41341 | total lines so far: 372434
Processed orders 180000:200000 -> lines 40433 | total lines so far: 412867
Processed orders 200000:220000 -> lines 38721 | total lines so far: 451588
Processed orders 220000:240000 -> lines 41347 | total lines so far: 492935
Processed orders 240000:260000 -> lines 41409 | total lines so far: 534344
Processed orders 260000:280000 -> lines 

In [51]:
# Read the written file back to verify it round-trips and to inspect its size.
order_lines = pd.read_parquet(OUT_LINES_PATH)
order_lines.shape


(865553, 22)

In [52]:
# Split of all written lines by how they were decoded (exact lookup vs. regex inference).
order_lines["decode_method"].value_counts()


decode_method
sku_lookup     859768
regex_infer      5785
Name: count, dtype: int64

In [53]:
# Share of missing values per column, worst ten first.
# NOTE(review): the recorded output shows `sku` as fully null, which suggests the
# lookup values carry no "sku" key — confirm the lookup schema.
order_lines.isna().mean().sort_values(ascending=False).head(10)


sku                         1.000000
matched_key                 0.006684
amount                      0.000047
source                      0.000003
pitchers                    0.000000
total_bottle_filters        0.000000
uncertain                   0.000000
filters_pitcher_included    0.000000
filters_bottle_included     0.000000
filters_unknown             0.000000
dtype: float64

In [54]:
# Distribution of the inferred filter totals; lookup-matched lines carry the zero defaults.
order_lines[["total_bottle_filters", "total_pitcher_filters"]].describe()


Unnamed: 0,total_bottle_filters,total_pitcher_filters
count,865553.0,865553.0
mean,0.002663,0.012244
std,0.0695,0.384805
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,12.0,14.0


## Purpose:
Decode raw order product strings into canonical order lines using SKU lookup and controlled regex inference, producing a single analytical dataset for retention, replacement-cycle, and LTV modeling.