In [119]:
import pandas as pd
import numpy as np

PATH = "../data/interim/order_lines_canonical.parquet"
ol = pd.read_parquet(PATH)


pd.set_option("display.max_colwidth", None)

ol.shape, ol.columns.tolist()[:30]


((865553, 22),
 ['order_id',
  'anon',
  'date',
  'source',
  'amount',
  'raw_products',
  'clean_products',
  'item_name',
  'qty',
  'sku',
  'decode_method',
  'matched_key',
  'bottles',
  'pitchers',
  'filters_pitcher',
  'filters_bottle',
  'filters_unknown',
  'filters_bottle_included',
  'filters_pitcher_included',
  'uncertain',
  'total_bottle_filters',
  'total_pitcher_filters'])

In [120]:
sku_map = pd.read_parquet("../data/interim/sku_map.parquet")
sku_map.shape, sku_map.columns.tolist()


((1112, 8),
 ['produkty_clean',
  'Produkty',
  'RODZAJ',
  'ILO≈öƒÜ FILTR√ìW',
  'BAZOWY',
  'MATRIX NAZWA',
  'MATRIX GRUPA PRODUKTOWA',
  'CZY JEST W MATRIXIE'])

In [121]:
# choose matrix columns you want to carry forward
MATRIX_COLS = [
    "MATRIX NAZWA",
    "MATRIX GRUPA PRODUKTOWA",
    'ILO≈öƒÜ FILTR√ìW',
    

]

# keep only the join key + matrix columns, and deduplicate
sku_dim = (
    sku_map[["produkty_clean"] + MATRIX_COLS]
    .drop_duplicates("produkty_clean")
)

ol2 = ol.merge(
    sku_dim,
    how="left",
    left_on="matched_key",
    right_on="produkty_clean",
)

ol2[["matched_key", "produkty_clean"] + MATRIX_COLS].head(10)


Unnamed: 0,matched_key,produkty_clean,MATRIX NAZWA,MATRIX GRUPA PRODUKTOWA,ILO≈öƒÜ FILTR√ìW
0,"butelka filtrujƒÖca dafi solid 0,7 l szafirowa + filtr wƒôglowy","butelka filtrujƒÖca dafi solid 0,7 l szafirowa + filtr wƒôglowy","SOLID 0,7 1F",03_butelki filtrujƒÖce SOLID,1.0
1,"rurka na filtr do butelki filtrujƒÖcej dafi solid 0,7 l szafirowym","rurka na filtr do butelki filtrujƒÖcej dafi solid 0,7 l szafirowym","RURKA TRITANOWA DO SOLID 0,7",07_akcesoria do Soft/Solid,
2,zestaw 3 filtry do butelki filtrujƒÖcej dafi soft i solid szafirowy,zestaw 3 filtry do butelki filtrujƒÖcej dafi soft i solid szafirowy,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,3.0
3,"butelka filtrujƒÖca dafi soft 0,5 l limonkowa + filtr wƒôglowy","butelka filtrujƒÖca dafi soft 0,5 l limonkowa + filtr wƒôglowy","SOFT 0,5 1F",02_butelki filtrujƒÖce SOFT,1.0
4,"butelka filtrujƒÖca dafi soft 0,5 l niebia≈Ñska + filtr wƒôglowy","butelka filtrujƒÖca dafi soft 0,5 l niebia≈Ñska + filtr wƒôglowy","SOFT 0,5 1F",02_butelki filtrujƒÖce SOFT,1.0
5,"dafi przep≈Çywowy podgrzewacz wody nadumywalkowy 3,7 kw z bateriƒÖ bia≈ÇƒÖ","dafi przep≈Çywowy podgrzewacz wody nadumywalkowy 3,7 kw z bateriƒÖ bia≈ÇƒÖ",PRZEP≈ÅYWOWY PODGRZEWACZ - NADUMYLAWKOWY,26_podgrzewacze przep≈Çywowe,
6,zakrƒôtka do butelki filtrujƒÖcej dafi solid uchwyt flamingowy,zakrƒôtka do butelki filtrujƒÖcej dafi solid uchwyt flamingowy,ZAKRƒòTKA DO SOLID UCHWYT / PRZYCISK / NULL,07_akcesoria do Soft/Solid,
7,"rurka na filtr do butelki filtrujƒÖcej dafi solid 0,5 l flamingowa","rurka na filtr do butelki filtrujƒÖcej dafi solid 0,5 l flamingowa","RURKA TRITANOWA DO SOLID 0,5",07_akcesoria do Soft/Solid,
8,"rurka na filtr do butelki filtrujƒÖcej dafi solid 0,5 l turkusowa","rurka na filtr do butelki filtrujƒÖcej dafi solid 0,5 l turkusowa","RURKA TRITANOWA DO SOLID 0,5",07_akcesoria do Soft/Solid,
9,zestaw 3 filtry do butelki filtrujƒÖcej dafi soft i solid turkusowy,zestaw 3 filtry do butelki filtrujƒÖcej dafi soft i solid turkusowy,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,3.0


In [122]:
len(ol2)

865553

In [123]:
df = ol2.copy()

# --- initialize matrix_source ONCE (do NOT overwrite later) ---
df["matrix_source"] = np.where(
    df["decode_method"].eq("sku_lookup"),
    "sku_lookup",
    "unmapped_inferred",
)


In [124]:
BOTTLE_FILTER_GROUP = "06_filtry do butelek Soft i Solid"
PITCHER_FILTER_GROUP = "__PITCHER_FILTERS__"


In [125]:
mask_pure_bottle_filters = (
    mask_inferred
    & (df["filters_bottle"] > 0)
    & (df["filters_bottle_included"] == 0)
    & (df["filters_pitcher"] == 0)
    & (df["filters_pitcher_included"] == 0)
)

mask_pure_bottle_filters.sum()


18

In [126]:
df.loc[mask_pure_bottle_filters, "MATRIX GRUPA PRODUKTOWA"] = (
    "06_filtry do butelek Soft i Solid"
)

df.loc[mask_pure_bottle_filters, "MATRIX NAZWA"] = "1 FILTR BUTELKOWY"


In [127]:
df.loc[mask_pure_bottle_filters, "matrix_source"] = "rules_backfilled_pure_filters"


In [128]:
df.loc[
    mask_pure_bottle_filters,
    [
        "item_name",
        "filters_bottle",
        "filters_bottle_included",
        "filters_pitcher",
        "MATRIX NAZWA",
        "MATRIX GRUPA PRODUKTOWA",
        "matrix_source",
    ],
].head(10)


Unnamed: 0,item_name,filters_bottle,filters_bottle_included,filters_pitcher,MATRIX NAZWA,MATRIX GRUPA PRODUKTOWA,matrix_source
16127,3 filtry do butelki dafi soft i solid bia≈Çe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
16718,3 filtry do butelki dafi soft i solid flamingowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
17340,3 filtry do butelki dafi soft i solid bursztynowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
17613,3 filtry do butelki dafi soft i solid jagodowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
17706,3 filtry do butelki dafi soft i solid szafirowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
18173,3 filtry do butelki dafi soft i solid bia≈Çe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
18495,3 filtry do butelki dafi soft i solid cytrynowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
18684,3 filtry do butelki dafi soft i solid turkusowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
18685,3 filtry do butelki dafi soft i solid waniliowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters
19019,3 filtry do butelki dafi soft i solid szafirowe,3,0,0,1 FILTR BUTELKOWY,06_filtry do butelek Soft i Solid,rules_backfilled_pure_filters


In [129]:
retention_df = df[
    df["MATRIX GRUPA PRODUKTOWA"].notna()
    & (df["MATRIX GRUPA PRODUKTOWA"] != "__INFERRED__")
]


In [130]:
df["units_purchased"] = df["qty"].astype(float)


In [131]:
mask_sku = df["matrix_source"].eq("sku_lookup")

df.loc[mask_sku, "matrix_qty"] = (
    df.loc[mask_sku, "qty"]
    * df.loc[mask_sku, "ILO≈öƒÜ FILTR√ìW"].fillna(1)
)


In [132]:
mask_sku = df["matrix_source"].eq("sku_lookup")

df.loc[mask_sku, "matrix_qty"] = (
    df.loc[mask_sku, "qty"]
    * df.loc[mask_sku, "ILO≈öƒÜ FILTR√ìW"].fillna(1)
)


'''Important note on matrix_qty
matrix_qty represents the number of MATRIX units associated with a single row
(derived from SKU lookup or rule-based inference).
This column must not be interpreted as a per-event delta once cumulative
quantities are constructed later. Retention logic must explicitly derive
event-level quantities from cumulative positions.'''

'Important note on matrix_qty\nmatrix_qty represents the number of MATRIX units associated with a single row\n(derived from SKU lookup or rule-based inference).\nThis column must not be interpreted as a per-event delta once cumulative\nquantities are constructed later. Retention logic must explicitly derive\nevent-level quantities from cumulative positions.'

In [133]:
mask_rules = df["matrix_source"].eq("rules_backfilled_pure_filters")

df.loc[mask_rules, "matrix_qty"] = df.loc[mask_rules, "total_bottle_filters"]


In [134]:
df["matrix_source"] = np.where(
    df["decode_method"].eq("sku_lookup"),
    "sku_lookup",
    "unmapped_inferred"
)

# keep your special backfilled label if you already set it earlier
# (only overwrite where it isn't already backfilled)
df.loc[df["matrix_source"].ne("sku_lookup") & df["matrix_source"].ne("rules_backfilled_pure_filters"), "matrix_source"] = "unmapped_inferred"

df["matrix_source"].value_counts(dropna=False)


matrix_source
sku_lookup           859768
unmapped_inferred      5785
Name: count, dtype: int64

In [135]:
cust_day_group["matrix_qty"].value_counts(dropna=False).head(10)


matrix_qty
1.0     269193
2.0     124081
6.0      84673
3.0      71490
4.0      34260
9.0      26675
12.0     19551
5.0      16808
8.0       7849
11.0      5217
Name: count, dtype: int64

## Building `cust_day_group`: customer √ó day √ó product-group timeline

At this stage, we construct a **retention-ready event table** that captures *when* a customer acquires a given **product group** and *in what quantity*.

This table is the foundation for:
- inter-purchase interval analysis
- replacement-cycle estimation
- churn / retention modeling
- downstream LTV features

### Key design decisions

**1. Retention-relevant rows only**

Not every decoded order line should participate in retention analysis.  
We explicitly restrict to rows that represent **true product acquisition**:

- `matrix_source == "sku_lookup"`  
  ‚Üí SKU-matched products with trusted matrix metadata
- `matrix_source == "rules_backfilled_pure_filters"`  
  ‚Üí non-SKU rows representing *pure* bottle-filter purchases (deterministically inferred)

Accessories, ambiguous items, and non-SKU devices are excluded from retention modeling but remain in the canonical dataset for auditability.

This logic is captured by the boolean flag:
```python
is_retention_relevant


In [136]:
cust_day_group = (
    df[df["is_retention_relevant"]]
    .groupby(
        ["anon", "date", "MATRIX GRUPA PRODUKTOWA"],
        as_index=False
    )
    .agg({
        "matrix_qty": "sum",
        "ILO≈öƒÜ FILTR√ìW": "sum",
        "MATRIX NAZWA": "first",
    })
    .sort_values(["anon", "date"])
)



cust_day_group.head(10), cust_day_group.shape


KeyError: 'is_retention_relevant'

In [None]:
# how many real purchase rows do we have?
cust_day_group["is_purchase"] = cust_day_group["matrix_qty"] > 0

cust_day_group["is_purchase"].value_counts(normalize=True)


In [None]:
# Notebook 04 ‚Äî FINAL CELL

from pathlib import Path

OUT = Path("../data/interim")
OUT.mkdir(parents=True, exist_ok=True)

path = OUT / "cust_day_group.parquet"
cust_day_group.to_parquet(path, index=False)

print(f"Saved cust_day_group ‚Üí {path}")
print("Rows:", len(cust_day_group))
print("Columns:", cust_day_group.columns.tolist())


## üì¶ Understanding `matrix_qty` and its role in retention analysis

At this stage of the pipeline, we introduce `matrix_qty` as the **canonical quantity measure** used throughout retention and lifecycle analysis.

### What `matrix_qty` represents

`matrix_qty` answers the question:

> **‚ÄúHow many usable MATRIX units does this row represent?‚Äù**

It is a **normalised quantity** derived from:
- SKU-level metadata (e.g. number of filters per SKU),
- multi-pack products,
- and deterministic rule-based inference for non-SKU rows.

This allows all products ‚Äî regardless of how they were purchased or decoded ‚Äî to be expressed in a **common unit space**.

---

### Why `matrix_qty` is necessary

For consumable products, customer behaviour is driven by **usage**, not by calendar time alone.

Two customers may repurchase after the same number of days, but if one acquired more units, their **true replacement cycle per unit** is different.

`matrix_qty` provides the quantity context required to:
- convert calendar time into **per-unit consumption time**,
- estimate realistic replacement cycles,
- and build meaningful retention and LTV features.

---

### What `matrix_qty` is *not*

It is important to distinguish what `matrix_qty` does **not** represent:

- It is **not** an event counter
- It is **not** a per-purchase increment by default
- It is **not** directly suitable as a divisor in retention calculations

After aggregation into a `customer √ó day √ó product-group` timeline,  
`matrix_qty` should be interpreted as a **state-like daily total**, not as an event delta.

---

### Implication for retention calculations

Retention analysis requires **event-level quantities** (i.e. how many units were acquired at a given purchase).

Because `matrix_qty` represents a **position**, event-level quantities must be **explicitly derived later** from changes in `matrix_qty` over time before computing adjusted retention metrics.

This separation ensures that:
- retention intervals are not artificially compressed,
- quantity effects are handled correctly,
- and downstream models reflect true customer consumption behaviour.


## Output: `cust_day_group`

This notebook produces `cust_day_group.parquet`, the canonical daily purchase table
used as input for retention and LTV modeling.

Grain:
- one row per (anon, date, MATRIX GRUPA PRODUKTOWA)

Key fields:
- `matrix_qty` ‚Äî economically meaningful quantity
  - SKU rows: `qty √ó ILO≈öƒÜ FILTR√ìW`
  - Non-SKU filter-only rows: backfilled conservatively
- `matrix_source` ‚Äî SKU / backfill provenance
- `is_retention_relevant` ‚Äî whether the row should be used for replacement modeling

All decoding, SKU logic, and safety assumptions stop here.
Downstream notebooks must not reinterpret product semantics.
