In [18]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_DIR = Path("../data/interim")


# entry ecosystem distribution (E1)

entry_ecosystem_dist_df = pd.read_parquet(
    DATA_DIR / "ecosystem_entry_distribution.parquet"
)

# convert to Series (this is what the simulator will consume)
entry_ecosystem_dist = entry_ecosystem_dist_df["p"]

# sanity checks
assert entry_ecosystem_dist.index.name == "ecosystem"
assert np.isclose(entry_ecosystem_dist.sum(), 1.0)

entry_ecosystem_dist




ecosystem
bottle          0.515466
pitcher         0.364400
PushAir         0.035307
other           0.024799
sink            0.021369
container       0.015487
flow_comfort    0.013962
CO2             0.006463
keton           0.002059
Proskin         0.000689
Name: p, dtype: float64

### Entry ecosystem distribution (E₁)

We load the empirical distribution of ecosystems at first purchase.

Object:
- `entry_ecosystem_dist`: pd.Series
  - index: ecosystem
  - values: probability
  - ∑ p = 1

This object is treated as fixed and will not be recomputed.


In [23]:
P_product_group_given_ecosystem_k = pd.read_parquet(
    DATA_DIR / "P_product_group_given_ecosystem_k.parquet"
)

P_product_group_given_ecosystem_k.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1263 entries, 0 to 1262
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ecosystem      1263 non-null   object 
 1   purchase_k     1263 non-null   int64  
 2   product_group  1263 non-null   object 
 3   n              1263 non-null   int64  
 4   p              1263 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 49.5+ KB


In [29]:
# steps present
P_product_group_given_ecosystem_k["purchase_k"].value_counts().sort_index()

# example: inspect k = 1 only
(
    P_product_group_given_ecosystem_k
        .query("purchase_k == 1")
        .sort_values("p", ascending=False)
        .head(10)
)




Unnamed: 0,ecosystem,purchase_k,product_group,n,p
127,Proskin,1,27_Proskin - urządzenia i filtry,65,0.698925
1130,sink,1,24_Filtry przepływowe - wkłady,1720,0.598261
0,CO2,1,32_nabój CO2 - wymiana,720,0.540947
779,keton,1,11_czajnik filtrujący,173,0.475275
504,container,1,35_pojemniki SeeYou,1092,0.437675
842,other,1,38_inne,2535,0.354149
339,bottle,1,06_filtry do butelek Soft i Solid,20316,0.342892
340,bottle,1,03_butelki filtrujące SOLID,16811,0.283735
965,pitcher,1,13_filtry do dzbanków standard,10290,0.276754
646,flow_comfort,1,21_wkład D1 do Flow Comfort,600,0.268336


### Product group choice within step

We load the empirical conditional distribution:

P(Gₖ | Eₖ, k)

Object:
- `P_product_group_given_ecosystem_k`: pd.DataFrame
  - columns: k, ecosystem (E), product_group (G), p

This table is treated as fixed.
No smoothing, no re-normalisation, no inference is done in this notebook.


In [31]:
T_ek = {}

for k in [1, 2, 3, 4]:
    T_ek[k] = pd.read_parquet(
        DATA_DIR / f"ecosystem_transitions/ecosystem_transition_{k}_to_{k+1}.parquet"
    )

    print(f"k={k} shape:", T_ek[k].shape)


k=1 shape: (10, 10)
k=2 shape: (10, 10)
k=3 shape: (10, 10)
k=4 shape: (10, 10)


### Ecosystem transition matrices

We load step-wise ecosystem transitions:

P(Eₖ₊₁ | Eₖ, k),  for k = 1..4

Objects:
- `T_ek[k]`: pd.DataFrame
  - index: Eₖ
  - columns: Eₖ₊₁
  - rows sum to 1

These matrices define the structural ecosystem evolution
and are treated as fixed.


### Simulation assumptions (structural)

- Horizon: k = 1..5
- State per step: (Eₖ, Gₖ)
- E₁ ~ empirical entry ecosystem distribution
- Gₖ ~ P(Gₖ | Eₖ, k)
- Eₖ₊₁ ~ P(Eₖ₊₁ | Eₖ, k)
- If a conditional row is missing:
  fallback to the marginal distribution for that step

### State definition

At each purchase step k, the customer state is:

- Eₖ: ecosystem
- Gₖ: product group

The joint state (Eₖ, Gₖ) fully describes the structural purchase outcome
at step k.



In [35]:
rng = np.random.default_rng(42)

def sample_from_series(p: pd.Series) -> str:
    """Sample from a probability Series (index=labels)."""
    p = p.fillna(0.0)
    p = p / p.sum()
    return rng.choice(p.index, p=p.values)


In [38]:
P_product_group_given_ecosystem_k.head()

Unnamed: 0,ecosystem,purchase_k,product_group,n,p
0,CO2,1,32_nabój CO2 - wymiana,720,0.540947
1,CO2,1,31_nabój CO2 - zakup,204,0.153268
2,CO2,1,34_suplementy PushAir,81,0.060856
3,CO2,1,30_saturator PushAir,79,0.059354
4,CO2,1,33_butelki do saturatora PushAir,78,0.058603


In [39]:
def simulate_one_customer(customer_id: int) -> dict:
    row = {"anon_sim": customer_id}

    # Step 1: ecosystem
    E = sample_from_series(entry_ecosystem_dist)
    row["E1"] = E

    for k in range(1, 6):
        # ---- G_k | E_k ----
        Pg = P_product_group_given_ecosystem_k.query("purchase_k == @k")

        if E in Pg["ecosystem"].values:
            p_g = (
                Pg[Pg["ecosystem"] == E]
                .set_index("product_group")["p"]
            )
        else:
            # fallback: marginal at step k
            p_g = (
                Pg.groupby("product_group")["p"]
                .sum()
            )

        G = sample_from_series(p_g)
        row[f"G{k}"] = G

        # ---- E_{k+1} | E_k ----
        if k < 5:
            T = T_ek[k]

            if E in T.index:
                p_e_next = T.loc[E]
            else:
                p_e_next = T.sum(axis=0)

            E = sample_from_series(p_e_next)
            row[f"E{k+1}"] = E

    return row


In [40]:
sim_test = pd.DataFrame(
    simulate_one_customer(i) for i in range(5)
)

sim_test


Unnamed: 0,anon_sim,E1,G1,E2,G2,E3,G3,E4,G4,E5,G5
0,0,bottle,38_inne,bottle,06_filtry do butelek Soft i Solid,pitcher,08_dzbanki filtrujące manualne,pitcher,13_filtry do dzbanków standard,pitcher,16_filtry do dzbanków AGD+
1,1,other,06_filtry do butelek Soft i Solid,pitcher,08_dzbanki filtrujące manualne,bottle,03_butelki filtrujące SOLID,bottle,05_butelki i kubki termiczne,bottle,02_butelki filtrujące SOFT
2,2,bottle,30_saturator PushAir,pitcher,14_filtry do dzbanków Mg+,bottle,03_butelki filtrujące SOLID,bottle,06_filtry do butelek Soft i Solid,container,03_butelki filtrujące SOLID
3,3,container,35_pojemniki SeeYou,container,36_akcesoria SeeYou,bottle,06_filtry do butelek Soft i Solid,bottle,06_filtry do butelek Soft i Solid,bottle,03_butelki filtrujące SOLID
4,4,pitcher,16_filtry do dzbanków AGD+,bottle,13_filtry do dzbanków standard,other,06_filtry do butelek Soft i Solid,bottle,07_akcesoria do Soft/Solid,bottle,06_filtry do butelek Soft i Solid


### Real path representation

For validation, real customer histories are represented as:

(E₁, G₁), (E₂, G₂), …, (E₅, G₅)

Where:
- k = purchase index (chronological, distinct orders)
- Eₖ = ecosystem active at purchase k
- Gₖ = product group purchased at purchase k

Only customers with at least 5 purchases are retained.


In [69]:
real_paths = pd.read_parquet(
    DATA_DIR / "order_pg_ecosystem_sets_3y_k5.parquet"
)

real_paths.head()


Unnamed: 0,anon,purchase_k,product_groups,ecosystems,date,n_ecosystems,n_product_groups,n_purchases_in_horizon
0,ANON_0000011,1,[10_dzbanki filtrujące Crystal],[pitcher],2022-10-11,1,1,2
1,ANON_0000011,2,"[08_dzbanki filtrujące manualne, 38_inne]","[other, pitcher]",2024-02-19,2,2,2
2,ANON_0000012,1,[10_dzbanki filtrujące Crystal],[pitcher],2022-10-11,1,1,2
3,ANON_0000012,2,"[06_filtry do butelek Soft i Solid, 14_filtry ...","[bottle, pitcher]",2024-09-26,2,2,2
4,ANON_0000019,1,"[02_butelki filtrujące SOFT, 03_butelki filtru...",[bottle],2022-10-11,1,3,4


### Real path construction

Real purchase paths are constructed from
`order_pg_ecosystem_sets_3y_k5.parquet`.

If an order contains multiple ecosystems or product groups:
- Eₖ is defined as the first ecosystem in the order
- Gₖ is defined as the first product group in the order

This deterministic collapse ensures structural comparability
with the simulator state (Eₖ, Gₖ).


In [64]:
real_paths.dtypes

anon                              object
purchase_k                         int64
product_groups                    object
ecosystems                        object
date                      datetime64[ns]
n_ecosystems                       int64
n_product_groups                   int64
n_purchases_in_horizon             int64
dtype: object

In [70]:
x = real_paths["ecosystems"].iloc[0]
x, type(x)


(array(['pitcher'], dtype=object), numpy.ndarray)

In [71]:
def first_elem(x):
    if isinstance(x, np.ndarray) and x.size > 0:
        return x[0]
    return np.nan


In [72]:
real_paths = (
    real_paths
        .assign(
            E=lambda df: df["ecosystems"].apply(first_elem),
            G=lambda df: df["product_groups"].apply(first_elem),
        )
        .loc[:, ["anon", "purchase_k", "E", "G"]]
        .query("purchase_k <= 5")
        .sort_values(["anon", "purchase_k"])
)

real_paths.head(10)


Unnamed: 0,anon,purchase_k,E,G
0,ANON_0000011,1,pitcher,10_dzbanki filtrujące Crystal
1,ANON_0000011,2,other,08_dzbanki filtrujące manualne
2,ANON_0000012,1,pitcher,10_dzbanki filtrujące Crystal
3,ANON_0000012,2,bottle,06_filtry do butelek Soft i Solid
4,ANON_0000019,1,bottle,02_butelki filtrujące SOFT
5,ANON_0000019,2,bottle,06_filtry do butelek Soft i Solid
6,ANON_0000019,3,bottle,02_butelki filtrujące SOFT
7,ANON_0000019,4,bottle,06_filtry do butelek Soft i Solid
8,ANON_0000027,1,bottle,02_butelki filtrujące SOFT
9,ANON_0000027,2,pitcher,08_dzbanki filtrujące manualne


In [73]:
eligible_customers = (
    real_paths
        .groupby("anon")["purchase_k"]
        .max()
        .loc[lambda s: s == 5]
        .index
)

real_paths_5 = real_paths[
    real_paths["anon"].isin(eligible_customers)
]

real_paths_5["anon"].nunique()


2243

In [74]:
real_wide = (
    real_paths_5
        .pivot(index="anon", columns="purchase_k")
)

real_wide.columns = [
    f"{col}{k}" for col, k in real_wide.columns
]

real_wide = real_wide.reset_index()

real_wide.head()


Unnamed: 0,anon,E1,E2,E3,E4,E5,G1,G2,G3,G4,G5
0,ANON_0000064,pitcher,pitcher,pitcher,pitcher,pitcher,13_filtry do dzbanków standard,13_filtry do dzbanków standard,13_filtry do dzbanków standard,13_filtry do dzbanków standard,08_dzbanki filtrujące manualne
1,ANON_0000126,bottle,bottle,bottle,bottle,bottle,06_filtry do butelek Soft i Solid,03_butelki filtrujące SOLID,03_butelki filtrujące SOLID,06_filtry do butelek Soft i Solid,03_butelki filtrujące SOLID
2,ANON_0000157,bottle,pitcher,pitcher,pitcher,pitcher,06_filtry do butelek Soft i Solid,10_dzbanki filtrujące Crystal,10_dzbanki filtrujące Crystal,13_filtry do dzbanków standard,13_filtry do dzbanków standard
3,ANON_0000171,bottle,bottle,bottle,bottle,bottle,07_akcesoria do Soft/Solid,03_butelki filtrujące SOLID,02_butelki filtrujące SOFT,03_butelki filtrujące SOLID,07_akcesoria do Soft/Solid
4,ANON_0000235,bottle,bottle,bottle,bottle,bottle,07_akcesoria do Soft/Solid,03_butelki filtrujące SOLID,02_butelki filtrujące SOFT,02_butelki filtrujące SOFT,03_butelki filtrujące SOLID


### Real paths (final)

We construct real customer paths from
`order_pg_ecosystem_sets_3y_k5.parquet`.

- One row per customer
- Exactly 5 purchases (k = 1..5)
- State per step: (Eₖ, Gₖ)
- Same schema as simulator output

This dataset is used exclusively for validation.


In [75]:
sim_cohort = pd.DataFrame(
    simulate_one_customer(i) for i in range(10_000)
)


In [76]:
def step_marginal(df, col_prefix, k):
    return (
        df[f"{col_prefix}{k}"]
        .value_counts(normalize=True)
        .sort_index()
    )


In [77]:
ecosystem_marginals = {}

for k in range(1, 6):
    ecosystem_marginals[k] = pd.DataFrame({
        "real": step_marginal(real_wide, "E", k),
        "sim": step_marginal(sim_cohort, "E", k),
    }).fillna(0.0)

ecosystem_marginals[1]


Unnamed: 0_level_0,real,sim
E1,Unnamed: 1_level_1,Unnamed: 2_level_1
CO2,0.061971,0.007
Proskin,0.0,0.0009
PushAir,0.033437,0.0333
bottle,0.54436,0.5112
container,0.017833,0.0166
flow_comfort,0.046366,0.0145
keton,0.001337,0.0023
other,0.007579,0.0263
pitcher,0.236291,0.365
sink,0.050825,0.0229


In [78]:
product_group_marginals = {}

for k in range(1, 6):
    product_group_marginals[k] = pd.DataFrame({
        "real": step_marginal(real_wide, "G", k),
        "sim": step_marginal(sim_cohort, "G", k),
    }).fillna(0.0)

product_group_marginals[1].head(10)


Unnamed: 0_level_0,real,sim
G1,Unnamed: 1_level_1,Unnamed: 2_level_1
01_bidony,0.024075,0.0147
02_butelki filtrujące SOFT,0.081141,0.0432
03_butelki filtrujące SOLID,0.221578,0.1735
04_termiczna butelka filtrująca SOLID,0.001783,0.0031
05_butelki i kubki termiczne,0.015158,0.0186
06_filtry do butelek Soft i Solid,0.176549,0.2123
07_akcesoria do Soft/Solid,0.044137,0.055
08_dzbanki filtrujące manualne,0.027196,0.0614
09_dzbanki filtrujące LED,0.025412,0.0202
10_dzbanki filtrujące Crystal,0.052162,0.0723


In [79]:
def l1_distance(df):
    return (df["real"] - df["sim"]).abs().sum()

ecosystem_l1 = {
    k: l1_distance(ecosystem_marginals[k])
    for k in range(1, 6)
}

product_group_l1 = {
    k: l1_distance(product_group_marginals[k])
    for k in range(1, 6)
}

ecosystem_l1, product_group_l1


({1: 0.29858537672759694,
  2: 0.31226901471243873,
  3: 0.28352741863575565,
  4: 0.29568568880962987,
  5: 0.3141357111012037},
 {1: 0.4606534106107891,
  2: 0.4847289344627731,
  3: 0.42559554168524294,
  4: 0.3823024520731164,
  5: 0.40499358002674984})

### Interpretation of marginal diagnostics

Observed L1 distances (0.3–0.5) indicate structural mismatch.

This is expected at this stage because:
- simulation conditions on entry, not survival
- product groups do not influence ecosystem transitions
- multi-item orders are collapsed to scalars

These diagnostics are used to localize missing structure,
not to tune or optimize the simulator.


In [81]:
# real entry among survivors
real_entry_survivors = (
    real_wide["E1"]
    .value_counts(normalize=True)
    .sort_index()
)

# simulated entry
sim_entry = (
    sim_cohort["E1"]
    .value_counts(normalize=True)
    .sort_index()
)

entry_compare = pd.DataFrame({
    "real_survivors": real_entry_survivors,
    "sim_entry": sim_entry,
}).fillna(0.0)

entry_compare


Unnamed: 0_level_0,real_survivors,sim_entry
E1,Unnamed: 1_level_1,Unnamed: 2_level_1
CO2,0.061971,0.007
Proskin,0.0,0.0009
PushAir,0.033437,0.0333
bottle,0.54436,0.5112
container,0.017833,0.0166
flow_comfort,0.046366,0.0145
keton,0.001337,0.0023
other,0.007579,0.0263
pitcher,0.236291,0.365
sink,0.050825,0.0229


In [82]:
entry_l1 = (entry_compare["real_survivors"] - entry_compare["sim_entry"]).abs().sum()
entry_l1


0.29858537672759694

In [83]:
cond_pg_E1 = {}

for e in real_wide["E1"].unique():
    real_cond = (
        real_wide
            .query("E1 == @e")["G1"]
            .value_counts(normalize=True)
    )

    sim_cond = (
        sim_cohort
            .query("E1 == @e")["G1"]
            .value_counts(normalize=True)
    )

    cond_pg_E1[e] = (
        pd.DataFrame({"real": real_cond, "sim": sim_cond})
          .fillna(0.0)
    )


In [84]:
cond_pg_E1["bottle"].head(10)


Unnamed: 0_level_0,real,sim
G1,Unnamed: 1_level_1,Unnamed: 2_level_1
01_bidony,0.01638,0.019562
02_butelki filtrujące SOFT,0.14742,0.068075
03_butelki filtrujące SOLID,0.402948,0.286189
04_termiczna butelka filtrująca SOLID,0.002457,0.004304
05_butelki i kubki termiczne,0.026208,0.021909
06_filtry do butelek Soft i Solid,0.323505,0.346244
07_akcesoria do Soft/Solid,0.081081,0.092332
08_dzbanki filtrujące manualne,0.0,0.016041
09_dzbanki filtrujące LED,0.0,0.00626
10_dzbanki filtrujące Crystal,0.0,0.017997


In [85]:
(cond_pg_E1["bottle"]["real"] - cond_pg_E1["bottle"]["sim"]).abs().sum()


0.4008057353127776

In [86]:
transition_real = (
    real_wide
        .groupby(["E1", "E2"])
        .size()
        .groupby(level=0)
        .apply(lambda s: s / s.sum())
)

transition_sim = (
    sim_cohort
        .groupby(["E1", "E2"])
        .size()
        .groupby(level=0)
        .apply(lambda s: s / s.sum())
)


In [87]:
e = "bottle"

transition_compare = pd.DataFrame({
    "real": transition_real.loc[e],
    "sim": transition_sim.loc[e],
}).fillna(0.0)

transition_compare


Unnamed: 0_level_0,Unnamed: 1_level_0,real,sim
E1,E2,Unnamed: 2_level_1,Unnamed: 3_level_1
bottle,CO2,0.013104,0.006651
bottle,Proskin,0.0,0.000391
bottle,PushAir,0.013104,0.026995
bottle,bottle,0.864046,0.721831
bottle,container,0.005733,0.010955
bottle,flow_comfort,0.011466,0.006064
bottle,keton,0.000819,0.001174
bottle,other,0.005733,0.039319
bottle,pitcher,0.079443,0.179969
bottle,sink,0.006552,0.006651


In [88]:
(transition_compare["real"] - transition_compare["sim"]).abs().sum()


0.30813944546338906

In [89]:
def l1(p, q):
    idx = p.index.union(q.index)
    p = p.reindex(idx, fill_value=0.0)
    q = q.reindex(idx, fill_value=0.0)
    return (p - q).abs().sum()

def marginals_table(real_df, sim_df, prefix, K=5, top_n=8):
    rows = []
    for k in range(1, K+1):
        r = real_df[f"{prefix}{k}"].value_counts(normalize=True)
        s = sim_df[f"{prefix}{k}"].value_counts(normalize=True)
        rows.append({
            "k": k,
            "L1": l1(r, s),
            "real_top": ", ".join(r.sort_values(ascending=False).head(top_n).index.astype(str)),
            "sim_top": ", ".join(s.sort_values(ascending=False).head(top_n).index.astype(str)),
        })
    return pd.DataFrame(rows)

# simulate a cohort big enough for stable numbers
sim_cohort = pd.DataFrame(simulate_one_customer(i) for i in range(20_000))

ecos_diag = marginals_table(real_wide, sim_cohort, "E", K=5)
pg_diag   = marginals_table(real_wide, sim_cohort, "G", K=5)

ecos_diag, pg_diag


(   k        L1                                           real_top  \
 0  1  0.301211  bottle, pitcher, CO2, sink, flow_comfort, Push...   
 1  2  0.314311  bottle, pitcher, CO2, sink, flow_comfort, Push...   
 2  3  0.271927  bottle, pitcher, CO2, sink, flow_comfort, Push...   
 3  4  0.295786  bottle, pitcher, CO2, flow_comfort, sink, Push...   
 4  5  0.286236  bottle, pitcher, CO2, sink, flow_comfort, Push...   
 
                                              sim_top  
 0  bottle, pitcher, PushAir, other, sink, contain...  
 1  bottle, pitcher, PushAir, other, sink, contain...  
 2  bottle, pitcher, other, PushAir, CO2, sink, co...  
 3  bottle, pitcher, CO2, PushAir, sink, other, fl...  
 4  bottle, pitcher, CO2, sink, PushAir, flow_comf...  ,
    k        L1                                           real_top  \
 0  1  0.468153  03_butelki filtrujące SOLID, 06_filtry do bute...   
 1  2  0.463329  06_filtry do butelek Soft i Solid, 03_butelki ...   
 2  3  0.427496  06_filtry do b

In [90]:
def transition_table(real_df, sim_df, k_from=1, k_to=2, top_from=5):
    a, b = f"E{k_from}", f"E{k_to}"

    real_from = real_df[a].value_counts(normalize=True).head(top_from).index
    out = {}

    for e in real_from:
        r = (real_df[real_df[a]==e][b].value_counts(normalize=True))
        s = (sim_df[sim_df[a]==e][b].value_counts(normalize=True))

        comp = pd.DataFrame({"real": r, "sim": s}).fillna(0.0)
        comp["abs_diff"] = (comp["real"] - comp["sim"]).abs()
        out[e] = comp.sort_values("abs_diff", ascending=False).head(8)

    return out

trans_12 = transition_table(real_wide, sim_cohort, 1, 2, top_from=5)
trans_12


{'bottle':                   real       sim  abs_diff
 E2                                        
 bottle        0.864046  0.723774  0.140272
 pitcher       0.079443  0.177332  0.097889
 other         0.005733  0.040140  0.034407
 PushAir       0.013104  0.022785  0.009681
 container     0.005733  0.013380  0.007647
 flow_comfort  0.011466  0.006108  0.005358
 CO2           0.013104  0.009308  0.003796
 sink          0.006552  0.005526  0.001026,
 'pitcher':                   real       sim  abs_diff
 E2                                        
 pitcher       0.711321  0.624915  0.086406
 bottle        0.177358  0.262451  0.085092
 PushAir       0.009434  0.030126  0.020692
 sink          0.022642  0.010042  0.012599
 other         0.039623  0.036233  0.003390
 flow_comfort  0.007547  0.004614  0.002933
 container     0.018868  0.018184  0.000684
 Proskin       0.000000  0.000543  0.000543,
 'CO2':                   real       sim  abs_diff
 E2                                        
 C

## Model v0 — Structural Simulator (No Revenue)

### What it does
- Simulates customer purchase paths over 5 purchases.
- Each purchase k is represented by:
  - Eₖ: ecosystem
  - Gₖ: product group
- Uses empirical components:
  - Entry distribution for E₁
  - P(Gₖ | Eₖ, k)
  - Markov transitions P(Eₖ₊₁ | Eₖ, k)

### What we validated
We compared simulated vs real customer paths (restricted to customers who reach 5 purchases):
- Marginal distributions of Eₖ and Gₖ for k=1..5
- Transition distributions (e.g., E₁ → E₂) for major ecosystems

### Key findings
- The simulator reproduces the dominant modes (major ecosystems and product groups), but shows systematic mismatch.
- The largest mismatch drivers are structural:
  1) Survivorship bias: real cohort is conditioned on reaching 5 purchases.
  2) Product-group persistence: real customers show “lock-in” to consumables, which is not yet modeled.
  3) Coupling: ecosystem transitions depend on what was bought (Gₖ), not only current ecosystem (Eₖ).

### Next upgrade (v1)
We will implement one structural enhancement (choose one):
- Survivor-conditioned entry distribution, or
- Product-group persistence (Gₖ → Gₖ₊₁), or
- Coupled transitions P(Eₖ₊₁ | Eₖ, Gₖ)

This will materially improve alignment before adding economics.


In [91]:
real_wide.to_parquet(
    DATA_DIR / "real_paths_k5_EG.parquet",
    index=False
)

sim_cohort.to_parquet(
    DATA_DIR / "sim_paths_v0_k5.parquet",
    index=False
)
