In [2]:
# --- imports ---
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Tuple, List, Optional

import numpy as np
import pandas as pd

# Root directory of the intermediate artifacts read by this notebook.
INTERIM = Path("../data/interim")

# (from_step, to_step) -> parquet file with the matching ecosystem transition matrix.
T_PATHS = {
    (1, 2): INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_1_to_2.parquet"),
    (2, 3): INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_2_to_3.parquet"),
    (3, 4): INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_3_to_4.parquet"),
    (4, 5): INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_4_to_5.parquet"),
}

# Load every matrix, keeping the (from_step, to_step) key.
T_e = {step_pair: pd.read_parquet(parquet_file) for step_pair, parquet_file in T_PATHS.items()}

# Cell output: shape of each loaded matrix (sanity check, nothing printed).
{step_pair: matrix.shape for step_pair, matrix in T_e.items()}



{(1, 2): (10, 10), (2, 3): (10, 10), (3, 4): (10, 10), (4, 5): (10, 10)}

In [3]:


# Load the entry-count table and list the columns it provides.
entry_counts = pd.read_parquet(INTERIM.joinpath("entry_counts.parquet"))

entry_counts.columns


Index(['product_group', 'count', 'pct_of_customers'], dtype='object')

In [4]:
# Reference table read from CSV; the merge in a later cell joins it on
# "product_group" to attach an "ecosystem" label to each entry-count row.
eco_map = pd.read_csv("../data/reference/products_ecosystem.csv")

# Cell output: the columns available in the mapping.
eco_map.columns


Index(['product_group', 'ecosystem'], dtype='object')

In [5]:
# Reload the entry counts (same file as the earlier inspection cell).
entry_counts = pd.read_parquet(INTERIM / "entry_counts.parquet")

# Attach the ecosystem label to every product group.
# validate="many_to_one" makes pandas raise a MergeError if a product_group
# occurs more than once in eco_map; without it a duplicated key would silently
# duplicate rows here and inflate the counts summed per ecosystem afterwards.
entry_with_eco = entry_counts.merge(
    eco_map,
    on="product_group",
    how="left",
    validate="many_to_one",
)

# safety check: number of rows left without an ecosystem after the left join
entry_with_eco["ecosystem"].isna().sum()


0

In [6]:
# Share of entry counts per ecosystem: sum "count" within each ecosystem,
# divide by the grand total, and sort from most to least frequent.
# Result: index = ecosystem, single column "p".
entry_ecosystem_dist = (
    entry_with_eco
    .groupby("ecosystem", as_index=True)["count"]
    .sum()
    .pipe(lambda s: s / s.sum())
    .to_frame("p")
    .sort_values("p", ascending=False)
)

# Persist the distribution next to the other interim artifacts.
entry_ecosystem_dist.to_parquet(
    INTERIM / "ecosystem_entry_distribution.parquet"
)

# Keep the frame as the LAST expression: a bare expression in the middle of a
# cell is evaluated but never rendered, so it must come after the write.
entry_ecosystem_dist



In [7]:
import numpy as np
import pandas as pd
from pathlib import Path

# Root directory of the intermediate artifacts (restated in this cell).
INTERIM = Path("../data/interim")

# Long-format table with one row per (ecosystem, purchase_k, product_group)
# and a probability column "p", as shown in the preview below.
P_G_EK = pd.read_parquet(INTERIM.joinpath("P_product_group_given_ecosystem_k.parquet"))

# Cell output: first five rows.
P_G_EK.head(5)


Unnamed: 0,ecosystem,purchase_k,product_group,n,p
0,CO2,1,32_nab√≥j CO2 - wymiana,720,0.540947
1,CO2,1,31_nab√≥j CO2 - zakup,204,0.153268
2,CO2,1,34_suplementy PushAir,81,0.060856
3,CO2,1,30_saturator PushAir,79,0.059354
4,CO2,1,33_butelki do saturatora PushAir,78,0.058603


In [8]:
def sample_product_group(rng: np.random.Generator, ecosystem: str, k: int, P_G_EK: pd.DataFrame) -> str:
    """Draw one product group for the given ecosystem at purchase step ``k``.

    Args:
        rng: NumPy random generator used for the draw.
        ecosystem: Value matched against the ``ecosystem`` column.
        k: Value matched against the ``purchase_k`` column.
        P_G_EK: Table with columns ``ecosystem``, ``purchase_k``,
            ``product_group`` and ``p`` (weight of each product group).

    Returns:
        The sampled product group as a plain ``str``.

    Raises:
        KeyError: No rows exist for this ``(ecosystem, k)`` pair.
        ValueError: The matching weights cannot be normalised (negative
            values, NaN/inf, or a sum that is not positive).
    """
    mask = (P_G_EK["ecosystem"] == ecosystem) & (P_G_EK["purchase_k"] == k)
    sub = P_G_EK.loc[mask, ["product_group", "p"]]
    if sub.empty:
        raise KeyError(f"No P(G|E,k) rows for ecosystem={ecosystem!r}, k={k}")

    labels = sub["product_group"].astype(str).to_numpy()
    probs = sub["p"].astype(float).to_numpy()

    # Dividing by a zero/NaN total would yield NaN weights and an opaque error
    # from rng.choice; fail here with the offending (ecosystem, k) instead.
    total = probs.sum()
    if not np.isfinite(total) or total <= 0 or (probs < 0).any():
        raise ValueError(
            f"P(G|E,k) weights for ecosystem={ecosystem!r}, k={k} must be "
            "non-negative with a positive, finite sum"
        )
    probs = probs / total  # safety normalize

    return str(rng.choice(labels, p=probs))


In [9]:
# Smoke test: sample for the (ecosystem, purchase_k) pair found in the first table row.
rng = np.random.default_rng(42)
sample_product_group(
    rng,
    ecosystem=P_G_EK["ecosystem"].iloc[0],
    k=int(P_G_EK["purchase_k"].iloc[0]),
    P_G_EK=P_G_EK,
)


'30_saturator PushAir'

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

# Root directory of the intermediate artifacts (restated in this cell).
INTERIM = Path("../data/interim")

# Transition matrices re-keyed by the source step k (1 means step 1 -> 2, ...),
# which is how sample_next_ecosystem looks them up (T_e[k]).
T_e = {
    1: pd.read_parquet(INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_1_to_2.parquet")),
    2: pd.read_parquet(INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_2_to_3.parquet")),
    3: pd.read_parquet(INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_3_to_4.parquet")),
    4: pd.read_parquet(INTERIM.joinpath("ecosystem_transitions", "ecosystem_transition_4_to_5.parquet")),
}

# Cell output: shape of each loaded matrix.
{step: matrix.shape for step, matrix in T_e.items()}


{1: (10, 10), 2: (10, 10), 3: (10, 10), 4: (10, 10)}

In [11]:
def sample_next_ecosystem(rng: np.random.Generator, ecosystem: str, k: int, T_e: dict[int, pd.DataFrame]) -> str:
    """Draw the ecosystem of step ``k + 1`` given the ecosystem at step ``k``.

    Args:
        rng: NumPy random generator used for the draw.
        ecosystem: Current ecosystem; must be a label in the matrix index.
        k: Source step; selects the matrix ``T_e[k]``.
        T_e: Mapping from source step to a transition matrix whose rows are
            the current ecosystem and whose columns are the next ecosystem.

    Returns:
        The sampled next ecosystem as a plain ``str``.

    Raises:
        KeyError: ``k`` has no matrix, or ``ecosystem`` is not a row of it.
        ValueError: The selected row cannot be normalised (negative values,
            NaN/inf, or a sum that is not positive).
    """
    if k not in T_e:
        raise KeyError(f"No transition matrix for k={k} (need k in {list(T_e.keys())})")

    T = T_e[k]

    if ecosystem not in T.index:
        raise KeyError(f"Ecosystem {ecosystem!r} not in transition matrix index for k={k}")

    probs = T.loc[ecosystem].astype(float).to_numpy()

    # A row that is all zeros / NaN would normalise to NaN and surface as an
    # opaque rng.choice error; report the offending row explicitly instead.
    total = probs.sum()
    if not np.isfinite(total) or total <= 0 or (probs < 0).any():
        raise ValueError(
            f"Transition row for ecosystem={ecosystem!r}, k={k} must be "
            "non-negative with a positive, finite sum"
        )
    probs = probs / total  # safety normalize

    return str(rng.choice(T.columns.astype(str).to_numpy(), p=probs))


In [12]:
# Smoke test: transition out of the first ecosystem listed in the 1 -> 2 matrix.
rng = np.random.default_rng(42)
e0 = T_e[1].index[0]
sample_next_ecosystem(
    rng,
    ecosystem=str(e0),
    k=1,
    T_e=T_e,
)


'bottle'

In [13]:
from dataclasses import dataclass
import numpy as np
import pandas as pd

# expects: entry_ecosystem_dist with index 'ecosystem' and column 'p'
# expects: P_G_EK as your parquet (ecosystem, purchase_k, product_group, p)
# expects: T_e dict: {1: T12, 2: T23, 3: T34, 4: T45}

@dataclass(frozen=True)
class SimStep:
    """Immutable record of one simulated purchase step."""
    # Purchase step number within the simulated path.
    k: int
    # Ecosystem the path is in at this step.
    ecosystem: str
    # Product group sampled at this step.
    product_group: str

def _sample_from_probs(rng: np.random.Generator, labels: np.ndarray, probs: np.ndarray) -> str:
    """Draw one label according to (possibly unnormalised) weights.

    Args:
        rng: NumPy random generator used for the draw.
        labels: Candidate labels (any array-like); converted to ``str``.
        probs: Non-negative weights aligned with ``labels`` (any array-like);
            they are rescaled to sum to 1 before sampling.

    Returns:
        The sampled label as a plain ``str``.

    Raises:
        ValueError: The weights contain negative or non-finite values, or
            their sum is not positive, so no distribution can be formed.
    """
    candidates = np.asarray(labels).astype(str)
    weights = np.asarray(probs).astype(float)

    # Normalising by a zero/NaN total would produce NaN weights and an opaque
    # error from rng.choice; reject such input up front with a clear message.
    total = weights.sum()
    if not np.isfinite(total) or total <= 0 or (weights < 0).any():
        raise ValueError(
            "Sampling weights must be non-negative with a positive, finite sum; "
            f"got sum={total!r} over {weights.size} label(s)"
        )

    return str(rng.choice(candidates, p=weights / total))

def sample_entry_ecosystem(rng: np.random.Generator, entry_ecosystem_dist: pd.DataFrame) -> str:
    """Draw the ecosystem of the first purchase.

    The frame's index supplies the candidate ecosystems and its ``p`` column
    the matching weights.
    """
    entry_probs = entry_ecosystem_dist["p"]
    ecosystems = entry_probs.index.to_numpy()
    weights = entry_probs.to_numpy()
    return _sample_from_probs(rng, ecosystems, weights)

def sample_product_group(rng: np.random.Generator, ecosystem: str, k: int, P_G_EK: pd.DataFrame) -> str:
    """Draw a product group for ``ecosystem`` at purchase step ``k``.

    Rows of ``P_G_EK`` are filtered on the ``ecosystem`` and ``purchase_k``
    columns; the ``p`` column of the remaining rows weights the draw.

    Raises:
        KeyError: No rows match the requested ``(ecosystem, k)`` pair.
    """
    in_ecosystem = P_G_EK["ecosystem"] == ecosystem
    at_step = P_G_EK["purchase_k"] == k
    candidates = P_G_EK[in_ecosystem & at_step]

    if candidates.empty:
        raise KeyError(f"No P(G|E,k) for ecosystem={ecosystem!r}, k={k}")

    groups = candidates["product_group"].to_numpy()
    weights = candidates["p"].to_numpy()
    return _sample_from_probs(rng, groups, weights)

def sample_next_ecosystem(rng: np.random.Generator, ecosystem: str, k: int, T_e: dict[int, pd.DataFrame]) -> str:
    """Draw the next ecosystem from row ``ecosystem`` of the matrix ``T_e[k]``.

    The matrix columns are the candidate next ecosystems; the selected row
    provides their weights.

    Raises:
        KeyError: ``k`` has no matrix, or ``ecosystem`` is not in its index.
    """
    if k not in T_e:
        raise KeyError(f"No transition matrix for k={k}")

    matrix = T_e[k]
    if ecosystem not in matrix.index:
        raise KeyError(f"Ecosystem {ecosystem!r} not in T_e[{k}] index")

    weights = matrix.loc[ecosystem].astype(float).to_numpy()
    targets = matrix.columns.to_numpy()
    return _sample_from_probs(rng, targets, weights)


In [14]:
rng = np.random.default_rng(42)

# Manual chain: entry ecosystem -> product group at step 1 -> ecosystem at step 2.
E1 = sample_entry_ecosystem(rng, entry_ecosystem_dist=entry_ecosystem_dist)
G1 = sample_product_group(rng, ecosystem=E1, k=1, P_G_EK=P_G_EK)
E2 = sample_next_ecosystem(rng, ecosystem=E1, k=1, T_e=T_e)

(E1, G1, E2)


('pitcher', '08_dzbanki filtrujƒÖce manualne', 'pitcher')

In [15]:
from typing import Tuple, Optional

def step_forward(
    rng: np.random.Generator,
    k: int,
    ecosystem_k: str,
    P_G_EK: pd.DataFrame,
    T_e: dict[int, pd.DataFrame],
    K: int = 5,
) -> Tuple[str, Optional[str]]:
    """
    Advance the simulation by one purchase step.

    Args:
        rng         : NumPy random generator shared by both draws
        k           : current purchase step
        ecosystem_k : ecosystem at step k
        P_G_EK      : table passed through to sample_product_group
        T_e         : transition matrices passed through to sample_next_ecosystem
        K           : last step of the horizon; no transition is sampled once
                      k >= K (defaults to 5, the previously hard-coded value)

    Returns:
        G_k       : sampled product group at step k
        E_k_plus1 : sampled next ecosystem (None if k >= K)
    """

    # 1) sample product group
    G_k = sample_product_group(
        rng=rng,
        ecosystem=ecosystem_k,
        k=k,
        P_G_EK=P_G_EK,
    )

    # 2) the final step has no successor, so skip the transition draw there
    if k >= K:
        return G_k, None

    E_k_plus1 = sample_next_ecosystem(
        rng=rng,
        ecosystem=ecosystem_k,
        k=k,
        T_e=T_e,
    )

    return G_k, E_k_plus1


In [16]:
rng = np.random.default_rng(42)

# Entry draw first, then one combined step (product group + next ecosystem).
E1 = sample_entry_ecosystem(rng, entry_ecosystem_dist)
G1, E2 = step_forward(rng, 1, E1, P_G_EK, T_e)

(E1, G1, E2)


('pitcher', '08_dzbanki filtrujƒÖce manualne', 'pitcher')

In [17]:
from typing import List

def simulate_path(
    rng: np.random.Generator,
    entry_ecosystem_dist: pd.DataFrame,
    P_G_EK: pd.DataFrame,
    T_e: dict[int, pd.DataFrame],
    K: int = 5,
) -> List[SimStep]:
    """
    Simulate the purchase path of a single customer.

    The entry ecosystem is drawn once; afterwards every step k = 1..K draws a
    product group and the following ecosystem through step_forward. The walk
    ends early as soon as step_forward reports no next ecosystem (it does so
    at k == 5 when called as here), so at most 5 steps are produced.

    Returns:
        One SimStep per simulated purchase, in step order.
    """
    steps: List[SimStep] = []
    current = sample_entry_ecosystem(rng, entry_ecosystem_dist)

    k = 1
    while k <= K:
        product_group, upcoming = step_forward(
            rng=rng,
            k=k,
            ecosystem_k=current,
            P_G_EK=P_G_EK,
            T_e=T_e,
        )
        steps.append(SimStep(k=k, ecosystem=current, product_group=product_group))

        # No successor ecosystem: the path is complete.
        if upcoming is None:
            return steps

        current = upcoming
        k += 1

    return steps


In [18]:
rng = np.random.default_rng(42)

# One full synthetic path with the default horizon.
simulate_path(rng, entry_ecosystem_dist, P_G_EK, T_e)


[SimStep(k=1, ecosystem='pitcher', product_group='08_dzbanki filtrujƒÖce manualne'),
 SimStep(k=2, ecosystem='pitcher', product_group='06_filtry do butelek Soft i Solid'),
 SimStep(k=3, ecosystem='bottle', product_group='35_pojemniki SeeYou'),
 SimStep(k=4, ecosystem='other', product_group='13_filtry do dzbank√≥w standard'),
 SimStep(k=5, ecosystem='bottle', product_group='03_butelki filtrujƒÖce SOLID')]

## ✅ Notebook Summary — Structural Forward Simulator

In this notebook we constructed a **purely structural forward simulator** of customer purchase paths over a fixed 5-purchase horizon.

### What this simulator does
For each synthetic customer, the simulator generates a sequence:

(E₁, G₁), (E₂, G₂), …, (E₅, G₅)

where:
- **Eₖ** is the ecosystem at purchase step *k*
- **Gₖ** is the product group purchased at step *k*

The simulation is **empirical and generative**, built entirely from observed transition frequencies.

---

## 🔒 Locked modeling assumptions

- **Horizon:** 5 purchases (`k = 1..5`)
- **Entry state:**  
  E₁ ~ P(E₁) from the observed entry ecosystem distribution
- **Within-step choice:**  
  Gₖ ~ P(Gₖ | Eₖ, k)
- **Ecosystem transitions:**  
  Eₖ₊₁ ~ P(Eₖ₊₁ | Eₖ, k)
- **Structural only:**  
  - no revenue  
  - no optimisation  
  - no ML / fitting  
  - no policy changes  

All probabilities are taken **directly from historical data**.

---

## 📦 Inputs used (existing artifacts)

- `entry_ecosystem_dist`  
  → empirical distribution of entry ecosystems
- `P_product_group_given_ecosystem_k.parquet`  
  → P(Gₖ | Eₖ, k)
- `ecosystem_transition_{k}_to_{k+1}.parquet`  
  → P(Eₖ₊₁ | Eₖ, k)

No distributions were re-estimated in this notebook.

---

## 🧠 What this enables next

This simulator provides a **stable backbone** for:
- validation against observed paths
- counterfactual ecosystem dynamics
- extension to a 6+ tail
- layering revenue, pricing, or LTV on top
- stress-testing ecosystem strategies

All future extensions can be built **without modifying the structural core** defined here.

---

## ▶️ Next notebook

**Notebook 09 — Simulation Diagnostics & Validation**
- simulate customer cohorts
- compare simulated vs. real distributions
- identify structural gaps before adding economics
