In [2]:
import pandas as pd
import numpy as np

# -----------------------------
# Params (LOCKED)
# -----------------------------
HORIZON_DAYS = 1095
K_MAX = 5

# -----------------------------
# Inputs (assumptions)
# -----------------------------
# Expecting canonical dataset with at least:
# - anon (customer id)
# - date (purchase date)
# - is_purchase (boolean)
#
# Example: cust_day_group is already loaded from parquet
# cust_day_group = pd.read_parquet("...")

cust_day_group = pd.read_parquet("../data/interim/cust_day_group.parquet")

df = cust_day_group.copy()

# Ensure date is clean + normalized to day
df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.normalize()
df = df.dropna(subset=["anon", "date"])

# Keep purchase rows only
dfp = df[df["is_purchase"]].copy()

# -----------------------------
# 1) Collapse to unique (anon, date) purchase-days
# -----------------------------
orders = (
    dfp[["anon", "date"]]
    .drop_duplicates()
    .sort_values(["anon", "date"])
    .reset_index(drop=True)
)

# -----------------------------
# 2) First purchase date per anon + customer clock
# -----------------------------
orders["first_date"] = orders.groupby("anon")["date"].transform("min")
orders["days_since_first"] = (orders["date"] - orders["first_date"]).dt.days

# Horizon filter: keep 0..1095 days inclusive
orders = orders[
    (orders["days_since_first"] >= 0) &
    (orders["days_since_first"] <= HORIZON_DAYS)
].copy()

# -----------------------------
# 3) Rebuild purchase_k *within horizon*
# -----------------------------
# purchase_k = 1 for the first purchase day, 2 for the second, ...
orders["purchase_k"] = orders.groupby("anon").cumcount() + 1

# Tail bucket for truncation logic (keeps full purchase_k too)
orders["k_bucket"] = np.where(
    orders["purchase_k"] >= (K_MAX + 1),
    "6+",
    orders["purchase_k"].astype(str)
)

# Convenience flag: is this row within the modeled truncation (k<=5)
orders["in_k_leq_5"] = orders["purchase_k"] <= K_MAX

# Optional: how many purchases each customer has within horizon
orders["n_purchases_in_horizon"] = orders.groupby("anon")["purchase_k"].transform("max")

# -----------------------------
# 4) Final order-level foundation table
# -----------------------------
order_timeline = orders[[
    "anon",
    "date",
    "first_date",
    "days_since_first",
    "purchase_k",
    "k_bucket",
    "in_k_leq_5",
    "n_purchases_in_horizon",
]].sort_values(["anon", "date"]).reset_index(drop=True)

order_timeline.head(10)


  df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.normalize()


Unnamed: 0,anon,date,first_date,days_since_first,purchase_k,k_bucket,in_k_leq_5,n_purchases_in_horizon
0,ANON_0000001,2022-11-09,2022-11-09,0,1,1,True,1
1,ANON_0000002,2022-11-09,2022-11-09,0,1,1,True,1
2,ANON_0000003,2022-11-09,2022-11-09,0,1,1,True,1
3,ANON_0000004,2022-11-09,2022-11-09,0,1,1,True,1
4,ANON_0000005,2022-11-09,2022-11-09,0,1,1,True,1
5,ANON_0000006,2022-11-09,2022-11-09,0,1,1,True,1
6,ANON_0000007,2022-11-09,2022-11-09,0,1,1,True,1
7,ANON_0000008,2022-11-09,2022-11-09,0,1,1,True,1
8,ANON_0000009,2022-11-10,2022-11-10,0,1,1,True,1
9,ANON_0000010,2022-11-10,2022-11-10,0,1,1,True,1


In [4]:
# Choose your project path convention
# Example:
# order_timeline.to_parquet("data/interim/order_timeline_3y_k5.parquet", index=False)

order_timeline.to_parquet("../data/interim/order_timeline_3y_k5.parquet", index=False)


In [13]:
entry = pd.read_parquet("../data/interim/entry_counts.parquet")

T12 = pd.read_parquet("../data/interim/transition_1_to_2.parquet")
T23 = pd.read_parquet("../data/interim/transition_2_to_3.parquet")
T34 = pd.read_parquet("../data/interim/transition_3_to_4.parquet")
T45 = pd.read_parquet("../data/interim/transition_4_to_5.parquet")


In [14]:
import numpy as np

def check_row_sums(T, tol=1e-6):
    row_sums = T.sum(axis=1)
    bad = row_sums[~np.isclose(row_sums, 1.0, atol=tol)]
    return bad.sort_values(ascending=False)

# Example usage
check_row_sums(T12)
check_row_sums(T23)
check_row_sums(T34)
check_row_sums(T45)


Series([], dtype: float64)