In [None]:
import duckdb
import os
from pathlib import Path

# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a README.md.
start_dir = Path.cwd()
repo_root = next(
    (d for d in (start_dir, *start_dir.parents) if (d / "README.md").exists()),
    None,
)
if repo_root is None:
    # Fail here with a clear message instead of silently switching to the
    # filesystem root, where the relative data paths below cannot resolve.
    raise FileNotFoundError(
        f"No README.md found in {start_dir} or any parent directory; "
        "cannot determine the repo root."
    )
p = repo_root  # original variable name, kept as an alias
os.chdir(repo_root)
print("Repo root set to:", Path.cwd())

# DuckDB connection (no database file given) with capped threads and memory.
# The temp directory is relative, so it resolves against the repo root set above.
con = duckdb.connect()
con.execute("SET temp_directory='data/processed/tmp';")
con.execute("PRAGMA threads=4;")
con.execute("PRAGMA memory_limit='1GB';")

# `events`: the October and November 2019 parquet files exposed as one view.
con.execute("""
CREATE OR REPLACE VIEW events AS
SELECT * FROM read_parquet([
  'data/processed/events_2019_oct.parquet',
  'data/processed/events_2019_nov.parquet'
]);
""")

# `events_enriched`: every column of `events` plus two values derived from
# event_time: the calendar date and a 'YYYY-MM' month key used for grouping.
con.execute("""
CREATE OR REPLACE VIEW events_enriched AS
SELECT
  *,
  DATE(event_time) AS event_date,
  STRFTIME(event_time, '%Y-%m') AS event_month
FROM events;
""")


In [None]:
# Funnel by month (sequential conversion).
#
# A user reaches a step in a month if they have at least one event of that type
# in that month (event order within the month is not checked). Each step count
# requires all earlier steps, so every step is a subset of the previous one and
# step-to-step rates cannot exceed 1:
#   view_users     = viewed
#   cart_users     = viewed AND carted
#   purchase_users = viewed AND carted AND purchased
# view_to_purchase_rate is the overall rate "viewed AND purchased" / "viewed";
# it deliberately does not require a cart event.
funnel_by_month = con.execute("""
WITH user_steps AS (
  -- One row per (month, user) with a 0/1 flag per event type.
  SELECT
    event_month,
    user_id,
    MAX(CASE WHEN event_type='view' THEN 1 ELSE 0 END) AS did_view,
    MAX(CASE WHEN event_type='cart' THEN 1 ELSE 0 END) AS did_cart,
    MAX(CASE WHEN event_type='purchase' THEN 1 ELSE 0 END) AS did_purchase
  FROM events_enriched
  GROUP BY event_month, user_id
),
monthly AS (
  -- User counts per month; computed once and reused for the rates below.
  SELECT
    event_month,
    SUM(did_view) AS view_users,
    SUM(CASE WHEN did_view=1 AND did_cart=1 THEN 1 ELSE 0 END) AS cart_users,
    SUM(CASE WHEN did_view=1 AND did_cart=1 AND did_purchase=1 THEN 1 ELSE 0 END) AS purchase_users,
    SUM(CASE WHEN did_view=1 AND did_purchase=1 THEN 1 ELSE 0 END) AS view_purchase_users
  FROM user_steps
  GROUP BY event_month
)
SELECT
  event_month,
  view_users,
  cart_users,
  purchase_users,
  -- NULLIF turns a zero denominator into NULL instead of a division error.
  cart_users*1.0 / NULLIF(view_users,0) AS view_to_cart_rate,
  purchase_users*1.0 / NULLIF(cart_users,0) AS cart_to_purchase_rate,
  view_purchase_users*1.0 / NULLIF(view_users,0) AS view_to_purchase_rate
FROM monthly
ORDER BY event_month;
""").df()

funnel_by_month


In [None]:
# Tracking quality check: per month, how many purchasing users have no cart
# event recorded in that same month, and what share of all purchasers they are.
purchase_without_cart_sql = """
WITH user_steps AS (
  SELECT
    event_month,
    user_id,
    MAX(CASE WHEN event_type='cart' THEN 1 ELSE 0 END) AS did_cart,
    MAX(CASE WHEN event_type='purchase' THEN 1 ELSE 0 END) AS did_purchase
  FROM events_enriched
  GROUP BY event_month, user_id
)
SELECT
  event_month,
  SUM(CASE WHEN did_purchase=1 THEN 1 ELSE 0 END) AS purchase_users_any,
  SUM(CASE WHEN did_purchase=1 AND did_cart=0 THEN 1 ELSE 0 END) AS purchase_users_without_cart,
  SUM(CASE WHEN did_purchase=1 AND did_cart=0 THEN 1 ELSE 0 END)*1.0 / NULLIF(SUM(CASE WHEN did_purchase=1 THEN 1 ELSE 0 END),0) AS share_purchase_without_cart
FROM user_steps
GROUP BY event_month
ORDER BY event_month;
"""

# Run the query and materialise the result as a pandas DataFrame.
query_result = con.execute(purchase_without_cart_sql)
purchase_without_cart_by_month = query_result.df()

purchase_without_cart_by_month


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Output directory for generated figures; created (with any missing parents)
# if absent, and left alone if it already exists.
ASSETS = Path("assets")
ASSETS.mkdir(exist_ok=True, parents=True)

# Bar colours for the three funnel steps (hex RGB).
C_VIEW, C_CART, C_PURCHASE = "#1D4ED8", "#94A3B8", "#10B981"

# Neutral colours: grid lines and value-label text.
GRID = "#E5E7EB"
MUTED = "#475569"


def fmt_pct(x):
    """Format a fraction as a percentage string with one decimal place.

    Example: 0.1234 -> "12.3%".
    """
    percent = x * 100
    return "{:.1f}%".format(percent)


def fmt_m(x):
    """Format a number in millions with two decimals and an "M" suffix.

    Example: 1_500_000 -> "1.50M".
    """
    millions = x / 1_000_000
    return "{:.2f}M".format(millions)


# --- Funnel figure: conversion rates (left panel) and user volumes (right) ---

# Work on a copy so the query result held in `funnel_by_month` is not mutated.
df = funnel_by_month.copy()
df['event_month'] = df['event_month'].astype(str)

# (column, bar colour, legend label) per panel, in left-to-right bar order.
RATE_SERIES = [
    ('view_to_cart_rate', C_VIEW, 'View -> Cart'),
    ('cart_to_purchase_rate', C_CART, 'Cart -> Purchase'),
    ('view_to_purchase_rate', C_PURCHASE, 'View -> Purchase'),
]
VOLUME_SERIES = [
    ('view_users', C_VIEW, 'View users'),
    ('cart_users', C_CART, 'Cart users'),
    ('purchase_users', C_PURCHASE, 'Purchase users'),
]


def _bar_offsets(n_series, width):
    """Return x-offsets that centre `n_series` adjacent bars of `width` on a tick."""
    return [(k - (n_series - 1) / 2) * width for k in range(n_series)]


def _draw_grouped_bars(panel, frame, series, positions, width, ylabel, title):
    """Draw one grouped-bar panel and apply the shared styling.

    panel:     matplotlib Axes to draw on.
    frame:     DataFrame with an 'event_month' column plus every column named in `series`.
    series:    list of (column, colour, legend label), in left-to-right bar order.
    positions: x position of the centre of each month's bar group.
    width:     width of a single bar.
    ylabel:    y-axis label.
    title:     panel title.
    """
    for offset, (column, color, label) in zip(_bar_offsets(len(series), width), series):
        panel.bar(positions + offset, frame[column], width=width, color=color, label=label)
    panel.set_xticks(positions)
    panel.set_xticklabels(frame['event_month'])
    panel.set_ylabel(ylabel)
    panel.set_title(title)
    panel.grid(axis='y', color=GRID, linewidth=1, alpha=0.7)
    panel.spines['top'].set_visible(False)
    panel.spines['right'].set_visible(False)
    # Called after the bars exist so the legend picks up their labels.
    panel.legend(frameon=False, loc='upper right')


def _label_bars(panel, frame, series, positions, width, label_y, fmt):
    """Write a formatted value above each bar, skipping missing values.

    label_y: callable mapping a bar height to the y coordinate of its label.
    fmt:     callable turning a bar height into the label text.
    Other parameters are as for `_draw_grouped_bars`.
    """
    for offset, (column, _color, _label) in zip(_bar_offsets(len(series), width), series):
        # Positional iteration: does not depend on the frame's index labels.
        heights = frame[column].to_numpy(dtype=float)
        for pos, height in zip(positions, heights):
            if not np.isfinite(height):
                # Missing value (e.g. a rate whose denominator was zero):
                # there is no bar, so draw no "nan" label either.
                continue
            panel.text(pos + offset, label_y(height), fmt(height),
                       ha='center', va='bottom', fontsize=9, color=MUTED)


fig, ax = plt.subplots(1, 2, figsize=(14, 5), dpi=200)
x = np.arange(len(df))
w = 0.22

# Conversion chart
ax0 = ax[0]
_draw_grouped_bars(ax0, df, RATE_SERIES, x, w,
                   ylabel='Rate', title='Funnel conversion rates by month')

# Tallest rate plus 28% headroom, applied as the y-limit so value labels and
# the legend have room above the bars. Skipped when there is no positive,
# finite rate (empty or all-missing data), where a y-limit would be invalid.
max_rate = float(df[[column for column, _, _ in RATE_SERIES]].max().max()) * 1.28
if np.isfinite(max_rate) and max_rate > 0:
    ax0.set_ylim(0, max_rate)
rate_label_pad = max_rate * 0.02  # fixed gap between bar top and label
_label_bars(ax0, df, RATE_SERIES, x, w,
            label_y=lambda height: height + rate_label_pad, fmt=fmt_pct)

# Volume chart
ax1 = ax[1]
_draw_grouped_bars(ax1, df, VOLUME_SERIES, x, w,
                   ylabel='Users', title='Funnel users by month')
# Labels sit 1% above each bar's own height.
_label_bars(ax1, df, VOLUME_SERIES, x, w,
            label_y=lambda height: height * 1.01, fmt=fmt_m)

# Save through the figure object and close that specific figure, rather than
# relying on pyplot's notion of the "current" figure.
out = ASSETS / 'funnel.png'
fig.tight_layout()
fig.savefig(out, bbox_inches='tight')
plt.close(fig)
print('Saved:', out)
