In [12]:
import duckdb
import pandas as pd
import os
from pathlib import Path

# --- Locate the repository root -------------------------------------------
# Walk up from the current working directory until a directory containing
# README.md is found. All data paths below are relative to that directory.
p = Path.cwd()
while p != p.parent and not (p / "README.md").exists():
    p = p.parent

# If the walk reached the filesystem root without finding README.md, stop here
# instead of silently changing into the root directory (the relative parquet
# paths below would then fail with a much less obvious error).
if not (p / "README.md").exists():
    raise FileNotFoundError(
        f"Could not find README.md in {Path.cwd()} or any parent directory; "
        "run this notebook from inside the repository."
    )

os.chdir(p)
print("Repo root set to:", Path.cwd())

# --- DuckDB session ---------------------------------------------------------
# No database path is given, so DuckDB uses its default (in-memory) database.
con = duckdb.connect()
# Session settings: temp-file directory, thread count and memory cap.
con.execute("SET temp_directory='data/processed/tmp';")
con.execute("PRAGMA threads=4;")
con.execute("PRAGMA memory_limit='1GB';")

# `events`: the October and November 2019 parquet files exposed as one view.
con.execute("""
CREATE OR REPLACE VIEW events AS
SELECT * FROM read_parquet([
  'data/processed/events_2019_oct.parquet',
  'data/processed/events_2019_nov.parquet'
]);
""")

# `events_enriched`: every column of `events` plus calendar fields derived from
# event_time (date, '%Y-%m' month label, '%Y-%W' year-week label).
con.execute("""
CREATE OR REPLACE VIEW events_enriched AS
SELECT
  *,
  DATE(event_time) AS event_date,
  STRFTIME(event_time, '%Y-%m') AS event_month,
  STRFTIME(event_time, '%Y-%W') AS event_week
FROM events;
""")


Repo root set to: c:\Users\victo\OneDrive\Área de Trabalho\Data Analyst\growth-funnel-analytics


<_duckdb.DuckDBPyConnection at 0x1df449c69b0>

In [13]:
# Sanity check: number of rows in events_enriched per event_month
# (event_month is the '%Y-%m' label derived from event_time in that view).
# The result is returned as a pandas DataFrame and displayed as the cell output.
con.execute("""
SELECT event_month, COUNT(*) AS events
FROM events_enriched
GROUP BY event_month
ORDER BY event_month
""").df()


Unnamed: 0,event_month,events
0,2019-10,42448764
1,2019-11,67501979


In [21]:
# Funnel by month (SEQUENTIAL) — nested step-to-step conversion.
#
# user_steps  : one row per (event_month, user_id) with 0/1 flags telling
#               whether the user had at least one view / cart / purchase event
#               in that month.
# user_funnel : nested funnel flags. Each step requires every earlier step, so
#               purchase_users ⊆ cart_users ⊆ view_users and every step rate
#               stays within [0, 1].
#
# Fix vs. the previous version: purchase_users (and the numerator of
# cart_to_purchase_rate) now also requires did_view=1. Before, the numerator
# counted "cart AND purchase" users while the denominator counted
# "view AND cart" users, so the numerator was not a subset of the denominator.
#
# view_to_purchase_rate is intentionally NOT restricted to users with a cart
# event: it is the share of viewers with any purchase in the month.
#
# Caveat: the flags only record that an event type occurred in the month; the
# query does not check that the events happened in view -> cart -> purchase
# order in time.
funnel_by_month = con.execute("""
WITH user_steps AS (
  SELECT
    event_month,
    user_id,
    MAX(CASE WHEN event_type='view' THEN 1 ELSE 0 END) AS did_view,
    MAX(CASE WHEN event_type='cart' THEN 1 ELSE 0 END) AS did_cart,
    MAX(CASE WHEN event_type='purchase' THEN 1 ELSE 0 END) AS did_purchase
  FROM events_enriched
  GROUP BY event_month, user_id
),
user_funnel AS (
  SELECT
    event_month,
    did_view AS reached_view,
    CASE WHEN did_view=1 AND did_cart=1 THEN 1 ELSE 0 END AS reached_cart,
    CASE WHEN did_view=1 AND did_cart=1 AND did_purchase=1 THEN 1 ELSE 0 END AS reached_purchase,
    CASE WHEN did_view=1 AND did_purchase=1 THEN 1 ELSE 0 END AS viewed_and_purchased
  FROM user_steps
)
SELECT
  event_month,
  COUNT(*) AS total_users,

  -- Step counts (sequential funnel: each step is a subset of the previous one)
  SUM(reached_view) AS view_users,
  SUM(reached_cart) AS cart_users,
  SUM(reached_purchase) AS purchase_users,

  -- Sequential conversion rates (NULLIF avoids division by zero)
  SUM(reached_cart)*1.0 / NULLIF(SUM(reached_view),0) AS view_to_cart_rate,
  SUM(reached_purchase)*1.0 / NULLIF(SUM(reached_cart),0) AS cart_to_purchase_rate,

  -- Overall conversion of viewers, with or without a tracked cart event
  SUM(viewed_and_purchased)*1.0 / NULLIF(SUM(reached_view),0) AS view_to_purchase_rate

FROM user_funnel
GROUP BY event_month
ORDER BY event_month;
""").df()

funnel_by_month


Unnamed: 0,event_month,total_users,view_users,cart_users,purchase_users,view_to_cart_rate,cart_to_purchase_rate,view_to_purchase_rate
0,2019-10,3022290,3022130.0,336996.0,202777.0,0.111509,0.601719,0.114838
1,2019-11,3696117,3695598.0,825970.0,400047.0,0.223501,0.484336,0.119417


In [22]:
# Tracking gap / direct purchase: users who purchased without a cart event.
#
# user_steps builds one row per (event_month, user_id) with 0/1 flags for
# "had at least one cart event" and "had at least one purchase event".
# The outer query then reports, per month:
#   purchase_users_any           - users with any purchase event
#   purchase_users_without_cart  - purchasers with no cart event in that month
#   share_purchase_without_cart  - ratio of the two (NULL when there are no
#                                  purchasers, because of NULLIF)
purchase_without_cart_by_month = con.execute("""
WITH user_steps AS (
  SELECT
    event_month,
    user_id,
    MAX(CASE WHEN event_type='cart' THEN 1 ELSE 0 END) AS did_cart,
    MAX(CASE WHEN event_type='purchase' THEN 1 ELSE 0 END) AS did_purchase
  FROM events_enriched
  GROUP BY event_month, user_id
)
SELECT
  event_month,
  SUM(CASE WHEN did_purchase=1 THEN 1 ELSE 0 END) AS purchase_users_any,
  SUM(CASE WHEN did_purchase=1 AND did_cart=0 THEN 1 ELSE 0 END) AS purchase_users_without_cart,
  SUM(CASE WHEN did_purchase=1 AND did_cart=0 THEN 1 ELSE 0 END)*1.0 / NULLIF(SUM(CASE WHEN did_purchase=1 THEN 1 ELSE 0 END),0) AS share_purchase_without_cart
FROM user_steps
GROUP BY event_month
ORDER BY event_month;
""").df()

# Display the result as the cell output.
purchase_without_cart_by_month


Unnamed: 0,event_month,purchase_users_any,purchase_users_without_cart,share_purchase_without_cart
0,2019-10,347118.0,144341.0,0.415827
1,2019-11,441638.0,41591.0,0.094174


In [15]:
# Funnel by month and price segment.
#
# Segments (rows with NULL price are excluded):
#   low  : price < 20
#   mid  : 20 <= price <= 100   (BETWEEN is inclusive on both ends)
#   high : everything else, i.e. price > 100
#
# `base` has one row per (event_month, user_id, price_segment) with 0/1 flags
# for view / cart / purchase, so a user active in several segments is counted
# once in each of them.
#
# NOTE: unlike funnel_by_month, these rates are NOT sequential. carters and
# buyers are counted independently of did_view, so buyers can exceed carters
# and a "buyer" need not have a view event in the same segment.
price_funnel = con.execute("""
WITH base AS (
  SELECT
    event_month,
    user_id,
    CASE 
      WHEN price < 20 THEN 'low'
      WHEN price BETWEEN 20 AND 100 THEN 'mid'
      ELSE 'high'
    END AS price_segment,
    MAX(CASE WHEN event_type='view' THEN 1 ELSE 0 END) AS did_view,
    MAX(CASE WHEN event_type='cart' THEN 1 ELSE 0 END) AS did_cart,
    MAX(CASE WHEN event_type='purchase' THEN 1 ELSE 0 END) AS did_purchase
  FROM events_enriched
  WHERE price IS NOT NULL
  GROUP BY event_month, user_id, price_segment
)
SELECT
  event_month,
  price_segment,
  SUM(did_view) AS viewers,
  SUM(did_cart) AS carters,
  SUM(did_purchase) AS buyers,
  ROUND(SUM(did_cart)*1.0 / NULLIF(SUM(did_view),0), 4) AS view_to_cart_rate,
  ROUND(SUM(did_purchase)*1.0 / NULLIF(SUM(did_view),0), 4) AS view_to_purchase_rate
FROM base
GROUP BY event_month, price_segment
ORDER BY event_month, price_segment;
""").df()

# Display the result as the cell output.
price_funnel


Unnamed: 0,event_month,price_segment,viewers,carters,buyers,view_to_cart_rate,view_to_purchase_rate
0,2019-10,high,2510760.0,273284.0,252549.0,0.1088,0.1006
1,2019-10,low,527919.0,10841.0,19782.0,0.0205,0.0375
2,2019-10,mid,1489304.0,76936.0,115603.0,0.0517,0.0776
3,2019-11,high,3042200.0,617107.0,323579.0,0.2028,0.1064
4,2019-11,low,728216.0,62066.0,23180.0,0.0852,0.0318
5,2019-11,mid,1918554.0,294823.0,151482.0,0.1537,0.079


In [16]:
# Funnel by month for the 10 most-viewed categories.
#
# base           : one row per (event_month, user_id, category_code) with 0/1
#                  flags for view / cart / purchase; rows with NULL
#                  category_code are excluded.
# top_categories : the 10 category codes with the highest total of did_view
#                  flags across ALL months in `base` (not per month).
# The final query keeps only those categories and reports per-month counts and
# rates, rounded to 4 decimals, ordered by month and then by viewers descending.
#
# NOTE: as in price_funnel, carters and buyers are counted independently of
# did_view, so these are not sequential funnel rates.
category_funnel = con.execute("""
WITH base AS (
  SELECT
    event_month,
    user_id,
    category_code,
    MAX(CASE WHEN event_type='view' THEN 1 ELSE 0 END) AS did_view,
    MAX(CASE WHEN event_type='cart' THEN 1 ELSE 0 END) AS did_cart,
    MAX(CASE WHEN event_type='purchase' THEN 1 ELSE 0 END) AS did_purchase
  FROM events_enriched
  WHERE category_code IS NOT NULL
  GROUP BY event_month, user_id, category_code
),
top_categories AS (
  SELECT category_code
  FROM base
  GROUP BY category_code
  ORDER BY SUM(did_view) DESC
  LIMIT 10
)
SELECT
  b.event_month,
  b.category_code,
  SUM(did_view) AS viewers,
  SUM(did_cart) AS carters,
  SUM(did_purchase) AS buyers,
  ROUND(SUM(did_cart)*1.0 / NULLIF(SUM(did_view),0), 4) AS view_to_cart_rate,
  ROUND(SUM(did_purchase)*1.0 / NULLIF(SUM(did_view),0), 4) AS view_to_purchase_rate
FROM base b
JOIN top_categories t ON b.category_code = t.category_code
GROUP BY b.event_month, b.category_code
ORDER BY b.event_month, viewers DESC;
""").df()

# Display the result as the cell output.
category_funnel


Unnamed: 0,event_month,category_code,viewers,carters,buyers,view_to_cart_rate,view_to_purchase_rate
0,2019-10,electronics.smartphone,1300070.0,198347.0,160437.0,0.1526,0.1234
1,2019-10,electronics.audio.headphone,213948.0,22482.0,18826.0,0.1051,0.088
2,2019-10,electronics.clocks,211922.0,9481.0,10746.0,0.0447,0.0507
3,2019-10,electronics.video.tv,170044.0,16672.0,13476.0,0.098,0.0793
4,2019-10,appliances.kitchen.washer,132028.0,9948.0,10480.0,0.0753,0.0794
5,2019-10,appliances.kitchen.refrigerators,131602.0,6477.0,7656.0,0.0492,0.0582
6,2019-10,computers.notebook,129207.0,7474.0,9185.0,0.0578,0.0711
7,2019-10,apparel.shoes,126760.0,0.0,2757.0,0.0,0.0217
8,2019-10,appliances.environment.vacuum,119925.0,8527.0,9191.0,0.0711,0.0766
9,2019-10,apparel.shoes.keds,91620.0,0.0,1951.0,0.0,0.0213


In [None]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# ---------- output ----------
# The setup cell changes the working directory to the repository root, so the
# assets folder must be resolved relative to the root. The previous value
# "../docs/assets" only made sense with the working directory at /notebooks and
# would otherwise write outside the repository.
ASSETS = Path("docs/assets")
ASSETS.mkdir(parents=True, exist_ok=True)

# ---------- palette (clean / executive) ----------
C_VIEW = "#1D4ED8"      # blue
C_CART = "#94A3B8"      # slate
C_PURCHASE = "#10B981"  # green
GRID = "#E5E7EB"
TEXT = "#0F172A"
MUTED = "#475569"


def fmt_pct(x):
    """Format a fraction as a percentage with one decimal (0.1234 -> '12.3%')."""
    return f"{x*100:.1f}%"


def fmt_m(x):
    """Format a count in millions with two decimals (3_022_130 -> '3.02M')."""
    return f"{x/1_000_000:.2f}M"


def mom(a, b):
    """Relative change from `a` to `b`; NaN when the baseline `a` is zero."""
    return (b-a)/a if a else np.nan


def _style_axis(axis):
    """Apply the shared look: horizontal grid behind the bars, no top/right spine."""
    axis.grid(axis="y", color=GRID, linewidth=1, alpha=0.7)
    axis.set_axisbelow(True)
    axis.spines["top"].set_visible(False)
    axis.spines["right"].set_visible(False)


def _muted_legend(axis):
    """Add a frameless legend in the upper-right corner with muted text."""
    legend = axis.legend(frameon=False, loc="upper right")
    for label in legend.get_texts():
        label.set_color(MUTED)
    return legend


# ---------- data ----------
# Work on a copy with a clean 0..n-1 index so the positional lookups below
# (df.loc[i, ...]) do not depend on the index of funnel_by_month.
df = funnel_by_month.copy().reset_index(drop=True)
df["event_month"] = df["event_month"].astype(str)

rate_cols = ["view_to_cart_rate", "cart_to_purchase_rate", "view_to_purchase_rate"]
for c in rate_cols:
    # Infinite or missing rates are plotted as 0.
    df[c] = df[c].replace([np.inf, -np.inf], np.nan).fillna(0)

# The month-over-month headline compares the first two rows (Oct vs Nov).
if len(df) < 2:
    raise ValueError("funnel_by_month needs at least two months to compute month-over-month changes")

vc_mom = mom(df.loc[0, "view_to_cart_rate"], df.loc[1, "view_to_cart_rate"])
cp_mom = mom(df.loc[0, "cart_to_purchase_rate"], df.loc[1, "cart_to_purchase_rate"])
vp_mom = mom(df.loc[0, "view_to_purchase_rate"], df.loc[1, "view_to_purchase_rate"])

# ---------- style ----------
plt.rcParams.update({
    "font.family": "DejaVu Sans",
    "axes.edgecolor": GRID,
    "axes.labelcolor": MUTED,
    "xtick.color": MUTED,
    "ytick.color": MUTED,
    "text.color": TEXT
})

fig, ax = plt.subplots(1, 2, figsize=(14.5, 5.2), dpi=200)
fig.patch.set_facecolor("white")

x = np.arange(len(df))  # one bar group per month
w = 0.22                # bar width; also the offset between bars in a group

# ===== left: conversion rates =====
ax0 = ax[0]
ax0.set_title("Conversion (Sequential)", fontsize=13, fontweight="semibold", pad=12)

ax0.bar(x - w, df["view_to_cart_rate"], width=w, color=C_VIEW, label="View → Cart")
ax0.bar(x,     df["cart_to_purchase_rate"], width=w, color=C_CART, label="Cart → Purchase")
ax0.bar(x + w, df["view_to_purchase_rate"], width=w, color=C_PURCHASE, label="View → Purchase")

ax0.set_xticks(x)
ax0.set_xticklabels(df["event_month"])
ax0.set_ylabel("Rate")

# Leave ~28% headroom above the tallest bar for value labels and the legend.
ymax = float(df[rate_cols].max().max()) * 1.28
ax0.set_ylim(0, ymax if ymax > 0 else 1)

_style_axis(ax0)

# Value label above every rate bar.
for i in range(len(df)):
    vals = [df.loc[i, c] for c in rate_cols]
    xpos = [x[i]-w, x[i], x[i]+w]
    for px, v in zip(xpos, vals):
        ax0.text(px, v + ymax*0.02, fmt_pct(v), ha="center", va="bottom", fontsize=9, color=MUTED)

leg0 = _muted_legend(ax0)

# ===== right: volumes =====
ax1 = ax[1]
ax1.set_title("Volumes (Users)", fontsize=13, fontweight="semibold", pad=12)

ax1.bar(x - w, df["view_users"], width=w, color=C_VIEW, label="View users")
ax1.bar(x,     df["cart_users"], width=w, color=C_CART, label="Cart users")
ax1.bar(x + w, df["purchase_users"], width=w, color=C_PURCHASE, label="Purchase users")

ax1.set_xticks(x)
ax1.set_xticklabels(df["event_month"])
ax1.set_ylabel("Users")

_style_axis(ax1)

# Value label (in millions) placed 1% above every volume bar.
for i in range(len(df)):
    for offset, col in ((-w, "view_users"), (0, "cart_users"), (w, "purchase_users")):
        users = df.loc[i, col]
        ax1.text(x[i] + offset, users*1.01, fmt_m(users), ha="center", va="bottom", fontsize=9, color=MUTED)

leg1 = _muted_legend(ax1)

# ===== header =====
headline = "Oct vs Nov 2019 — Funnel (Users + Conversion)"
insight = f"MoM: View→Cart {vc_mom:+.0%} · Cart→Purchase {cp_mom:+.0%} · View→Purchase {vp_mom:+.0%}"

fig.suptitle(headline, fontsize=16, fontweight="bold", y=1.02, color=TEXT)
fig.text(0.5, 0.985, insight, ha="center", va="top", fontsize=11, color=MUTED)

note = "Note: Cart→Purchase is computed only among users who added to cart; some purchases occur without a tracked cart event."
fig.text(0.01, 0.01, note, ha="left", va="bottom", fontsize=9, color=MUTED)

# ---------- save ----------
out = ASSETS / "funnel.png"
fig.tight_layout(rect=[0, 0.03, 1, 0.93])
fig.savefig(out, bbox_inches="tight")
plt.close(fig)  # close this specific figure to release its memory

print("Saved:", out)


Saved: assets/funnel_users_and_conversion.png
