In [4]:
# Make sure Python can import from the repo root (which contains `src/`)
import sys, os
from pathlib import Path

# Add the repo root (parent of this notebooks folder) to sys.path
ROOT = Path.cwd().resolve()
if (ROOT / "src").exists():
    sys.path.insert(0, str(ROOT))              # notebooks are at repo root
elif (ROOT.parent / "src").exists():
    sys.path.insert(0, str(ROOT.parent))       # notebooks/ is under the root
elif (ROOT.parent.parent / "src").exists():
    sys.path.insert(0, str(ROOT.parent.parent))# olist/notebooks/ style

# now this import will work
from src.db import get_engine
engine = get_engine()


In [5]:
import pandas as pd

# 1) Total customers (unique people)
total_customers = pd.read_sql("""
SELECT COUNT(DISTINCT customer_unique_id) AS total_customers
FROM olist.dim_customer;
""", engine)
total_customers


Unnamed: 0,total_customers
0,96096


In [6]:
# 2) Top 20 customers by # of orders
top20 = pd.read_sql("""
SELECT
  dc.customer_unique_id,
  COUNT(DISTINCT foi.order_id) AS orders
FROM olist.fact_order_item AS foi
JOIN olist.dim_customer AS dc
  ON dc.customer_key = foi.customer_key
GROUP BY dc.customer_unique_id
ORDER BY orders DESC, dc.customer_unique_id
LIMIT 20;
""", engine)
top20


Unnamed: 0,customer_unique_id,orders
0,8d50f5eadf50201ccdcedfb9e2ac8455,16
1,3e43e6105506432c953e165fb2acf44c,9
2,1b6c7548a2a1f9037c1fd3ddfed95f33,7
3,6469f99c1f9dfae7733b25662e7f1782,7
4,ca77025e7201e3b30c44b472ff346268,7
5,12f5d6e1cbf93dafd9dcc19095df0b3d,6
6,47c1a3033b8b77b3ab6e109eb4d5fdf3,6
7,63cfc61cee11cbe306bff5857d00bfe4,6
8,dc813062e0fc23409cd255f7f53c7074,6
9,f0e310a6839dce9de1638e0fe5ab282a,6


In [7]:
# 3) Average frequency among returning customers (orders > 1)
avg_returning = pd.read_sql("""
WITH per_customer AS (
  SELECT
    dc.customer_unique_id,
    COUNT(DISTINCT foi.order_id) AS orders
  FROM olist.fact_order_item AS foi
  JOIN olist.dim_customer AS dc
    ON dc.customer_key = foi.customer_key
  GROUP BY dc.customer_unique_id
)
SELECT
  AVG(orders)::numeric(10,2)       AS avg_orders_returning,
  COUNT(*)                         AS returning_customers,
  SUM(CASE WHEN orders = 1 THEN 1 ELSE 0 END) AS one_time_buyers
FROM per_customer
WHERE orders > 1;
""", engine)
avg_returning


Unnamed: 0,avg_orders_returning,returning_customers,one_time_buyers
0,2.11,2913,0


In [8]:
repeat_rate = pd.read_sql("""
WITH per_customer AS (
  SELECT
    dc.customer_unique_id,
    COUNT(DISTINCT foi.order_id) AS orders
  FROM olist.fact_order_item AS foi
  JOIN olist.dim_customer AS dc
    ON dc.customer_key = foi.customer_key
  GROUP BY dc.customer_unique_id
)
SELECT
  ROUND(AVG(CASE WHEN orders > 1 THEN 1 ELSE 0 END)::numeric, 4) AS repeat_rate
FROM per_customer;
""", engine)
repeat_rate


Unnamed: 0,repeat_rate
0,0.0305
