# Week 4 — Data Warehousing & Star Schema (DuckDB)
**Goal:** Build a simple star schema (one fact table and two dimension tables) in DuckDB, run BI queries against it, and validate the fact table with data quality checks.

## 0) Setup

In [None]:
# !pip -q install duckdb pandas
import duckdb, pandas as pd, io
duckdb.__version__

## 1) Staging tables

In [None]:
# Raw source extracts, inlined as CSV text so the cell needs no external files.
orders_csv = """order_id,customer_id,product_id,quantity,amount,order_ts
2001,1,101,1,120.50,2024-06-01
2002,2,103,2,170.00,2024-06-02
2003,3,102,1,89.99,2024-06-02
2004,1,103,3,255.60,2024-06-03
2005,4,101,1,120.50,2024-06-03
"""
customers_csv = """customer_id,customer_name,country,segment
1,Alice,US,Consumer
2,Bob,UK,Corporate
3,Chandra,IN,Small Business
4,Eva,DE,Consumer
"""
products_csv = """product_id,product_name,category,unit_price
101,Widget A,Widgets,120.50
102,Widget B,Widgets,89.99
103,Gadget C,Gadgets,85.80
"""


def _read_staging(csv_text, **read_csv_options):
    """Parse inline CSV text into a pandas DataFrame.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(io.StringIO(csv_text), **read_csv_options)


# Only the orders extract carries a timestamp column that needs date parsing.
stg_orders = _read_staging(orders_csv, parse_dates=["order_ts"])
stg_customers = _read_staging(customers_csv)
stg_products = _read_staging(products_csv)

# Preview the first rows of the orders staging frame.
stg_orders.head()

## 2) Star schema in DuckDB

In [None]:
# Open an in-memory DuckDB database for this notebook session.
con = duckdb.connect(database=':memory:')

# Make each staging DataFrame queryable from SQL under its own name.
for view_name, frame in (
    ('stg_orders', stg_orders),
    ('stg_customers', stg_customers),
    ('stg_products', stg_products),
):
    con.register(view_name, frame)

# Dimension tables: straight column-for-column copies of the staging data.
dim_customer_ddl = """CREATE TABLE dim_customer AS
SELECT customer_id, customer_name, country, segment FROM stg_customers;
"""
dim_product_ddl = """CREATE TABLE dim_product AS
SELECT product_id, product_name, category, unit_price FROM stg_products;
"""

# Fact table: one row per staged order, with the order timestamp cast to a
# DATE and the customer/product ids kept as join keys to the dimensions.
fact_sales_ddl = """CREATE TABLE fact_sales AS
SELECT o.order_id, o.customer_id, o.product_id,
       CAST(o.order_ts AS DATE) AS order_date,
       o.quantity, o.amount
FROM stg_orders o;
"""

for dimension_ddl in (dim_customer_ddl, dim_product_ddl):
    con.execute(dimension_ddl)

con.execute(fact_sales_ddl)

## 3) BI queries

In [None]:
# Revenue per customer country, highest first.
revenue_by_country_sql = """SELECT c.country, ROUND(SUM(f.amount),2) AS revenue
FROM fact_sales f JOIN dim_customer c USING(customer_id)
GROUP BY c.country ORDER BY revenue DESC;
"""

# Revenue per product name, highest first.
revenue_by_product_sql = """SELECT p.product_name, ROUND(SUM(f.amount),2) AS revenue
FROM fact_sales f JOIN dim_product p USING(product_id)
GROUP BY p.product_name ORDER BY revenue DESC;
"""

# Order count and revenue per order date, in chronological order.
daily_sales_sql = """SELECT order_date, COUNT(*) AS orders, ROUND(SUM(amount),2) AS revenue
FROM fact_sales GROUP BY order_date ORDER BY order_date;
"""

# Run each report and materialise the result as a pandas DataFrame.
rev_country = con.execute(revenue_by_country_sql).fetchdf()
top_products = con.execute(revenue_by_product_sql).fetchdf()
daily = con.execute(daily_sales_sql).fetchdf()

# Show all three result frames as the cell output.
rev_country, top_products, daily

## 4) Data quality checks

In [None]:
# Null audit: one row with the number of NULLs in each key/measure column of
# fact_sales. COUNT(*) FILTER (...) is used instead of SUM(CASE ...) because
# SUM over zero rows yields NULL, so an empty fact table would be reported as
# "unknown" rather than as zero nulls; COUNT returns 0 in that case.
nulls = con.execute("""SELECT
  COUNT(*) FILTER (WHERE order_id IS NULL) AS null_order_id,
  COUNT(*) FILTER (WHERE customer_id IS NULL) AS null_customer_id,
  COUNT(*) FILTER (WHERE product_id IS NULL) AS null_product_id,
  COUNT(*) FILTER (WHERE amount IS NULL) AS null_amount
FROM fact_sales;
""").fetchdf()

# Duplicate audit: every order_id that occurs more than once in fact_sales,
# with its occurrence count. An empty result means order_id is unique.
dupes = con.execute("""SELECT order_id, COUNT(*) AS cnt FROM fact_sales
GROUP BY order_id HAVING COUNT(*)>1;
""").fetchdf()

# Show both audit frames as the cell output.
nulls, dupes