In [None]:
from pathlib import Path
import sys

# Probe the current working directory, then each ancestor (nearest first),
# for a notebooks/00_setup entry. The first hit wins.
repo_root = Path.cwd()
setup_dir = None
for candidate in (repo_root, *repo_root.parents):
    probe = candidate / 'notebooks' / '00_setup'
    if probe.exists():
        setup_dir = probe
        break

# Nothing matched anywhere up the tree: fail loudly instead of importing blind.
if setup_dir is None:
    raise RuntimeError('Could not locate notebooks/00_setup from this notebook location.')

# Prepend the setup directory to sys.path exactly once, so re-running this
# cell does not stack duplicate entries.
setup_dir_str = str(setup_dir)
if setup_dir_str not in sys.path:
    sys.path.insert(0, setup_dir_str)

# `bootstrap` is presumably resolved from the directory added above;
# ensure_src_on_path() is assumed to make the project package importable
# (see the `sql_for_analysis` import that follows) - confirm in bootstrap.py.
from bootstrap import ensure_src_on_path
ensure_src_on_path()

from sql_for_analysis.db.connection import get_connection


In [None]:
from sql_for_analysis.db.connection import get_connection
from tabulate import tabulate


### Create Sample Orders Table

```sql
-- Create the orders table
CREATE TABLE orders (
    order_id        INT             PRIMARY KEY,
    customer_id     INT,
    order_date      DATE,
    product_category VARCHAR(50),
    revenue         DECIMAL(10,2),
    region          VARCHAR(50)
);

-- Insert sample data
INSERT INTO orders (order_id, customer_id, order_date, product_category, revenue, region) VALUES
(1,  101, '2024-01-15', 'Electronics', 1200.00, 'East'),
(2,  102, '2024-01-16', 'Electronics',  800.00, 'West'),
(3,  101, '2024-01-17', 'Clothing',     150.00, 'East'),
(4,  103, '2024-01-18', 'Electronics', 2500.00, 'East'),
(5,  102, '2024-01-19', 'Clothing',     200.00, 'West'),
(6,  104, '2024-01-20', 'Electronics',  950.00, 'South'),
(7,  101, '2024-01-21', 'Electronics', 1800.00, 'East'),
(8,  105, '2024-01-22', 'Clothing',     300.00, 'East'),
(9,  103, '2024-01-23', 'Electronics', 1100.00, 'East'),
(10, 102, '2024-01-24', 'Clothing',     175.00, 'West');
```

### Window Function Key Topics
- Ranking data (find top N per category)
- Running totals and moving averages
- Comparing rows (previous/next values)
- Deduplication in ETL pipelines

Example 1: ROW_NUMBER() - Ranking Orders
 - Use Case: Assign a unique sequential number to each customer's orders, in order-date order

In [None]:
# ROW_NUMBER() demo: number each customer's orders chronologically.
# order_id is used as a tie-breaker in both ORDER BY clauses so that two
# orders placed by the same customer on the same date always receive the same
# sequence numbers and are listed in the same order on every run.
sql = """
SELECT
    order_id,
    customer_id,
    order_date,
    revenue,
    -- Assign row number within each customer's orders;
    -- order_id breaks ties between orders sharing an order_date
    ROW_NUMBER() OVER (
        PARTITION BY customer_id
        ORDER BY order_date, order_id
    ) AS order_sequence
FROM orders
ORDER BY customer_id, order_date, order_id;
"""

In [None]:
# Execute the statement held in `sql` and collect the result set.
# Both context managers are entered in one `with`; the cursor is exited
# first, then the connection, exactly as with the nested form.
with get_connection() as connection, connection.cursor() as cursor:
    cursor.execute(sql)
    rows = cursor.fetchall()
    # The first element of each cursor.description entry is the column name.
    headers = [column[0] for column in cursor.description]

# Render the fetched rows as a psql-style text table.
print(tabulate(rows, headers=headers, tablefmt="psql"))


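Example 2: RANK() vs DENSE_RANK() - Top Orders by Revenue
 - Use Case: Rank orders by revenue within each product category
 - Note: in the sample data above, no two orders in the same category share a revenue value, so `rank_with_gaps` and `dense_rank` come out identical; the two functions only differ when there are ties.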

In [None]:
# RANK() vs DENSE_RANK() demo: rank orders by revenue inside each product
# category. Tied revenues share a rank by design; order_id is appended to the
# final ORDER BY only so that tied rows are displayed in a stable order.
sql = """
SELECT
    order_id,
    product_category,
    revenue,
    -- RANK: Leaves gaps after ties (1,2,2,4)
    RANK() OVER (
        PARTITION BY product_category
        ORDER BY revenue DESC
    ) AS rank_with_gaps,
    -- DENSE_RANK: No gaps after ties (1,2,2,3)
    DENSE_RANK() OVER (
        PARTITION BY product_category
        ORDER BY revenue DESC
    ) AS dense_rank
FROM orders
-- order_id keeps rows with equal revenue in a deterministic display order
ORDER BY product_category, revenue DESC, order_id;
"""

In [None]:
# Run the ranking query stored in `sql` and gather its rows and column names.
# A single `with` opens the connection and then a cursor on it; they are
# released in reverse order when the block ends.
with get_connection() as connection, connection.cursor() as cursor:
    cursor.execute(sql)
    rows = cursor.fetchall()
    # Column names are the first field of each cursor.description entry.
    headers = [column[0] for column in cursor.description]

# Print the result set as a psql-style text table.
print(tabulate(rows, headers=headers, tablefmt="psql"))
