# Capstone Add-On — Loads, Scheduling, Monitoring, Params, Medallion

Hands-on tasks that tie advanced topics into the Real Estate Analytics capstone.


## Environment Setup

In [None]:
# --- Imports: standard library first, then third-party ---
import sys
import sqlite3
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show the interpreter version so the run can be reproduced.
print(sys.version)

import seaborn as sns

sns.set_theme()

# --- Database connection used by the cells that follow ---
# The path is relative; the resolved absolute location is printed below.
DB_PATH = Path('course.db')
conn = sqlite3.connect(DB_PATH)
conn.execute('PRAGMA foreign_keys=ON;')
print('SQLite ready at', DB_PATH.resolve())
def run_sql(q, params=None):
    """Run ``q`` on the notebook's shared ``conn``, display the result and return it.

    Args:
        q: SQL text handed to ``pd.read_sql_query``.
        params: Optional bind parameters. Any falsy value (``None`` or an
            empty container) is replaced by an empty dict.

    Returns:
        The query result as a DataFrame; the same frame is rendered with
        ``display`` before it is returned.
    """
    bound = params if params else {}
    result = pd.read_sql_query(q, conn, params=bound)
    display(result)
    return result

### Apply Full vs Incremental Loads to Capstone Tables

In [None]:
# Create the capstone tables if they are missing. `sales` carries an
# `updated_at` column, which the incremental load below uses as its watermark.
conn.executescript('''
CREATE TABLE IF NOT EXISTS locations(location_id INTEGER PRIMARY KEY, city TEXT, state TEXT);
CREATE TABLE IF NOT EXISTS homes(home_id INTEGER PRIMARY KEY, location_id INTEGER, bedrooms INTEGER, bathrooms REAL, sqft INTEGER, home_type TEXT, year_built INTEGER);
CREATE TABLE IF NOT EXISTS sales(
  sale_id INTEGER PRIMARY KEY, home_id INTEGER, agent_id INTEGER, sale_date TEXT, sale_price REAL,
  updated_at TEXT
);
''')
conn.commit()

# Seed minimal if empty
rows = pd.read_sql_query("SELECT COUNT(*) AS n FROM sales", conn).iloc[0]['n']
if rows == 0:
    conn.executescript('''
    INSERT INTO locations VALUES (1,'Austin','TX'), (2,'Boston','MA');
    INSERT INTO homes VALUES (1,1,3,2.0,1600,'SingleFamily',2002),(2,2,2,1.0,900,'Condo',1995);
    INSERT INTO sales VALUES (1,1,1,'2024-05-01',420000,'2024-10-01'), (2,2,2,'2024-06-01',390000,'2024-10-01');
    ''')
    conn.commit()

# Reset the demo increment. course.db persists between kernel restarts, so
# without this the INSERT of sale 3 below fails with a PRIMARY KEY violation
# on every run after the first, and the full load would already contain it.
conn.execute("DELETE FROM sales WHERE sale_id = 3")
conn.commit()

# Full load: rebuild the warehouse table from scratch.
# The schema is declared explicitly because CREATE TABLE ... AS SELECT yields a
# table with no PRIMARY KEY; without a key, INSERT OR REPLACE never replaces
# and a changed sale would be appended as a duplicate sale_id.
conn.executescript('''
DROP TABLE IF EXISTS dw_sales;
CREATE TABLE dw_sales(
  sale_id INTEGER PRIMARY KEY, home_id INTEGER, agent_id INTEGER, sale_date TEXT, sale_price REAL,
  updated_at TEXT
);
INSERT INTO dw_sales (sale_id, home_id, agent_id, sale_date, sale_price, updated_at)
  SELECT sale_id, home_id, agent_id, sale_date, sale_price, updated_at FROM sales;
''')
conn.commit()
display(pd.read_sql_query("SELECT * FROM dw_sales ORDER BY sale_id", conn))

# Simulate increment: a new sale lands in the source after the full load.
conn.execute("INSERT INTO sales VALUES (3,1,1,'2024-07-01',430000,'2024-10-06')")
conn.commit()

# Incremental load: the watermark is the newest updated_at already in the
# warehouse; only source rows strictly newer than it are pulled.
wm = pd.read_sql_query("SELECT MAX(updated_at) AS wm FROM dw_sales", conn).iloc[0]['wm']
changes = pd.read_sql_query("SELECT * FROM sales WHERE updated_at > ?", conn, params=[wm])
display(changes)

# Upsert keyed on dw_sales.sale_id (PRIMARY KEY): new ids insert, existing ids replace.
conn.executemany("INSERT OR REPLACE INTO dw_sales VALUES (?,?,?,?,?,?)", changes.values.tolist())
conn.commit()
print("DW after incremental:")
display(pd.read_sql_query("SELECT * FROM dw_sales ORDER BY sale_id", conn))

### Scheduling Exercise: Outline a daily incremental job for `dw_sales`

In [None]:
# Crontab entry for the daily incremental job. Fields are
# minute hour day-of-month month day-of-week, so "10 2 * * *" is 02:10 every day.
# stdout and stderr are both appended to the log file.
CRON_SNIPPET = """
# cron (2:10 AM daily)
10 2 * * * python3 /opt/pipelines/capstone_incremental.py >> /var/log/capstone_incremental.log 2>&1
"""
# The original call ended with an extra ")" which made the cell a SyntaxError.
print(CRON_SNIPPET)

### Monitoring Exercise: Validate that the `dw_sales` row count and max `updated_at` never decrease

In [None]:
# Snapshot the warehouse metrics before the load: row count and newest updated_at.
before = pd.read_sql_query("SELECT COUNT(*) AS n, MAX(updated_at) AS mx FROM dw_sales", conn).iloc[0]

# simulate another change: an existing sale is re-priced in the source
conn.execute("UPDATE sales SET sale_price = sale_price + 5000, updated_at = '2024-10-07' WHERE sale_id=3")
conn.commit()

# Pull only source rows newer than the warehouse watermark.
wm = pd.read_sql_query("SELECT MAX(updated_at) AS wm FROM dw_sales", conn).iloc[0]['wm']
delta = pd.read_sql_query("SELECT * FROM sales WHERE updated_at > ?", conn, params=[wm])

# Upsert by explicit delete-then-insert on sale_id. INSERT OR REPLACE only
# replaces when a PRIMARY KEY/UNIQUE constraint conflicts, so on a table
# without such a constraint it would append the changed sale as a duplicate
# row. Deleting by key first gives the same result whether or not dw_sales
# declares a key. `with conn` commits both steps together (or rolls back).
with conn:
    conn.executemany(
        "DELETE FROM dw_sales WHERE sale_id = ?",
        [(sale_id,) for sale_id in delta['sale_id'].tolist()],
    )
    conn.executemany("INSERT INTO dw_sales VALUES (?,?,?,?,?,?)", delta.values.tolist())

after = pd.read_sql_query("SELECT COUNT(*) AS n, MAX(updated_at) AS mx FROM dw_sales", conn).iloc[0]
print("Before:", dict(before))
print("After:", dict(after))

# An incremental load must never shrink the table or move the watermark back.
if after['n'] < before['n'] or after['mx'] < before['mx']:
    print("ALERT: potential regression detected")
else:
    print("OK: metrics non-decreasing")

### Parameterization Exercise: Use a config for table and watermark column

In [None]:
# Which warehouse table to inspect and which of its columns acts as the watermark.
CONFIG = dict(table="dw_sales", wm_col="updated_at")
def watermark(conn, table, col):
    """Return ``MAX(col)`` of ``table`` as read through ``conn``.

    ``table`` and ``col`` are interpolated into the SQL text as identifiers
    (they cannot be bound as parameters), so they must come from trusted
    configuration rather than user input.

    Args:
        conn: Open database connection accepted by ``pd.read_sql_query``.
        table: Name of the table to inspect.
        col: Name of the column whose maximum is the watermark.

    Returns:
        The single ``wm`` value from the one-row query result.
    """
    query = f"SELECT MAX({col}) AS wm FROM {table}"
    result = pd.read_sql_query(query, conn)
    return result["wm"].iloc[0]
# Look up the table and watermark column in CONFIG and print the current watermark.
print(
    "Watermark:",
    watermark(conn, table=CONFIG["table"], col=CONFIG["wm_col"]),
)

### Medallion Exercise: Bronze→Silver→Gold for `sales`

In [None]:
# Rebuild the three medallion layers from the current contents of `sales`:
#   bronze - raw copy of the source table
#   silver - bronze rows that have a sale_price
#   gold   - per-month sale count and average price
# Each layer is dropped first, so re-running the cell starts from a clean slate.
conn.executescript('''
DROP TABLE IF EXISTS bronze_sales;
DROP TABLE IF EXISTS silver_sales;
DROP TABLE IF EXISTS gold_sales_kpis;
CREATE TABLE bronze_sales AS SELECT * FROM sales;
CREATE TABLE silver_sales AS
  SELECT sale_id, home_id, agent_id, sale_date, sale_price, updated_at
  FROM bronze_sales WHERE sale_price IS NOT NULL;
CREATE TABLE gold_sales_kpis AS
  SELECT strftime('%Y-%m', sale_date) AS yyyymm, COUNT(*) AS n_sales, ROUND(AVG(sale_price),2) AS avg_price
  FROM silver_sales GROUP BY strftime('%Y-%m', sale_date);
''')
conn.commit()

# Row order of a SELECT without ORDER BY is unspecified; sort by month so the
# KPI table is displayed chronologically on every run.
display(pd.read_sql_query("SELECT * FROM gold_sales_kpis ORDER BY yyyymm", conn))