In [0]:
%sql

-- Bootstrap: make sure the target catalog exists and select it as the session
-- default, then create the schemas used by the rest of this notebook.
CREATE CATALOG IF NOT EXISTS some_catalog;
USE CATALOG some_catalog;

-- bronze     : copies of the samples.tpch source tables
-- silver     : tables derived from bronze
-- monitoring : the data-quality log table
CREATE SCHEMA IF NOT EXISTS some_catalog.bronze;
CREATE SCHEMA IF NOT EXISTS some_catalog.silver;
CREATE SCHEMA IF NOT EXISTS some_catalog.monitoring;

-- Bronze layer: each table is a full, untransformed copy of the matching
-- samples.tpch table. CREATE OR REPLACE rebuilds the table on every run.

CREATE OR REPLACE TABLE some_catalog.bronze.customer AS
SELECT src.*
FROM samples.tpch.customer AS src;

CREATE OR REPLACE TABLE some_catalog.bronze.orders AS
SELECT src.*
FROM samples.tpch.orders AS src;

CREATE OR REPLACE TABLE some_catalog.bronze.lineitem AS
SELECT src.*
FROM samples.tpch.lineitem AS src;

CREATE OR REPLACE TABLE some_catalog.bronze.supplier AS
SELECT src.*
FROM samples.tpch.supplier AS src;

CREATE OR REPLACE TABLE some_catalog.bronze.part AS
SELECT src.*
FROM samples.tpch.part AS src;

CREATE OR REPLACE TABLE some_catalog.bronze.partsupp AS
SELECT src.*
FROM samples.tpch.partsupp AS src;

CREATE OR REPLACE TABLE some_catalog.bronze.nation AS
SELECT src.*
FROM samples.tpch.nation AS src;

CREATE OR REPLACE TABLE some_catalog.bronze.region AS
SELECT src.*
FROM samples.tpch.region AS src;




-- Row-count reconciliation between the samples.tpch source tables and their
-- bronze copies. Only tables whose counts differ are returned, so an empty
-- result means every bronze table has the same row count as its source.
USE CATALOG some_catalog;

WITH source_counts AS (
  -- One row per source table: (table name, row count).
  SELECT 'customer' AS t, COUNT(*) AS c FROM samples.tpch.customer UNION ALL
  SELECT 'orders',   COUNT(*)   FROM samples.tpch.orders   UNION ALL
  SELECT 'lineitem', COUNT(*)   FROM samples.tpch.lineitem UNION ALL
  SELECT 'supplier', COUNT(*)   FROM samples.tpch.supplier UNION ALL
  SELECT 'part',     COUNT(*)   FROM samples.tpch.part     UNION ALL
  SELECT 'partsupp', COUNT(*)   FROM samples.tpch.partsupp UNION ALL
  SELECT 'nation',   COUNT(*)   FROM samples.tpch.nation   UNION ALL
  SELECT 'region',   COUNT(*)   FROM samples.tpch.region
),
bronze_counts AS (
  -- One row per bronze table: (table name, row count).
  SELECT 'customer' AS t, COUNT(*) AS c FROM bronze.customer UNION ALL
  SELECT 'orders',   COUNT(*)   FROM bronze.orders   UNION ALL
  SELECT 'lineitem', COUNT(*)   FROM bronze.lineitem UNION ALL
  SELECT 'supplier', COUNT(*)   FROM bronze.supplier UNION ALL
  SELECT 'part',     COUNT(*)   FROM bronze.part     UNION ALL
  SELECT 'partsupp', COUNT(*)   FROM bronze.partsupp UNION ALL
  SELECT 'nation',   COUNT(*)   FROM bronze.nation   UNION ALL
  SELECT 'region',   COUNT(*)   FROM bronze.region
)
SELECT
  src.t,
  src.c         AS source_cnt,  -- count in samples.tpch (was mislabeled silver_cnt)
  brz.c         AS bronze_cnt,
  src.c - brz.c AS diff
FROM source_counts AS src
INNER JOIN bronze_counts AS brz
  ON src.t = brz.t
WHERE src.c <> brz.c;

-- Make some_catalog.silver the session default before the silver tables are
-- built. The schema is (re)created defensively so this section can also be
-- run on its own.
USE CATALOG some_catalog;
CREATE SCHEMA IF NOT EXISTS silver;
USE SCHEMA silver;




-- silver.customer: bronze.customer with the c_ prefix dropped and the key
-- columns renamed to *_id.
CREATE OR REPLACE TABLE silver.customer AS
SELECT
  c.c_custkey    AS customer_id,
  c.c_name       AS name,
  c.c_address    AS address,
  c.c_nationkey  AS nation_id,
  c.c_phone      AS phone,
  c.c_acctbal    AS account_balance,
  c.c_mktsegment AS market_segment,
  c.c_comment    AS comment
FROM bronze.customer AS c;

-- silver.orders: bronze.orders with the o_ prefix dropped and the key
-- columns renamed to *_id.
CREATE OR REPLACE TABLE silver.orders AS
SELECT
  o.o_orderkey      AS order_id,
  o.o_custkey       AS customer_id,
  o.o_orderstatus   AS status,
  o.o_totalprice    AS total_price,
  o.o_orderdate     AS order_date,
  o.o_orderpriority AS priority,
  o.o_clerk         AS clerk,
  o.o_shippriority  AS ship_priority,
  o.o_comment       AS comment
FROM bronze.orders AS o;


-- The remaining silver tables are straight copies of their bronze
-- counterparts; column names are left unchanged.

CREATE OR REPLACE TABLE silver.lineitem AS
SELECT brz.*
FROM bronze.lineitem AS brz;

CREATE OR REPLACE TABLE silver.supplier AS
SELECT brz.*
FROM bronze.supplier AS brz;

CREATE OR REPLACE TABLE silver.part AS
SELECT brz.*
FROM bronze.part AS brz;

CREATE OR REPLACE TABLE silver.partsupp AS
SELECT brz.*
FROM bronze.partsupp AS brz;

CREATE OR REPLACE TABLE silver.nation AS
SELECT brz.*
FROM bronze.nation AS brz;

CREATE OR REPLACE TABLE silver.region AS
SELECT brz.*
FROM bronze.region AS brz;



-- Row-count reconciliation between bronze and silver. Only tables whose
-- counts differ are returned; an empty result means the layers agree.
WITH bronze_counts AS (
  SELECT 'customer' AS tbl, COUNT(*) AS cnt FROM bronze.customer
  UNION ALL SELECT 'orders',   COUNT(*) FROM bronze.orders
  UNION ALL SELECT 'lineitem', COUNT(*) FROM bronze.lineitem
  UNION ALL SELECT 'supplier', COUNT(*) FROM bronze.supplier
  UNION ALL SELECT 'part',     COUNT(*) FROM bronze.part
  UNION ALL SELECT 'partsupp', COUNT(*) FROM bronze.partsupp
  UNION ALL SELECT 'nation',   COUNT(*) FROM bronze.nation
  UNION ALL SELECT 'region',   COUNT(*) FROM bronze.region
),
silver_counts AS (
  SELECT 'customer' AS tbl, COUNT(*) AS cnt FROM silver.customer
  UNION ALL SELECT 'orders',   COUNT(*) FROM silver.orders
  UNION ALL SELECT 'lineitem', COUNT(*) FROM silver.lineitem
  UNION ALL SELECT 'supplier', COUNT(*) FROM silver.supplier
  UNION ALL SELECT 'part',     COUNT(*) FROM silver.part
  UNION ALL SELECT 'partsupp', COUNT(*) FROM silver.partsupp
  UNION ALL SELECT 'nation',   COUNT(*) FROM silver.nation
  UNION ALL SELECT 'region',   COUNT(*) FROM silver.region
)
SELECT
  bc.tbl          AS table_name,
  bc.cnt          AS bronze_count,
  sc.cnt          AS silver_count,
  bc.cnt - sc.cnt AS diff
FROM bronze_counts AS bc
INNER JOIN silver_counts AS sc
  ON bc.tbl = sc.tbl
WHERE bc.cnt <> sc.cnt;
-- Switch the session to some_catalog.monitoring (creating the schema if
-- needed) and make sure the data-quality log table exists.
USE CATALOG some_catalog;
CREATE SCHEMA IF NOT EXISTS monitoring;
USE SCHEMA monitoring;

-- One row per (run, table) comparing bronze and silver row counts.
CREATE TABLE IF NOT EXISTS monitoring.data_quality_log (
  run_timestamp TIMESTAMP,  -- when the check was executed
  table_name    STRING,     -- table being compared
  bronze_count  BIGINT,     -- row count in the bronze schema
  silver_count  BIGINT,     -- row count in the silver schema
  diff          BIGINT,     -- bronze_count - silver_count
  status        STRING      -- 'OK' or 'MISMATCH'
);


-- Append one log row per table comparing bronze and silver row counts.
-- The target columns are listed explicitly so the insert does not depend on
-- the physical column order of a pre-existing data_quality_log table (the
-- table is created with IF NOT EXISTS and may predate this notebook).
INSERT INTO monitoring.data_quality_log (
  run_timestamp,
  table_name,
  bronze_count,
  silver_count,
  diff,
  status
)
WITH bronze_ct AS (
  -- One row per bronze table: (table name, row count).
  SELECT 'customer' AS t, COUNT(*) AS c FROM bronze.customer UNION ALL
  SELECT 'orders', COUNT(*) FROM bronze.orders UNION ALL
  SELECT 'lineitem', COUNT(*) FROM bronze.lineitem UNION ALL
  SELECT 'supplier', COUNT(*) FROM bronze.supplier UNION ALL
  SELECT 'part', COUNT(*) FROM bronze.part UNION ALL
  SELECT 'partsupp', COUNT(*) FROM bronze.partsupp UNION ALL
  SELECT 'nation', COUNT(*) FROM bronze.nation UNION ALL
  SELECT 'region', COUNT(*) FROM bronze.region
),
silver_ct AS (
  -- One row per silver table: (table name, row count).
  SELECT 'customer' AS t, COUNT(*) AS c FROM silver.customer UNION ALL
  SELECT 'orders', COUNT(*) FROM silver.orders UNION ALL
  SELECT 'lineitem', COUNT(*) FROM silver.lineitem UNION ALL
  SELECT 'supplier', COUNT(*) FROM silver.supplier UNION ALL
  SELECT 'part', COUNT(*) FROM silver.part UNION ALL
  SELECT 'partsupp', COUNT(*) FROM silver.partsupp UNION ALL
  SELECT 'nation', COUNT(*) FROM silver.nation UNION ALL
  SELECT 'region', COUNT(*) FROM silver.region
)
SELECT
  current_timestamp() AS run_timestamp,
  b.t AS table_name,
  b.c AS bronze_count,
  s.c AS silver_count,
  (b.c - s.c) AS diff,
  -- Unlike the ad-hoc checks, matching tables are logged too (status 'OK').
  CASE WHEN b.c = s.c THEN 'OK' ELSE 'MISMATCH' END AS status
FROM bronze_ct AS b
INNER JOIN silver_ct AS s
  ON b.t = s.t;

In [0]:
# Declare PRIMARY KEY / FOREIGN KEY constraints on every TPC-H table in the
# bronze and silver schemas of some_catalog.
#
# Notes:
# - Key constraints are declared without the ENFORCED option: Databricks
#   treats PK/FK constraints as informational and does not accept ENFORCED.
# - Relies on the notebook-provided `spark` session.

tables = [
    "customer", "orders", "lineitem", "supplier", "part", "partsupp", "nation", "region"
]

# Primary-key columns per table, expressed with the bronze column names.
primary_keys = {
    "customer": ["c_custkey"],
    "orders": ["o_orderkey"],
    "lineitem": ["l_orderkey", "l_linenumber"],
    "supplier": ["s_suppkey"],
    "part": ["p_partkey"],
    "partsupp": ["ps_partkey", "ps_suppkey"],
    "nation": ["n_nationkey"],
    "region": ["r_regionkey"]
}

# Foreign keys per child table as (child_column, parent_table, parent_column),
# expressed with the bronze column names.
foreign_keys = {
    "orders": [("o_custkey", "customer", "c_custkey")],
    "lineitem": [
        ("l_orderkey", "orders", "o_orderkey"),
        ("l_partkey", "part", "p_partkey"),
        ("l_suppkey", "supplier", "s_suppkey")
    ],
    "customer": [("c_nationkey", "nation", "n_nationkey")],
    "supplier": [("s_nationkey", "nation", "n_nationkey")],
    "partsupp": [
        ("ps_partkey", "part", "p_partkey"),
        ("ps_suppkey", "supplier", "s_suppkey")
    ],
    "nation": [("n_regionkey", "region", "r_regionkey")]
}

# silver.customer and silver.orders are built with renamed columns, so the
# bronze key names above must be translated before they are used in silver.
column_renames = {
    "silver": {
        "customer": {"c_custkey": "customer_id", "c_nationkey": "nation_id"},
        "orders": {"o_orderkey": "order_id", "o_custkey": "customer_id"},
    }
}

schemas = ["bronze", "silver"]


def _resolve_column(schema, table, column):
    """Return the name `column` of `table` actually has in `schema`."""
    return column_renames.get(schema, {}).get(table, {}).get(column, column)


for schema in schemas:
    # Pass 1: primary keys. All of them are created before any foreign key so
    # that every referenced table already has its key declared.
    for table in tables:
        pk_cols = [
            _resolve_column(schema, table, col)
            for col in primary_keys.get(table, [])
        ]
        if not pk_cols:
            continue
        qualified_table = f"some_catalog.{schema}.{table}"
        # Primary-key columns must be NOT NULL; columns of tables created with
        # CREATE TABLE AS SELECT are typically nullable, so tighten them first.
        for col in pk_cols:
            spark.sql(f"ALTER TABLE {qualified_table} ALTER COLUMN {col} SET NOT NULL")
        pk_name = f"pk_{table}"
        pk_cols_str = ", ".join(pk_cols)
        spark.sql(f"""
            ALTER TABLE {qualified_table}
            ADD CONSTRAINT {pk_name} PRIMARY KEY ({pk_cols_str})
            """)

    # Pass 2: foreign keys, each added exactly once.
    for table in tables:
        for fk_col, ref_table, ref_col in foreign_keys.get(table, []):
            child_col = _resolve_column(schema, table, fk_col)
            parent_col = _resolve_column(schema, ref_table, ref_col)
            fk_name = f"fk_{table}_{child_col}_{ref_table}_{parent_col}"
            spark.sql(f"""
            ALTER TABLE some_catalog.{schema}.{table}
            ADD CONSTRAINT {fk_name} FOREIGN KEY ({child_col}) REFERENCES some_catalog.{schema}.{ref_table}({parent_col})
            """)