In [0]:
-- Notebook parameters (text widgets). `catalog` / `bronze_schema` locate the
-- bronze tables; `source_catalog` / `source_schema` locate the tables they are
-- compared against in the checks below.
CREATE WIDGET TEXT catalog        DEFAULT 'first_phase';
CREATE WIDGET TEXT bronze_schema  DEFAULT 'bronze';
CREATE WIDGET TEXT source_catalog DEFAULT 'samples';
CREATE WIDGET TEXT source_schema  DEFAULT 'tpch';

-- Make the bronze catalog and schema the session defaults.
USE CATALOG ${catalog};
USE SCHEMA ${bronze_schema};


Basic check that the number of rows in the original data and in the data transferred to the bronze schema is the same

In [0]:
-- Row-count reconciliation: one output row per table, with the source row
-- count, the bronze row count, and a boolean telling whether they are equal.
WITH src AS (
            SELECT 'customer' AS tbl, COUNT(*) AS row_cnt FROM ${source_catalog}.${source_schema}.customer
  UNION ALL SELECT 'orders',   COUNT(*) FROM ${source_catalog}.${source_schema}.orders
  UNION ALL SELECT 'lineitem', COUNT(*) FROM ${source_catalog}.${source_schema}.lineitem
  UNION ALL SELECT 'nation',   COUNT(*) FROM ${source_catalog}.${source_schema}.nation
  UNION ALL SELECT 'region',   COUNT(*) FROM ${source_catalog}.${source_schema}.region
  UNION ALL SELECT 'part',     COUNT(*) FROM ${source_catalog}.${source_schema}.part
  UNION ALL SELECT 'partsupp', COUNT(*) FROM ${source_catalog}.${source_schema}.partsupp
  UNION ALL SELECT 'supplier', COUNT(*) FROM ${source_catalog}.${source_schema}.supplier
),
brz AS (
            SELECT 'customer' AS tbl, COUNT(*) AS row_cnt FROM ${catalog}.${bronze_schema}.customer
  UNION ALL SELECT 'orders',   COUNT(*) FROM ${catalog}.${bronze_schema}.orders
  UNION ALL SELECT 'lineitem', COUNT(*) FROM ${catalog}.${bronze_schema}.lineitem
  UNION ALL SELECT 'nation',   COUNT(*) FROM ${catalog}.${bronze_schema}.nation
  UNION ALL SELECT 'region',   COUNT(*) FROM ${catalog}.${bronze_schema}.region
  UNION ALL SELECT 'part',     COUNT(*) FROM ${catalog}.${bronze_schema}.part
  UNION ALL SELECT 'partsupp', COUNT(*) FROM ${catalog}.${bronze_schema}.partsupp
  UNION ALL SELECT 'supplier', COUNT(*) FROM ${catalog}.${bronze_schema}.supplier
)
SELECT
  tbl,
  src.row_cnt               AS original_source_rows,
  brz.row_cnt               AS bronze_rows,
  (src.row_cnt = brz.row_cnt) AS counts_match
FROM src
INNER JOIN brz USING (tbl)
ORDER BY tbl;


Looking for columns that are missing from or extra in the bronze schema, and for columns whose data types do not match the source

In [0]:
-- Columns missing from bronze: every source-schema column for which the
-- bronze schema has no column with the same table name and column name.
SELECT
  'not_in_bronze' AS issue,
  src_col.table_name,
  src_col.column_name,
  src_col.data_type
FROM ${source_catalog}.information_schema.columns AS src_col
WHERE src_col.table_schema = '${source_schema}'
  AND NOT EXISTS (
    SELECT 1
    FROM ${catalog}.information_schema.columns AS brz_col
    WHERE brz_col.table_schema = '${bronze_schema}'
      AND brz_col.table_name   = src_col.table_name
      AND brz_col.column_name  = src_col.column_name
  )
ORDER BY src_col.table_name, src_col.ordinal_position;

In [0]:
-- Columns only in bronze: every bronze-schema column for which the source
-- schema has no column with the same table name and column name.
SELECT
  'extra_in_bronze' AS issue,
  brz_col.table_name,
  brz_col.column_name,
  brz_col.data_type
FROM ${catalog}.information_schema.columns AS brz_col
WHERE brz_col.table_schema = '${bronze_schema}'
  AND NOT EXISTS (
    SELECT 1
    FROM ${source_catalog}.information_schema.columns AS src_col
    WHERE src_col.table_schema = '${source_schema}'
      AND src_col.table_name   = brz_col.table_name
      AND src_col.column_name  = brz_col.column_name
  )
ORDER BY brz_col.table_name, brz_col.ordinal_position;


In [0]:
-- Type-mismatch check: columns that exist on both sides (same table name and
-- column name) but whose declared types differ.
--
-- `data_type` carries only the simple type name, so on its own it cannot tell
-- apart, for example, two DECIMAL columns with different precision/scale.
-- `full_data_type` (the complete type definition) is therefore compared as
-- well, and both full types are reported next to the simple ones.
SELECT
  'type_mismatch'   AS issue,
  s.table_name,
  s.column_name,
  s.data_type       AS src_type,
  b.data_type       AS bronze_type,
  s.full_data_type  AS src_full_type,
  b.full_data_type  AS bronze_full_type
FROM ${source_catalog}.information_schema.columns s
JOIN ${catalog}.information_schema.columns b
  ON b.table_schema = '${bronze_schema}'
  AND b.table_name = s.table_name
  AND b.column_name = s.column_name
WHERE s.table_schema = '${source_schema}'
  AND (s.data_type <> b.data_type OR s.full_data_type <> b.full_data_type)
ORDER BY s.table_name, s.column_name;


Checking if the primary keys are unique

In [0]:
-- Key-uniqueness check: for each bronze table, `duplicates` is the number of
-- distinct key values (or key combinations) that occur on more than one row.
-- A value of 0 means the key is unique in that table.
WITH duplicate_results AS (
  SELECT 'customer' AS table_name, COUNT(*) AS duplicates
  FROM (
    SELECT c_custkey
    FROM ${catalog}.${bronze_schema}.customer
    GROUP BY c_custkey
    HAVING COUNT(*) > 1
  ) AS dup_customer

  UNION ALL
  SELECT 'orders', COUNT(*)
  FROM (
    SELECT o_orderkey
    FROM ${catalog}.${bronze_schema}.orders
    GROUP BY o_orderkey
    HAVING COUNT(*) > 1
  ) AS dup_orders

  UNION ALL
  SELECT 'lineitem', COUNT(*)
  FROM (
    -- composite key: order key + line number
    SELECT l_orderkey, l_linenumber
    FROM ${catalog}.${bronze_schema}.lineitem
    GROUP BY l_orderkey, l_linenumber
    HAVING COUNT(*) > 1
  ) AS dup_lineitem

  UNION ALL
  SELECT 'nation', COUNT(*)
  FROM (
    SELECT n_nationkey
    FROM ${catalog}.${bronze_schema}.nation
    GROUP BY n_nationkey
    HAVING COUNT(*) > 1
  ) AS dup_nation

  UNION ALL
  SELECT 'region', COUNT(*)
  FROM (
    SELECT r_regionkey
    FROM ${catalog}.${bronze_schema}.region
    GROUP BY r_regionkey
    HAVING COUNT(*) > 1
  ) AS dup_region

  UNION ALL
  SELECT 'part', COUNT(*)
  FROM (
    SELECT p_partkey
    FROM ${catalog}.${bronze_schema}.part
    GROUP BY p_partkey
    HAVING COUNT(*) > 1
  ) AS dup_part

  UNION ALL
  SELECT 'supplier', COUNT(*)
  FROM (
    SELECT s_suppkey
    FROM ${catalog}.${bronze_schema}.supplier
    GROUP BY s_suppkey
    HAVING COUNT(*) > 1
  ) AS dup_supplier

  UNION ALL
  SELECT 'partsupp', COUNT(*)
  FROM (
    -- composite key: part key + supplier key
    SELECT ps_partkey, ps_suppkey
    FROM ${catalog}.${bronze_schema}.partsupp
    GROUP BY ps_partkey, ps_suppkey
    HAVING COUNT(*) > 1
  ) AS dup_partsupp
)
SELECT table_name, duplicates
FROM duplicate_results
ORDER BY table_name;



Checking for NULL values

In [0]:
-- NULL audit: for each (table, column) pair listed in the inner query, report
-- the total number of rows and how many of them hold NULL in that column.
SELECT
  tbl,
  col,
  COUNT(*)              AS rows,
  -- COUNT(val) skips NULLs, so the difference is the number of NULL values
  COUNT(*) - COUNT(val) AS nulls
FROM (
            SELECT 'customer' AS tbl, 'c_custkey' AS col, c_custkey AS val FROM ${catalog}.${bronze_schema}.customer
  UNION ALL SELECT 'orders',   'o_orderkey', o_orderkey                 FROM ${catalog}.${bronze_schema}.orders
  UNION ALL SELECT 'lineitem', 'l_orderkey', l_orderkey                 FROM ${catalog}.${bronze_schema}.lineitem
  UNION ALL SELECT 'lineitem', 'l_quantity', CAST(l_quantity AS DOUBLE) FROM ${catalog}.${bronze_schema}.lineitem
  UNION ALL SELECT 'part',     'p_partkey',  p_partkey                  FROM ${catalog}.${bronze_schema}.part
) AS probed
GROUP BY tbl, col
