In [0]:
-- Make hive_metastore the current catalog for the statements that follow.
USE CATALOG hive_metastore;

In [0]:
-- DROP ALL TABLES
-- Reset step: removes the final-silver and dimension tables for both the
-- patient and insurance pipelines, plus the ETL log table.
-- IF EXISTS keeps the cell re-runnable when a table is already gone.
-- NOTE(review): a later cell in this notebook also deletes the bronze and
-- *_prefinal_silver storage paths, but no matching DROP TABLE appears here;
-- confirm whether those are registered tables that should be dropped too.

-- Patient pipeline
DROP TABLE IF EXISTS patient_final_silver;
DROP TABLE IF EXISTS patient_dim;

-- Insurance pipeline
DROP TABLE IF EXISTS insurance_final_silver;
DROP TABLE IF EXISTS insurance_dim;

-- ETL log (schema-qualified, unlike the tables above)
DROP TABLE IF EXISTS mini_proj_logs.etl_log;


In [0]:
-- Standalone drop of the ETL log table. This repeats the last statement of
-- the "DROP ALL TABLES" cell above; it is harmless because of IF EXISTS.
DROP TABLE IF EXISTS mini_proj_logs.etl_log;

In [0]:
%python
paths = [
    "s3a://mini-project-sathwik/bronze/patient_bronze",
    "s3a://mini-project-sathwik/bronze/insurance_bronze",
    
    "s3a://mini-project-sathwik/silver/patient_prefinal_silver",
    "s3a://mini-project-sathwik/silver/patient_final_silver",
    "s3a://mini-project-sathwik/gold/patient_dim",
    
    "s3a://mini-project-sathwik/silver/insurance_prefinal_silver",
    "s3a://mini-project-sathwik/silver/insurance_final_silver",
    "s3a://mini-project-sathwik/gold/insurance_dim",

    "s3a://mini-project-sathwik/logs/etl_log"
]

for p in paths:
    dbutils.fs.rm(p, recurse=True)


In [0]:
%python
# List what is left under the logs/ prefix, as a visual check after the
# cleanup cell above removed logs/etl_log.
# NOTE(review): if nothing remains under this prefix the listing may raise
# instead of returning an empty list -- confirm on the target workspace.
dbutils.fs.ls("s3a://mini-project-sathwik/logs/")


In [0]:
-- Databricks Notebook
-- ---------------------------------------------------------
-- Notebook 04: Validation & Quality Checks for SCD-2 Pipeline
-- ---------------------------------------------------------

------------------------------------------------------------
-- 1. Basic Record Counts
--    Reports the total number of rows (all versions) in the
--    patient dimension as a single metric/value pair.
------------------------------------------------------------
-- Disabled silver-layer count, kept for reference:
-- SELECT 'Patient Silver Count' AS metric, COUNT(*) AS value
-- FROM patient_silver;

SELECT
    'Patient Dim Count (Gold)' AS metric,
    COUNT(*)                   AS value
FROM patient_dim AS pd;

In [0]:
------------------------------------------------------------
-- 2. Identify All Current Active Records
--    Returns every column of the rows flagged is_current = 'Y',
--    sorted by patient.
------------------------------------------------------------
SELECT pd.*
FROM patient_dim AS pd
WHERE pd.is_current = 'Y'
ORDER BY pd.patient_id;

In [0]:
------------------------------------------------------------
-- 3. Full SCD History for Each Patient
--    All versions of every patient, oldest version first
--    within each patient_id.
------------------------------------------------------------
SELECT pd.*
FROM patient_dim AS pd
ORDER BY
    pd.patient_id,
    pd.effective_start_dt;

In [0]:
------------------------------------------------------------
-- 4. Show Only Updated Versioned Records
--    Rows flagged is_current = 'N', i.e. versions that are no
--    longer the active one, ordered by when they were closed.
------------------------------------------------------------
SELECT pd.*
FROM patient_dim AS pd
WHERE pd.is_current = 'N'
ORDER BY
    pd.patient_id,
    pd.effective_end_dt;

In [0]:
------------------------------------------------------------
-- 5. Validate SCD-2 Behavior:
--    Every patient must have exactly one row with
--    is_current = 'Y'. This query lists the violators, so an
--    empty result means the check passed.
------------------------------------------------------------
SELECT
    pd.patient_id,
    COUNT(*) AS total_versions,
    -- CASE without ELSE yields NULL for non-current rows, which COUNT skips,
    -- so this counts only the rows flagged 'Y'.
    COUNT(CASE WHEN pd.is_current = 'Y' THEN 1 END) AS active_version_count
FROM patient_dim AS pd
GROUP BY pd.patient_id
HAVING active_version_count <> 1;

In [0]:

------------------------------------------------------------
-- 6. Check If Any Records Have Incorrect End Date
--    A current row (is_current = 'Y') is expected to carry the
--    open-ended end date '9999-12-31'. This query lists current
--    rows that do not, so an empty result means the check passed.
--    A NULL end date is reported as well: `NULL <> '9999-12-31'`
--    evaluates to UNKNOWN and would otherwise be silently
--    filtered out by WHERE.
------------------------------------------------------------
SELECT *
FROM patient_dim
WHERE is_current = 'Y'
  AND (
        effective_end_dt IS NULL
        OR effective_end_dt <> '9999-12-31'
      )
ORDER BY patient_id;

In [0]:
------------------------------------------------------------
-- 8. Validate Surrogate Key Increment Logic
--    Lists each surrogate key with its patient and version
--    start date, in key order, for manual inspection of the
--    key sequence.
------------------------------------------------------------
SELECT
    pd.patient_dim_key,
    pd.patient_id,
    pd.effective_start_dt
FROM patient_dim AS pd
ORDER BY pd.patient_dim_key;


------------------------------------------------------------
--

In [0]:
-- Version overview of the insurance dimension: current flag and end date
-- of every row, grouped visually by policy.
SELECT
    idim.policy_id,
    idim.is_current,
    idim.effective_end_dt
FROM insurance_dim AS idim
ORDER BY
    idim.policy_id,
    idim.effective_end_dt;


In [0]:
-- Check whether a table named etl_log is visible in the current schema.
-- NOTE(review): the cleanup cells drop this table as mini_proj_logs.etl_log,
-- while the name here is unqualified -- confirm the current schema is the
-- one that holds the log table.
SHOW TABLES LIKE 'etl_log';


In [0]:
%python
spark.table("etl_log").show()


In [0]:
 %sql
-- ETL log entries, highest log_id first.
-- NOTE(review): the %sql line above starts with a space; magic commands are
-- normally written at column 0 -- confirm this cell still runs as SQL.
SELECT el.*
FROM etl_log AS el
ORDER BY el.log_id DESC;
