In [0]:
%sql
-- Make hive_metastore.default the current catalog/schema, so the unqualified
-- table and view names used in the following cells resolve there
-- (until a later USE statement changes the current schema).
USE CATALOG hive_metastore;
USE SCHEMA default;


In [0]:
%run ./05_Logging_Utility


In [0]:
%python
from datetime import date

# Run metadata passed to every log_etl call in this cell.
# log_etl is not defined in this notebook; presumably it is provided by the
# `%run ./05_Logging_Utility` cell above -- confirm.
batch_id = date.today().strftime("%Y-%m-%d")  # calendar date of this run, YYYY-MM-DD
pipeline_name = "Patient Gold - Dimension"
layer = "Gold"
file_name = "patient_dim"

try:
    # Rebuild the gold patient dimension from patient_final_silver as a Delta
    # table stored at the explicit S3 LOCATION below.
    #
    # patient_sk: ordering the window by patient_id alone leaves the numbering
    # arbitrary whenever one patient_id has several rows (the start_date /
    # end_date columns suggest history rows exist), so the same input could get
    # different surrogate keys on each rebuild. start_date and end_date are
    # added as tie-breakers to make the assignment repeatable.
    # NOTE(review): the keys are still regenerated from scratch on every run,
    # so they are only stable for identical input.
    #
    # is_current: 'Y' for the open-ended row (end_date = '9999-12-31'), else 'N'.
    spark.sql("""
        CREATE OR REPLACE TABLE patient_dim
        USING DELTA
        LOCATION 's3a://mini-project-sathwik/gold/patient_dim'
        AS
        SELECT
            ROW_NUMBER() OVER (ORDER BY patient_id, start_date, end_date) AS patient_sk,
            patient_id,
            first_name,
            last_name,
            name,
            age,
            address,
            phone_number,
            bill_amount,
            insurance_provider,
            policy_id,
            start_date,
            end_date,
            CASE WHEN end_date = '9999-12-31' THEN 'Y' ELSE 'N' END AS is_current,
            activation_ind,
            ingest_time,
            source_file,
            create_timestamp,
            update_timestamp,
            load_ctl_key,
            checksum_txt
        FROM patient_final_silver
    """)

    # Count what was actually written (same approach as the insurance_dim cell).
    row_count = spark.table("patient_dim").count()

    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="SUCCESS",
        row_count=row_count,
        message="Patient dimension gold table created successfully"
    )

except Exception as e:
    # Record the failure, then re-raise so the notebook run still fails.
    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="FAILURE",
        row_count=0,
        message=str(e)
    )
    raise


In [0]:
-- Pass-through view over patient_dim; the metrics and gold summary queries
-- later in this notebook read from this view instead of the table.
CREATE OR REPLACE VIEW patient_dim_view AS
SELECT *
FROM patient_dim;

In [0]:
-- Display the contents of the patient dimension view.
SELECT *
FROM patient_dim_view;

In [0]:
%python
from datetime import date

# Run metadata passed to every log_etl call in this cell.
# log_etl is not defined in this notebook; presumably it is provided by the
# `%run ./05_Logging_Utility` cell above -- confirm.
batch_id = date.today().strftime("%Y-%m-%d")  # calendar date of this run, YYYY-MM-DD
pipeline_name = "Insurance Gold Load"
layer = "Gold"
file_name = "insurance_dim"

try:
    # Rebuild the gold insurance dimension from insurance_final_silver.
    #
    # insurance_sk: ordering the window by policy_id alone leaves the numbering
    # arbitrary whenever one policy_id has several SCD-2 versions, so the same
    # input could get different surrogate keys on each rebuild. start_date and
    # end_date are added as tie-breakers to make the assignment repeatable.
    # NOTE(review): unlike patient_dim, this table is created without an
    # explicit LOCATION -- confirm that is intended.
    spark.sql("""
        CREATE OR REPLACE TABLE insurance_dim
        USING DELTA
        AS
        SELECT
            -- Surrogate Key
            ROW_NUMBER() OVER (ORDER BY policy_id, start_date, end_date) AS insurance_sk,

            -- Business Key
            policy_id,

            -- Attributes
            insurance_provider,
            amount_covered,
            claim_status,

            -- SCD-2 Fields
            start_date,
            end_date,
            CASE WHEN end_date = '9999-12-31' THEN 'Y' ELSE 'N' END AS is_current,
            activation_ind,

            -- Audit / Lineage
            ingest_time,
            source_file,
            create_timestamp,
            update_timestamp,
            load_ctl_key,
            checksum_txt
        FROM insurance_final_silver
    """)

    # Count what was actually written and report it with the SUCCESS entry.
    row_count = spark.table("insurance_dim").count()
    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="SUCCESS",
        row_count=row_count,
        message="Insurance Gold table created successfully"
    )

except Exception as e:
    # Record the failure, then re-raise so the notebook run still fails.
    log_etl(
        pipeline_name=pipeline_name,
        batch_id=batch_id,
        file_name=file_name,
        layer=layer,
        status="FAILURE",
        row_count=0,
        message=str(e)
    )
    raise


In [0]:
-- Pass-through view over insurance_dim; the metrics and gold summary queries
-- later in this notebook read from this view instead of the table.
CREATE OR REPLACE VIEW insurance_dim_view AS
SELECT *
FROM insurance_dim;

In [0]:
-- Display the contents of the insurance dimension view.
SELECT *
FROM insurance_dim_view;

In [0]:
-- Schema that receives the *_accessible table copies and the *_metrics tables
-- written by the cells below. IF NOT EXISTS keeps re-runs from failing.
CREATE DATABASE IF NOT EXISTS mini_proj_gold;

In [0]:
%python
# Copy patient_dim into mini_proj_gold.patient_dim_accessible as a Delta table,
# replacing whatever the target held before (mode "overwrite").
df = spark.table("patient_dim")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("mini_proj_gold.patient_dim_accessible")
)


In [0]:
%python
# Copy insurance_dim into mini_proj_gold.insurance_dim_accessible as a Delta
# table, replacing whatever the target held before (mode "overwrite").
df = spark.table("insurance_dim")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("mini_proj_gold.insurance_dim_accessible")
)


In [0]:
%sql
-- Row-count metrics table for patient_dim; the next cell appends to it.
-- batch_id is a STRING, table_name names the measured table and total_rows
-- holds its COUNT(*).
CREATE TABLE IF NOT EXISTS mini_proj_gold.patient_dim_metrics (
    batch_id STRING,
    table_name STRING,
    total_rows BIGINT
)
USING DELTA;

In [0]:
%sql
-- Append one metrics row holding the current row count of patient_dim_view.
--
-- batch_id is written as today's date in 'yyyy-MM-dd' form so that it matches
-- the batch_id the Python load cells build with date.today().strftime("%Y-%m-%d").
-- The previous date_add(current_date(), 1) stamped the row with TOMORROW's date
-- and inserted a DATE into the STRING batch_id column via an implicit cast.
-- NOTE(review): current_date() follows the Spark session time zone -- confirm
-- it agrees with the driver's local date used by the Python cells.
-- NOTE(review): INSERT INTO appends, so re-running this cell on the same day
-- adds another row for the same batch_id.
INSERT INTO mini_proj_gold.patient_dim_metrics
SELECT
    date_format(current_date(), 'yyyy-MM-dd') AS batch_id,
    'patient_dim'                             AS table_name,
    COUNT(*)                                  AS total_rows
FROM patient_dim_view;


In [0]:
%sql
-- Row-count metrics table for insurance_dim; the next cell appends to it.
-- batch_id is a STRING, table_name names the measured table and total_rows
-- holds its COUNT(*).
CREATE TABLE IF NOT EXISTS mini_proj_gold.insurance_dim_metrics (
    batch_id STRING,
    table_name STRING,
    total_rows BIGINT
)
USING DELTA;

In [0]:
%sql
-- Append one metrics row holding the current row count of insurance_dim_view.
--
-- batch_id is written as today's date in 'yyyy-MM-dd' form so that it matches
-- the batch_id the Python load cells build with date.today().strftime("%Y-%m-%d").
-- The previous date_add(current_date(), 1) stamped the row with TOMORROW's date
-- and inserted a DATE into the STRING batch_id column via an implicit cast.
-- NOTE(review): current_date() follows the Spark session time zone -- confirm
-- it agrees with the driver's local date used by the Python cells.
-- NOTE(review): INSERT INTO appends, so re-running this cell on the same day
-- adds another row for the same batch_id.
INSERT INTO mini_proj_gold.insurance_dim_metrics
SELECT
    date_format(current_date(), 'yyyy-MM-dd') AS batch_id,
    'insurance_dim'                           AS table_name,
    COUNT(*)                                  AS total_rows
FROM insurance_dim_view;


In [0]:
-- Make mini_proj_logs the current schema. The next cell refers to its tables
-- with fully qualified names (mini_proj_logs.etl_log, ...), and unqualified
-- names resolve against this schema until another USE statement runs.
use mini_proj_logs

In [0]:
%python
# Copy the ETL log table into mini_proj_logs.etl_log_accessible as a Delta
# table, replacing whatever the target held before (mode "overwrite").
df = spark.table("mini_proj_logs.etl_log")
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("mini_proj_logs.etl_log_accessible")
)


### Now let's create a few business-oriented tables

In [0]:
-- Return to hive_metastore.default before building the business tables.
-- An earlier cell switched the current schema to mini_proj_logs, while the
-- queries below use unqualified names (insurance_dim_view, patient_dim_view,
-- gold_*) that live in `default`. The schema is therefore set explicitly here,
-- as in the first cell of the notebook, instead of relying on USE CATALOG
-- resetting it as a side effect.
USE CATALOG hive_metastore;
USE SCHEMA default;

In [0]:
-- Per claim_status, over current insurance rows only (is_current = 'Y'):
-- number of rows, total covered amount and average covered amount rounded to
-- two decimals.
CREATE OR REPLACE TABLE gold_claim_status_summary AS
SELECT
    ins.claim_status,
    COUNT(*)                          AS total_claims,
    SUM(ins.amount_covered)           AS total_amount,
    ROUND(AVG(ins.amount_covered), 2) AS avg_claim_amount
FROM insurance_dim_view AS ins
WHERE ins.is_current = 'Y'
GROUP BY ins.claim_status;


In [0]:
-- Display the claim-status summary built in the previous cell.
SELECT *
FROM gold_claim_status_summary

In [0]:
-- Per insurance_provider, over current insurance rows only (is_current = 'Y'):
-- number of rows, total covered amount, and how many of those rows have
-- claim_status = 'APPROVED'.
CREATE OR REPLACE TABLE gold_insurance_provider_perf AS
SELECT
    ins.insurance_provider,
    COUNT(*)                AS total_claims,
    SUM(ins.amount_covered) AS total_amount,
    SUM(
        CASE
            WHEN ins.claim_status = 'APPROVED' THEN 1
            ELSE 0
        END
    )                       AS approved_claims
FROM insurance_dim_view AS ins
WHERE ins.is_current = 'Y'
GROUP BY ins.insurance_provider;


In [0]:
-- Display the per-provider summary built in the previous cell.
SELECT *
FROM gold_insurance_provider_perf

In [0]:
-- One row per (patient_id, name, age) among current patient rows, with:
--   total_bill_amount    = sum of bill_amount over the patient's current rows
--   total_covered_amount = sum of amount_covered over current insurance rows
--                          joined on policy_id (0 when nothing matches)
--   uncovered_amount     = bill minus coverage, floored at 0 by GREATEST
CREATE OR REPLACE TABLE gold_patient_risk_profile AS
WITH coverage_per_patient AS (
    -- Coverage reachable from each current patient row. The is_current filter
    -- on insurance sits in the ON clause, so patients without a current policy
    -- are kept (their sum is NULL here).
    SELECT
        pat.patient_id,
        SUM(ins.amount_covered) AS total_covered_amount
    FROM patient_dim_view AS pat
    LEFT JOIN insurance_dim_view AS ins
        ON  ins.policy_id  = pat.policy_id
        AND ins.is_current = 'Y'
    WHERE pat.is_current = 'Y'
    GROUP BY pat.patient_id
),
bills_per_patient AS (
    -- Total billed amount per current patient.
    SELECT
        pat.patient_id,
        pat.name,
        pat.age,
        SUM(pat.bill_amount) AS total_bill_amount
    FROM patient_dim_view AS pat
    WHERE pat.is_current = 'Y'
    GROUP BY pat.patient_id, pat.name, pat.age
)
SELECT
    bills.patient_id,
    bills.name,
    bills.age,
    bills.total_bill_amount,
    COALESCE(cov.total_covered_amount, 0) AS total_covered_amount,
    GREATEST(
        bills.total_bill_amount - COALESCE(cov.total_covered_amount, 0),
        0
    ) AS uncovered_amount
FROM bills_per_patient AS bills
LEFT JOIN coverage_per_patient AS cov
    ON cov.patient_id = bills.patient_id;


In [0]:
-- Display the patient risk profile built in the previous cell.
SELECT *
FROM gold_patient_risk_profile

In [0]:
-- Per age bucket, over current patient rows (p.is_current = 'Y'):
--   total_patients = distinct patient_ids in the bucket
--   total_claims   = matched current insurance rows (joined on policy_id)
--   total_amount   = sum of amount_covered over those rows, 0 when none match
--
-- The i.is_current = 'Y' filter lives in the ON clause. When it was in WHERE,
-- rows whose insurance side was NULL were filtered out, which silently turned
-- the LEFT JOIN into an inner join and dropped every patient without a current
-- policy from total_patients. gold_patient_risk_profile already uses the
-- ON-clause form.
--
-- NOTE(review): a NULL age matches none of the WHEN branches and therefore
-- lands in the ELSE bucket '60+' -- confirm whether that is intended.
CREATE OR REPLACE TABLE gold_age_group_claims AS
SELECT
    CASE
        WHEN p.age < 18 THEN '0-17'
        WHEN p.age BETWEEN 18 AND 30 THEN '18-30'
        WHEN p.age BETWEEN 31 AND 45 THEN '31-45'
        WHEN p.age BETWEEN 46 AND 60 THEN '46-60'
        ELSE '60+'
    END AS age_group,
    COUNT(DISTINCT p.patient_id) AS total_patients,
    COUNT(i.policy_id) AS total_claims,
    COALESCE(SUM(i.amount_covered), 0) AS total_amount
FROM patient_dim_view p
LEFT JOIN insurance_dim_view i
    ON p.policy_id = i.policy_id
   AND i.is_current = 'Y'
WHERE p.is_current = 'Y'
GROUP BY age_group;


In [0]:
-- Display the age-group summary built in the previous cell.
SELECT *
FROM gold_age_group_claims