In [0]:
from pyspark.sql.functions import (
    col, sum as _sum, countDistinct, date_trunc
)

spark.conf.set(
    "fs.azure.account.key.healthcarestoragerk.dfs.core.windows.net",
    "xxxxx"
)

silver_base = "abfss://silver@healthcarestoragerk.dfs.core.windows.net"
gold_base   = "abfss://gold@healthcarestoragerk.dfs.core.windows.net"

fact_path      = f"{silver_base}/fact_encounter"
dim_org_path   = f"{silver_base}/dim_organization"

fact = spark.read.format("delta").load(fact_path)
dim_org = spark.read.format("delta").load(dim_org_path)

# 1) Encounters & revenue by organization per month

org_month = (
    fact
    .withColumn("month", date_trunc("month", col("admission_time")))
    .groupBy("organization_id", "organization_name", "organization_state", "month")
    .agg(
        countDistinct("encounter_id").alias("encounter_count"),
        _sum("total_claim_cost").alias("total_claim_cost"),
        _sum("total_payments").alias("total_payments")
    )
)

org_month_path = f"{gold_base}/encounters_by_org_month"
(
    org_month.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(org_month_path)
)
print(f"Gold: encounters_by_org_month -> {org_month_path}")

# 2) Encounters by department

dept_stats = (
    fact
    .groupBy("department")
    .agg(
        countDistinct("encounter_id").alias("encounter_count"),
        _sum("total_claim_cost").alias("total_claim_cost"),
        _sum("total_payments").alias("total_payments")
    )
)

dept_path = f"{gold_base}/encounters_by_department"
(
    dept_stats.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(dept_path)
)
print(f"Gold: encounters_by_department -> {dept_path}")

# 3) Payer coverage summary

payer_stats = (
    fact
    .groupBy("payer_id")
    .agg(
        countDistinct("encounter_id").alias("encounter_count"),
        _sum("total_claim_cost").alias("total_claim_cost"),
        _sum("payer_coverage").alias("total_payer_coverage"),
        _sum("total_payments").alias("total_payments")
    )
)

payer_path = f"{gold_base}/payer_coverage_summary"
(
    payer_stats.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(payer_path)
)
print(f"Gold: payer_coverage_summary -> {payer_path}")

print("\nGold marts built successfully")


Gold: encounters_by_org_month -> abfss://gold@healthcarestoragerk.dfs.core.windows.net/encounters_by_org_month
Gold: encounters_by_department -> abfss://gold@healthcarestoragerk.dfs.core.windows.net/encounters_by_department
Gold: payer_coverage_summary -> abfss://gold@healthcarestoragerk.dfs.core.windows.net/payer_coverage_summary

Gold marts built successfully


In [0]:
display(spark.read.format("delta").load("abfss://gold@healthcarestoragerk.dfs.core.windows.net/encounters_by_org_month"))


organization_id,organization_name,organization_state,month,encounter_count,total_claim_cost,total_payments
6352f08d-6323-3a16-bfb5-205ac0332cf3,PCP21854,MA,1992-12-01T00:00:00Z,1,926.85,129.16
37b4d73f-652d-3033-a16e-d97b9e8b4cda,ANNA JAQUES HOSPITAL,MA,2018-09-01T00:00:00Z,3,38589.81,35690.77
9a01482a-dc55-3916-9fe9-016717fe40a4,PCP353491,MA,2019-04-01T00:00:00Z,1,6100.85,1483.71
a0b6ec0c-e587-3b2a-bf9f-248849f29ee5,ST ELIZABETH'S MEDICAL CENTER,MA,2020-04-01T00:00:00Z,1,10202.59,10202.59
607ea852-d7e7-39c7-9a61-960e6b74136a,PCP183131,MA,1988-12-01T00:00:00Z,1,3931.65,689.5799999999999
4861d01f-019c-3dac-a153-8334e50919f9,NORTH SHORE MEDICAL CENTER -,MA,1987-12-01T00:00:00Z,1,258.32,204.16
ccece44c-904d-3542-8de8-ac8618edfbc6,PCP20907,MA,1988-09-01T00:00:00Z,1,786.33,129.16
b06770b5-b695-3c1a-9685-ae2223966ff7,GENERAL HEALTH CARE,MA,2016-05-01T00:00:00Z,1,269.68,269.68
2cacbb8e-b655-3f0c-b6a4-685f666e4544,PCP11360,MA,1986-04-01T00:00:00Z,1,2780.55,129.2
ae14cb98-56a5-3fb9-bc6f-d3d9949e958c,PCP80958,MA,2018-03-01T00:00:00Z,1,2358.99,786.3599999999999
