In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: reuses the active session when one
# exists, otherwise creates one registered under the ETL application name.
spark = (
    SparkSession.builder
    .appName("PatientInsuranceETL")
    .getOrCreate()
)


In [0]:
from pyspark.sql.functions import (
    trim, split, col, sha2, concat, current_timestamp, upper, regexp_replace, lit,
    coalesce, concat_ws, element_at, size, when,
)

# Load raw CSV (first row is the header; column types are inferred by Spark)
patient_raw = spark.read.csv("dbfs:/FileStore/Mini_Project/Source_data/Patient_Source.csv", header=True, inferSchema=True)

# Attributes whose values drive change detection (checksum_txt) for a patient row.
PATIENT_CHECKSUM_COLS = ["address", "phone_number", "bill_amount", "policy_id", "insurance_provider"]

# Business-key attributes hashed into primary_key.
PATIENT_KEY_COLS = ["patient_id", "name"]

# Tokenise the (already trimmed) name on runs of whitespace so that repeated
# spaces do not produce empty tokens.
name_parts = split(col("name"), r"\s+")

# Clean and transform
patient_landing = (
    patient_raw
    .withColumn("patient_id", trim(col("patient_id")))
    .withColumn("policy_id", trim(col("policy_id")))
    .withColumn("name", trim(col("name")))
    .withColumn("address", trim(col("address")))
    .withColumn("phone_number", regexp_replace(trim(col("phone_number")), " ", ""))  # remove spaces
    .withColumn("bill_amount", regexp_replace(trim(col("bill_amount")), " ", "").cast("double"))
    .withColumn("insurance_provider", upper(trim(col("insurance_provider"))))
    # Split full name: first token is the first name; the LAST token is the last
    # name (so "A B C" -> "C", not "B"). A single-token name has no last name (NULL).
    .withColumn("first_name", name_parts.getItem(0))
    .withColumn("last_name", when(size(name_parts) > 1, element_at(name_parts, -1)))
    # Generate primary key and checksum.
    # concat() returns NULL as soon as one input is NULL, which would make the
    # hash NULL and hide every later change to that row. Each input is therefore
    # cast to string, NULL is replaced by "", and the values are joined with a
    # delimiter so that ("ab", "c") and ("a", "bc") hash differently.
    .withColumn("primary_key", sha2(concat_ws("|", *[
        coalesce(col(c).cast("string"), lit("")) for c in PATIENT_KEY_COLS
    ]), 256))
    .withColumn("checksum_txt", sha2(concat_ws("|", *[
        coalesce(col(c).cast("string"), lit("")) for c in PATIENT_CHECKSUM_COLS
    ]), 256))
    # Metadata columns
    .withColumn("created_at", current_timestamp())
    .withColumn("updated_at", current_timestamp())
    .withColumn("load_ctl_key", lit(1001))
    .withColumn("is_current", lit("Y"))
    .withColumn("txn_type", lit("I"))
)

# Save landing table to DBFS (replaces any previous landing snapshot)
patient_landing.write.mode("overwrite").parquet("dbfs:/FileStore/Mini_Project/landing/patient_landing")
patient_landing.show(5, truncate=False)


In [0]:
# `col` is imported here as well so this cell does not depend on an earlier
# cell's imports.
from pyspark.sql.functions import (
    trim, upper, regexp_replace, sha2, concat, current_timestamp, lit,
    coalesce, col, concat_ws,
)

# Load insurance CSV (first row is the header; column types are inferred by Spark)
insurance_raw = spark.read.csv("dbfs:/FileStore/Mini_Project/Source_data/Insurance_Source.csv", header=True, inferSchema=True)

# Attributes whose values drive change detection (checksum_txt) for a policy row.
INSURANCE_CHECKSUM_COLS = ["insurance_provider", "claim_status", "amount_covered"]

# Clean and transform
insurance_landing = (
    insurance_raw
    .withColumn("policy_id", trim(col("policy_id")))
    .withColumn("insurance_provider", upper(trim(col("insurance_provider"))))
    .withColumn("claim_status", upper(trim(col("claim_status"))))
    .withColumn("amount_covered", regexp_replace(trim(col("amount_covered")), " ", "").cast("double"))
    # Generate checksum for SCD2.
    # concat() returns NULL when any input is NULL, which would null the whole
    # checksum; each input is cast to string, NULL becomes "", and a delimiter
    # keeps adjacent values from running together.
    .withColumn("checksum_txt", sha2(concat_ws("|", *[
        coalesce(col(c).cast("string"), lit("")) for c in INSURANCE_CHECKSUM_COLS
    ]), 256))
    # Metadata columns
    .withColumn("created_at", current_timestamp())
    .withColumn("updated_at", current_timestamp())
    .withColumn("load_ctl_key", lit(2001))
    .withColumn("is_current", lit("Y"))
    .withColumn("txn_type", lit("I"))
)

# Save landing table (replaces any previous landing snapshot)
insurance_landing.write.mode("overwrite").parquet("dbfs:/FileStore/Mini_Project/landing/insurance_landing")
insurance_landing.show(5, truncate=False)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, lit, current_timestamp

# Read back the patient landing snapshot from Parquet
patient_landing = spark.read.parquet("dbfs:/FileStore/Mini_Project/landing/patient_landing")

# A single un-partitioned window ordered by patient_id drives the surrogate key
window_patient = Window.orderBy("patient_id")

# Build the dimension in one pass: open the validity interval for every row
# (start = load time, end = the "9999-12-31" open-ended marker) and number the
# rows 1..N in patient_id order as patient_dim_key.
patient_dim = (
    patient_landing
    .withColumn("effective_start_dt", current_timestamp())
    .withColumn("effective_end_dt", lit("9999-12-31"))
    .withColumn("patient_dim_key", row_number().over(window_patient))
)

# Persist the dimension (replacing any earlier copy), then preview a few rows
patient_dim.write.mode("overwrite").parquet("dbfs:/FileStore/Mini_Project/dim/patient_dim")
patient_dim.show(5, truncate=False)


In [0]:
# Read back the insurance landing snapshot from Parquet
insurance_landing = spark.read.parquet("dbfs:/FileStore/Mini_Project/landing/insurance_landing")

# A single un-partitioned window ordered by policy_id drives the surrogate key
window_insurance = Window.orderBy("policy_id")

# Build the dimension in one pass: open the validity interval for every row
# (start = load time, end = the "9999-12-31" open-ended marker) and number the
# rows 1..N in policy_id order as insurance_dim_key.
insurance_dim = (
    insurance_landing
    .withColumn("effective_start_dt", current_timestamp())
    .withColumn("effective_end_dt", lit("9999-12-31"))
    .withColumn("insurance_dim_key", row_number().over(window_insurance))
)

# Persist the dimension (replacing any earlier copy), then preview a few rows
insurance_dim.write.mode("overwrite").parquet("dbfs:/FileStore/Mini_Project/dim/insurance_dim")
insurance_dim.show(5, truncate=False)


In [0]:
from pyspark.sql.functions import current_timestamp, col

# Read both dimensions back from their Parquet locations
patient_dim = spark.read.parquet("dbfs:/FileStore/Mini_Project/dim/patient_dim")
insurance_dim = spark.read.parquet("dbfs:/FileStore/Mini_Project/dim/insurance_dim")

# Left join keeps every patient row; insurance columns are NULL when the
# patient's policy_id has no counterpart in the insurance dimension.
patient_with_policy = patient_dim.alias("p").join(
    insurance_dim.alias("i"),
    on="policy_id",
    how="left",
)

# Project the surrogate keys, the business identifiers and the measures, and
# stamp each row with the time the snapshot was taken.
fact_df = patient_with_policy.select(
    col("p.patient_dim_key"),
    col("i.insurance_dim_key"),
    col("p.patient_id"),
    col("p.policy_id"),
    col("p.bill_amount"),
    col("i.amount_covered"),
    col("i.claim_status"),
    current_timestamp().alias("snapshot_date"),
)

# Persist the fact table (replacing any earlier copy), then preview a few rows
fact_df.write.mode("overwrite").parquet("dbfs:/FileStore/Mini_Project/fact/fact_patient_insurance")
fact_df.show(10, truncate=False)


In [0]:
%sql
-- Session-scoped view over the patient landing Parquet files, so later SQL
-- cells can query the landing snapshot by name.
CREATE OR REPLACE TEMP VIEW patient_landing_view
AS
    SELECT *
    FROM parquet.`dbfs:/FileStore/Mini_Project/landing/patient_landing`;


In [0]:
%sql
-- Materialise the patient dimension Parquet output as a Delta table
-- (any existing patient_dim table is replaced).
CREATE OR REPLACE TABLE patient_dim
    USING DELTA
AS
    SELECT *
    FROM parquet.`dbfs:/FileStore/Mini_Project/dim/patient_dim`;

-- Materialise the insurance dimension Parquet output as a Delta table
-- (any existing insurance_dim table is replaced).
CREATE OR REPLACE TABLE insurance_dim
    USING DELTA
AS
    SELECT *
    FROM parquet.`dbfs:/FileStore/Mini_Project/dim/insurance_dim`;


In [0]:
%sql
-- Classify every landing row against the current version held in patient_dim:
--   'I' = patient_id has no current dimension row (new record)
--   'U' = a current row exists but its checksum differs (changed record)
--   'N' = a current row exists with the same checksum (no change)
-- The checksums are compared with the null-safe operator <=>: a plain <>
-- evaluates to NULL when either side is NULL, which would silently classify a
-- changed row as 'N'.
CREATE OR REPLACE TEMP VIEW patient_landing_final AS
SELECT
    src.*,
    CASE
        WHEN dim.patient_id IS NULL THEN 'I'                               -- New record
        WHEN NOT (dim.checksum_txt <=> src.checksum_txt) THEN 'U'          -- Updated record
        ELSE 'N'                                                           -- No change
    END AS transaction_ind
FROM patient_landing_view src
LEFT JOIN patient_dim dim
       ON src.patient_id = dim.patient_id
      AND dim.is_current = 'Y';


In [0]:
# SCD Type 2 upsert of the patient landing snapshot into the patient_dim Delta table.
#
# Why the source is a staged sub-query rather than patient_landing_view directly:
#   * A changed patient MATCHES its current dimension row, so a plain merge can
#     only expire that row; the NOT MATCHED branch never fires for it and the
#     new version would never be inserted. The staged source therefore emits
#     each new-or-changed landing row once with merge_key = NULL (can never
#     match -> INSERT branch) and, for changed patients only, a second time
#     with merge_key = patient_id (matches the current row -> expire it).
#   * Surrogate keys are assigned as MAX(existing key) + ROW_NUMBER(), giving
#     each inserted row its own key. A scalar "MAX + 1" expression would hand
#     the same value to every row inserted by the statement.
#   * Checksums are compared with the null-safe <=> operator, so a NULL checksum
#     on either side counts as a difference instead of evaluating to NULL.
#
# Inserted rows also populate name, primary_key, created_at, updated_at and
# txn_type ('I' for a brand-new patient, 'U' for a new version of an existing
# one), so they carry the same columns as rows from the initial load.
# effective_end_dt keeps the '9999-12-31' string marker used when the dimension
# was first built.
spark.sql("""
MERGE INTO patient_dim AS dim
USING (
    WITH changed AS (
        SELECT
            lnd.*,
            cur.patient_id AS current_patient_id
        FROM patient_landing_view AS lnd
        LEFT JOIN patient_dim AS cur
               ON lnd.patient_id = cur.patient_id
              AND cur.is_current = 'Y'
        WHERE cur.patient_id IS NULL
           OR NOT (cur.checksum_txt <=> lnd.checksum_txt)
    ),
    keyed AS (
        SELECT
            c.*,
            mx.max_key + ROW_NUMBER() OVER (ORDER BY c.patient_id) AS new_patient_dim_key
        FROM changed AS c
        CROSS JOIN (
            SELECT COALESCE(MAX(patient_dim_key), 0) AS max_key
            FROM patient_dim
        ) AS mx
    )
    SELECT CAST(NULL AS STRING) AS merge_key, k.*
    FROM keyed AS k
    UNION ALL
    SELECT k.patient_id AS merge_key, k.*
    FROM keyed AS k
    WHERE k.current_patient_id IS NOT NULL
) AS src
ON dim.patient_id = src.merge_key
  AND dim.is_current = 'Y'

WHEN MATCHED AND NOT (dim.checksum_txt <=> src.checksum_txt) THEN
  UPDATE SET
    dim.effective_end_dt = current_timestamp(),
    dim.updated_at = current_timestamp(),
    dim.is_current = 'N'

WHEN NOT MATCHED THEN
  INSERT (
      patient_dim_key,
      patient_id,
      name,
      first_name,
      last_name,
      age,
      address,
      phone_number,
      bill_amount,
      insurance_provider,
      policy_id,
      primary_key,
      checksum_txt,
      load_ctl_key,
      created_at,
      updated_at,
      txn_type,
      effective_start_dt,
      effective_end_dt,
      is_current
  )
  VALUES (
      src.new_patient_dim_key,
      src.patient_id,
      src.name,
      src.first_name,
      src.last_name,
      src.age,
      src.address,
      src.phone_number,
      src.bill_amount,
      src.insurance_provider,
      src.policy_id,
      src.primary_key,
      src.checksum_txt,
      src.load_ctl_key,
      current_timestamp(),
      current_timestamp(),
      CASE WHEN src.current_patient_id IS NULL THEN 'I' ELSE 'U' END,
      current_timestamp(),
      '9999-12-31',
      'Y'
  )
""")
