In [0]:
# Databricks Notebook
# ---------------------------------------------------------
# Notebook 02: Clean, Transform and Prepare Silver Data
# ---------------------------------------------------------

from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# ---------------------------------------------
# Storage locations (Delta tables addressed by DBFS path)
# ---------------------------------------------
_project_root = "dbfs:/FileStore/Mini_Project"
_bronze_root = _project_root + "/bronze"
_silver_root = _project_root + "/silver"

# Bronze inputs read by this notebook.
bronze_patient_path = _bronze_root + "/patient_bronze"
bronze_insurance_path = _bronze_root + "/insurance_bronze"

# Silver outputs written by this notebook.
silver_patient_path = _silver_root + "/patient_silver"
silver_insurance_path = _silver_root + "/insurance_silver"

In [0]:
# ---------------------------------------------
# 1. Read Bronze Tables
# ---------------------------------------------
# Both bronze inputs are loaded by path using the Delta format.
df_patient_raw = spark.read.load(bronze_patient_path, format="delta")
df_insurance_raw = spark.read.load(bronze_insurance_path, format="delta")

# Show the incoming patient schema before any cleaning is applied.
print("Bronze Patient Schema:")
df_patient_raw.printSchema()

In [0]:
# ---------------------------------------------
# 2. Clean + Transform Patient Data (Silver)
# ---------------------------------------------
# Reusable column expressions. They stay unresolved until attached to a
# DataFrame, so each one is evaluated against the frame it is applied to.
_name_tokens = split(col("name"), " ")
_patient_checksum_inputs = [
    col(_field)
    for _field in (
        "address",
        "age",
        "phone_number",
        "bill_amount",
        "insurance_provider",
        "policy_id",
    )
]
_load_ts = current_timestamp()

# Step 1: strip surrounding whitespace from the plain text columns.
_patient_cleaned = df_patient_raw
for _text_col in ("patient_id", "name", "address", "insurance_provider", "policy_id"):
    _patient_cleaned = _patient_cleaned.withColumn(_text_col, trim(col(_text_col)))

# Step 2: cast the numeric columns and remove all whitespace from phone numbers.
_patient_cleaned = (
    _patient_cleaned
    .withColumn("age", col("age").cast("int"))
    .withColumn("phone_number", regexp_replace(trim(col("phone_number")), "\\s+", ""))
    .withColumn("bill_amount", col("bill_amount").cast("double"))
)

# Step 3: append the derived and audit columns.
df_patient_silver = (
    _patient_cleaned
    # The name is split on single spaces: token 0 -> first_name, token 1 -> last_name.
    # NOTE(review): only the second token is used as last_name; confirm the intended
    # handling of single-word and multi-part names.
    .withColumn("first_name", _name_tokens.getItem(0))
    .withColumn("last_name", _name_tokens.getItem(1))
    # Business key: MD5 over patient_id and the (trimmed) full name.
    .withColumn("primary_key", md5(concat_ws("|", col("patient_id"), col("name"))))
    # Change-detection checksum: MD5 over the '|'-joined changeable attributes.
    # NOTE(review): concat_ws skips NULL inputs, so a NULL in one attribute shifts the
    # remaining values; confirm this is acceptable for change detection.
    .withColumn("checksum_txt", md5(concat_ws("|", *_patient_checksum_inputs)))
    # Audit columns, all derived from the current timestamp of this load.
    .withColumn("create_timestamp", _load_ts)
    .withColumn("update_timestamp", _load_ts)
    .withColumn("load_ctl_key", date_format(_load_ts, "yyyyMMddHHmmss"))
)

# Make the cleaned frame available to SQL under a session-scoped name.
df_patient_silver.createOrReplaceTempView("patient_landing_cleaned")

print("Silver Patient Schema:")
df_patient_silver.printSchema()

In [0]:
# ---------------------------------------------
# 3. Write Silver Patient Delta Table
# ---------------------------------------------
# Replace the table with a single Delta overwrite instead of deleting the
# directory first. Deleting the path is not atomic (a failed write would leave
# no table at all) and discards the Delta transaction log and its history.
# `overwriteSchema` lets the overwrite also replace the schema, which is the
# case the directory delete was otherwise covering.
(
    df_patient_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_patient_path)
)

In [0]:
# Re-register the metastore table `patient_silver` on top of the silver Delta
# location: drop any existing entry first, then create it again.
_patient_silver_ddl = (
    "DROP TABLE IF EXISTS patient_silver",
    """
    CREATE TABLE IF NOT EXISTS patient_silver
    USING DELTA
    LOCATION 'dbfs:/FileStore/Mini_Project/silver/patient_silver'
""",
)
for _statement in _patient_silver_ddl:
    spark.sql(_statement)

print("Silver Patient Table Created Successfully.")

In [0]:
# ---------------------------------------------
# 4. Insurance Silver (Simple Clean)
# ---------------------------------------------
from pyspark.sql.functions import (
    col, trim, current_timestamp, lit, date_format,
    md5, concat_ws
)

df_insurance_silver = (
    df_insurance_raw
        # Strip surrounding whitespace from the text columns.
        .withColumn("policy_id", trim(col("policy_id")))
        .withColumn("insurance_provider", trim(col("insurance_provider")))
        .withColumn("claim_status", trim(col("claim_status")))
        .withColumn("amount_covered", col("amount_covered").cast("double"))
        # Change-detection checksum: MD5 over the '|'-joined changeable attributes.
        .withColumn("checksum_txt", md5(concat_ws("|",
            col("insurance_provider"),
            col("amount_covered"),
            col("claim_status")
        )))
        # Business key: MD5 of the trimmed policy_id.
        .withColumn("primary_key", md5(col("policy_id")))
        .withColumn("create_timestamp", current_timestamp())
        # Keep the 9999-12-31 sentinel but store it as a TIMESTAMP so the column
        # type matches its name and the patient silver table (previously a STRING).
        .withColumn("update_timestamp", lit("9999-12-31").cast("timestamp"))
        .withColumn("load_ctl_key", date_format(current_timestamp(), "yyyyMMddHHmmss"))
)

# Replace the table with a single Delta overwrite instead of deleting the
# directory first. Deleting the path is not atomic and discards the Delta
# transaction log; `overwriteSchema` lets the overwrite replace the schema too.
(
    df_insurance_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(silver_insurance_path)
)


In [0]:
# Re-register the metastore table `insurance_silver` on top of the silver Delta
# location: drop any existing entry first, then create it again.
_insurance_silver_ddl = (
    "DROP TABLE IF EXISTS insurance_silver",
    """
    CREATE TABLE IF NOT EXISTS insurance_silver
    USING DELTA
    LOCATION 'dbfs:/FileStore/Mini_Project/silver/insurance_silver'
""",
)
for _statement in _insurance_silver_ddl:
    spark.sql(_statement)

print("Silver Insurance Table Created Successfully.")

In [0]:
%sql
-- Expose the patient silver Delta files (addressed by path) to the SQL cells
-- below under the session-scoped view name patient_silver.
-- NOTE(review): a metastore table with the same name is registered earlier in
-- this notebook; confirm that shadowing it with a temp view is intended.
CREATE OR REPLACE TEMPORARY VIEW patient_silver AS
SELECT silver.*
FROM delta.`dbfs:/FileStore/Mini_Project/silver/patient_silver` AS silver;

In [0]:
%sql
-- Expose the insurance silver Delta files (addressed by path) to the SQL cells
-- below under the session-scoped view name insurance_silver.
-- NOTE(review): a metastore table with the same name is registered earlier in
-- this notebook; confirm that shadowing it with a temp view is intended.
CREATE OR REPLACE TEMPORARY VIEW insurance_silver AS
SELECT silver.*
FROM delta.`dbfs:/FileStore/Mini_Project/silver/insurance_silver` AS silver;

In [0]:
%sql
-- Patient dimension table, created only if it is not already registered.
-- The data lives at an explicit Delta location.
CREATE TABLE IF NOT EXISTS patient_dim (
    patient_dim_key     BIGINT GENERATED ALWAYS AS IDENTITY,  -- surrogate key, generated by Delta
    patient_id          STRING,      -- key used to match silver rows against this table
    first_name          STRING,
    last_name           STRING,
    age                 INT,
    address             STRING,
    phone_number        STRING,
    bill_amount         DOUBLE,
    insurance_provider  STRING,
    policy_id           STRING,
    checksum_txt        STRING,      -- compared with the silver checksum to detect changes
    load_ctl_key        STRING,
    effective_start_dt  TIMESTAMP,   -- presumably the start of the row's validity window; confirm
    effective_end_dt    TIMESTAMP,   -- presumably the end of the row's validity window; confirm
    is_current          STRING       -- 'Y' marks the row matched by the transaction-indicator join
)
USING DELTA
LOCATION 'dbfs:/FileStore/Mini_Project/dim/patient_dim';


In [0]:
%sql
-- Insurance dimension table, created only if it is not already registered.
-- load_ctl_key is STRING: the silver layer produces it as a 14-digit
-- 'yyyyMMddHHmmss' string, which does not fit in INT, and patient_dim already
-- declares the same column as STRING.
-- NOTE(review): IF NOT EXISTS leaves an already-created table untouched, so an
-- existing insurance_dim with an INT column must be altered or recreated separately.
CREATE TABLE IF NOT EXISTS insurance_dim (
    insurance_dim_key BIGINT,        -- surrogate key (auto incremental or generated in ETL)
    policy_id STRING,                -- business key
    insurance_provider STRING,
    amount_covered DOUBLE,
    claim_status STRING,
    checksum_txt STRING,             -- detect changes
    load_ctl_key STRING,             -- audit field ('yyyyMMddHHmmss' load key)
    effective_start_dt TIMESTAMP,    -- SCD2 validity start
    effective_end_dt TIMESTAMP,      -- SCD2 validity end
    is_current STRING                -- 'Y' or 'N'
)
USING DELTA
LOCATION 'dbfs:/FileStore/Mini_Project/dim/insurance_dim';


In [0]:
%sql
-- ---------------------------------------------
-- 3. Tag every silver patient row with a transaction indicator
--      'I' : no current dimension row exists for the patient_id
--      'U' : a current dimension row exists and its checksum differs
--      'N' : anything else
-- ---------------------------------------------
CREATE OR REPLACE TEMPORARY VIEW patient_silver_addTranscationindicator AS
SELECT
    stg.*,
    CASE
        WHEN cur.patient_id IS NULL THEN 'I'
        WHEN cur.checksum_txt <> stg.checksum_txt THEN 'U'
        ELSE 'N'
    END AS transaction_ind
FROM patient_silver AS stg
-- Only the current ('Y') dimension rows take part in the match.
LEFT OUTER JOIN patient_dim AS cur
    ON  cur.is_current = 'Y'
    AND cur.patient_id = stg.patient_id;

In [0]:
%sql
-- -----------------------------------------------------
-- 3. Tag every silver insurance row with a transaction indicator
--      'I' : no current dimension row exists for the policy_id
--      'U' : a current dimension row exists and its checksum differs
--      'N' : anything else
-- This is a GLOBAL temporary view: Spark registers it in the `global_temp`
-- schema, so readers must refer to it as global_temp.<view name>.
-- -----------------------------------------------------
CREATE OR REPLACE GLOBAL TEMPORARY VIEW insurance_silver_addTranscationindicator AS
SELECT
    stg.*,
    CASE
        WHEN cur.policy_id IS NULL THEN 'I'
        WHEN cur.checksum_txt <> stg.checksum_txt THEN 'U'
        ELSE 'N'
    END AS transaction_ind
FROM insurance_silver AS stg
-- Only the current ('Y') dimension rows take part in the match.
LEFT OUTER JOIN insurance_dim AS cur
    ON  cur.is_current = 'Y'
    AND cur.policy_id = stg.policy_id;



In [0]:
%sql
-- ---------------------------------------------
-- 4. Deduplicate landing data: keep one row per patient_id, the one ranked
--    first when ordering by update_timestamp descending. The helper column
--    rn is part of the view's output.
-- NOTE(review): later cells drop and then create a Delta TABLE with this same
-- name (ordered by load_ctl_key instead); confirm this view is still needed.
-- ---------------------------------------------
CREATE OR REPLACE TEMPORARY VIEW patient_final_silver AS
WITH ranked AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY patient_id
            ORDER BY update_timestamp DESC
        ) AS rn
    FROM patient_silver_addTranscationindicator
)
SELECT *
FROM ranked
WHERE rn = 1;

In [0]:
%sql
-- Remove any existing patient_final_silver before the next cell recreates it as a Delta table.
-- NOTE(review): the preceding cell creates a temp view with this same name; confirm whether
-- this statement is meant to remove that view, a previously created table, or both.
DROP TABLE IF EXISTS patient_final_silver;


In [0]:
%sql
-- ---------------------------------------------------
-- 2. Materialise the final patient silver data as a Delta table:
--    one row per patient_id, the one ranked first when ordering by
--    load_ctl_key descending. The helper column rn is stored in the table.
-- ---------------------------------------------------
CREATE OR REPLACE TABLE patient_final_silver
USING DELTA
LOCATION 'dbfs:/FileStore/Mini_Project/silver_final/patient_final_silver'
AS
WITH ranked AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY patient_id
            ORDER BY load_ctl_key DESC
        ) AS rn
    FROM patient_silver_addTranscationindicator
)
SELECT *
FROM ranked
WHERE rn = 1;


In [0]:
%sql
-- Remove any existing insurance_final_silver table before the next cell recreates it.
DROP TABLE IF EXISTS insurance_final_silver;


In [0]:
%sql
-- Materialise the final insurance silver data as a Delta table: one row per
-- policy_id, the one ranked first when ordering by load_ctl_key descending.
-- The helper column rn is stored in the table.
-- The source view is created as a GLOBAL temporary view, so it must be read
-- through the `global_temp` schema; the unqualified name does not resolve.
CREATE OR REPLACE TABLE insurance_final_silver
USING DELTA
LOCATION 'dbfs:/FileStore/Mini_Project/silver_final/insurance_final_silver'
AS
SELECT *
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY policy_id ORDER BY load_ctl_key DESC) AS rn
    FROM global_temp.insurance_silver_addTranscationindicator
) t
WHERE rn = 1;


In [0]:
%sql
-- Inspect the indicator assigned to each silver patient row next to its checksum.
SELECT
    patient_id,
    transaction_ind,
    checksum_txt
FROM patient_silver_addTranscationindicator;


In [0]:
%sql
-- Preview every column of the deduplicated patient_final_silver data.
SELECT *
FROM patient_final_silver;


In [0]:
# Print the column listing reported by DESCRIBE for patient_final_silver.
_patient_final_description = spark.sql("DESCRIBE patient_final_silver")
_patient_final_description.show()