In [0]:
# Load the raw patient extract: the first CSV row is the header and Spark infers
# the column types from the data.
# NOTE(review): inference may type ID-like columns (e.g. phone_number) as numbers,
# which would drop leading zeros before the cleansing cell trims them back to
# strings — confirm against the source file.
df_patient_raw = spark.read.csv(
    "dbfs:/FileStore/Mini_Project/Source_data/Patient_Source.csv",
    header=True,
    inferSchema=True,
)


In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Cleanse the raw patient extract and derive the columns the dimension load needs:
# trimmed/cast attributes, first/last name, an MD5 business key, an MD5 checksum of
# the changeable attributes, and load audit columns. The result is exposed to SQL
# as the temp view `patient_landing_cleaned`.

# Value stamped into load_ctl_key for every row produced by this run.
LOAD_CTL_KEY = 2001

# Split the (already trimmed) name on any run of whitespace, at most once:
# element 0 is the first name, element 1 is everything after it. Splitting on a
# single space and taking element 1 truncated multi-part surnames
# ("Mary Ann Smith" -> "Ann") and returned "" when two spaces separated the names.
name_parts = split(col("name"), "\\s+", 2)

df_patient_landing = (
    df_patient_raw
    # --- trim and type the source attributes ---
    .withColumn("patient_id", trim(col("patient_id")))
    .withColumn("name", trim(col("name")))
    .withColumn("age", trim(col("age")).cast("int"))
    .withColumn("address", trim(col("address")))
    # Remove every whitespace character from the phone number, not just the edges.
    .withColumn("phone_number", regexp_replace(trim(col("phone_number")), "\\s+", ""))
    .withColumn("bill_amount", trim(col("bill_amount")).cast("double"))
    .withColumn("insurance_provider", trim(col("insurance_provider")))
    .withColumn("policy_id", trim(col("policy_id")))
    # --- derived name parts (last_name is NULL for single-token names) ---
    .withColumn("first_name", name_parts.getItem(0))
    .withColumn("last_name", name_parts.getItem(1))
    # --- MD5 business key over patient_id and full name ---
    .withColumn("primary_key", md5(concat_ws("|", col("patient_id"), col("name"))))
    # --- MD5 checksum of the changeable columns, used for change detection ---
    # NOTE(review): concat_ws skips NULL inputs, so a NULL in one column shifts the
    # remaining values and two different rows can produce the same digest. Left
    # unchanged here because altering the formula changes digests that are compared
    # against patient_dim.checksum_txt values already stored.
    .withColumn(
        "checksum_txt",
        md5(
            concat_ws(
                "|",
                col("address"),
                col("phone_number"),
                col("bill_amount"),
                col("insurance_provider"),
                col("policy_id"),
            )
        ),
    )
    # --- audit columns ---
    .withColumn("current_timestamp", current_timestamp())
    .withColumn("updated_timestamp", current_timestamp())
    .withColumn("load_ctl_key", lit(LOAD_CTL_KEY))
)

df_patient_landing.createOrReplaceTempView("patient_landing_cleaned")


In [0]:
%sql
-- Classify every cleaned landing row against the current version of the patient
-- in patient_dim:
--   'I' = no current dimension row for this patient_id (insert)
--   'U' = a current row exists but its checksum differs (new version needed)
--   'N' = a current row exists with the same checksum (no change)
-- The comparison is null-safe (<=>): with a plain <>, a current dimension row whose
-- checksum_txt is NULL made the predicate evaluate to NULL, so the row fell through
-- to 'N' and could never be refreshed.
-- This view reads patient_dim, so that table must already exist when the cell runs.
CREATE OR REPLACE TEMP VIEW patient_landing_final AS
SELECT
    src.*,
    CASE
        WHEN dim.patient_id IS NULL THEN 'I'
        WHEN NOT (dim.checksum_txt <=> src.checksum_txt) THEN 'U'
        ELSE 'N'
    END AS transaction_ind
FROM patient_landing_cleaned src
LEFT JOIN patient_dim dim
    ON src.patient_id = dim.patient_id
   AND dim.is_current = 'Y';


In [0]:
# Materialise the classified landing rows (all columns of the
# patient_landing_final view) as Parquet, replacing any previous landing output.
df_final_patient = spark.table("patient_landing_final")

(
    df_final_patient.write
    .format("parquet")
    .mode("overwrite")
    .save("dbfs:/FileStore/Mini_Project/landing/patient_landing")
)


In [0]:
%sql
-- Rebuild both dimensions as Delta tables from their Parquet directories.
-- NOTE(review): CREATE OR REPLACE re-copies the Parquet contents on every run, so
-- any rows previously merged into patient_dim are replaced by the Parquet
-- snapshot — confirm this cell is meant as a one-time conversion.

-- Patient dimension -> Delta
CREATE OR REPLACE TABLE patient_dim USING DELTA AS
SELECT *
FROM parquet.`dbfs:/FileStore/Mini_Project/dim/patient_dim`;

-- Insurance dimension -> Delta
CREATE OR REPLACE TABLE insurance_dim USING DELTA AS
SELECT *
FROM parquet.`dbfs:/FileStore/Mini_Project/dim/insurance_dim`;


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, desc

# Keep exactly one landing row per patient_id so the MERGE never receives two
# source rows for the same patient.
#
# The landing column `current_timestamp` is produced by current_timestamp() in the
# cleansing query, which yields the same value for every row of that query, so on
# its own it cannot decide which duplicate wins. The extra sort keys make the
# surviving row reproducible every time this lazily-evaluated view is read.
# NOTE(review): the source carries no true "latest record" indicator; the
# tie-breakers below are arbitrary but stable — replace them with a real recency
# column if one becomes available.
windowSpec = Window.partitionBy("patient_id").orderBy(
    desc("current_timestamp"),
    "checksum_txt",
    "primary_key",
    "age",
)

df_patient_dedup = (
    spark.table("patient_landing_final")
    .withColumn("rn", row_number().over(windowSpec))
    .filter(col("rn") == 1)  # rn == 1 is the first row of each patient_id partition
    .drop("rn")
)

df_patient_dedup.createOrReplaceTempView("patient_landing_final_dedup")


In [0]:
# SCD Type 2 load of patient_dim from the de-duplicated landing view.
#
# A changed patient ('U') needs two actions in one MERGE: close the current
# dimension row AND insert the new version. A single source row can only trigger
# one action, and a 'U' row always matches its current dimension row, so a
# "WHEN NOT MATCHED AND transaction_ind = 'U'" clause can never fire — the old
# version would be closed and no new current row would be written.
#
# The source is therefore staged as two branches:
#   * merge_key = NULL        -> never matches; inserted as the new current row
#                                (new patients 'I' and new versions of 'U' patients)
#   * merge_key = patient_id  -> matches the current row of a 'U' patient and
#                                closes it
#
# Surrogate keys are assigned in the source as MAX(patient_dim_key) + ROW_NUMBER().
# A scalar "(SELECT MAX(...) + 1 ...)" inside INSERT VALUES is one value for the
# whole statement, so every inserted row would share the same key (and subqueries
# in MERGE actions may be rejected outright, depending on the Delta version).
# NOTE(review): MAX()+ROW_NUMBER() is not safe if several loads write to
# patient_dim concurrently — confirm this notebook is the only writer.
# Rows with transaction_ind = 'N' appear in neither branch and are left untouched.
spark.sql("""
MERGE INTO patient_dim AS dim
USING (
    -- Branch 1: rows that become the new current version (never match the target)
    SELECT
        NULL AS merge_key,
        key_base.max_key
            + ROW_NUMBER() OVER (ORDER BY landing.patient_id) AS new_patient_dim_key,
        landing.*
    FROM patient_landing_final_dedup AS landing
    CROSS JOIN (
        SELECT COALESCE(MAX(patient_dim_key), 0) AS max_key
        FROM patient_dim
    ) AS key_base
    WHERE landing.transaction_ind IN ('I', 'U')

    UNION ALL

    -- Branch 2: changed patients, keyed so they match the row that must be closed
    SELECT
        landing.patient_id AS merge_key,
        NULL AS new_patient_dim_key,
        landing.*
    FROM patient_landing_final_dedup AS landing
    WHERE landing.transaction_ind = 'U'
) AS src
ON dim.patient_id = src.merge_key
AND dim.is_current = 'Y'

WHEN MATCHED AND src.transaction_ind = 'U'
THEN UPDATE SET
    dim.is_current = 'N',
    dim.effective_end_dt = current_timestamp()

WHEN NOT MATCHED AND src.merge_key IS NULL
THEN INSERT (
      patient_dim_key,
      patient_id,
      first_name,
      last_name,
      age,
      address,
      phone_number,
      bill_amount,
      insurance_provider,
      policy_id,
      checksum_txt,
      load_ctl_key,
      effective_start_dt,
      effective_end_dt,
      is_current
)
VALUES (
      src.new_patient_dim_key,
      src.patient_id,
      src.first_name,
      src.last_name,
      src.age,
      src.address,
      src.phone_number,
      src.bill_amount,
      src.insurance_provider,
      src.policy_id,
      src.checksum_txt,
      src.load_ctl_key,
      current_timestamp(),
      '9999-12-31',
      'Y'
)
""")

In [0]:
%sql
-- Show the rows of patient_dim flagged as current, ordered by patient_id.
SELECT *
  FROM patient_dim
 WHERE is_current = 'Y'
 ORDER BY patient_id;
