In [0]:
%sql
-- Select the healthcare_analytics schema in the workspace catalog for this
-- session. The cells below reference tables as healthcare_analytics.<table>
-- without a catalog prefix, so they rely on this statement having run first.
USE workspace.healthcare_analytics;

In [0]:
from pyspark.sql.functions import col, to_date

# Silver patients: derive parsed date columns from BIRTHDATE / DEATHDATE and
# cast the expense, coverage and income columns to double, then overwrite the
# Delta table healthcare_analytics.silver_patients with the result.
patients_bronze = spark.table("healthcare_analytics.bronze_patients")

patients_silver = (
    patients_bronze
    .withColumn("birth_date", to_date(col("BIRTHDATE")))
    .withColumn("death_date", to_date(col("DEATHDATE")))
)

# Each pair is (output column name, bronze column to cast to double).
for target_name, source_name in (
    ("healthcare_expenses", "HEALTHCARE_EXPENSES"),
    ("healthcare_coverage", "HEALTHCARE_COVERAGE"),
    ("income", "INCOME"),
):
    patients_silver = patients_silver.withColumn(
        target_name, col(source_name).cast("double")
    )

(
    patients_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("healthcare_analytics.silver_patients")
)

In [0]:
from pyspark.sql.functions import col, to_timestamp, to_date

# Silver encounters: add timestamp/date columns parsed from START and STOP,
# cast the cost and coverage columns to double, and overwrite the Delta table
# healthcare_analytics.silver_encounters.
encounters_bronze = spark.table("healthcare_analytics.bronze_encounters")

encounters_silver = (
    encounters_bronze
    .withColumn("start_ts", to_timestamp(col("START")))
    .withColumn("stop_ts", to_timestamp(col("STOP")))
    # encounter_date is the date part of START.
    .withColumn("encounter_date", to_date(col("START")))
)

# Each pair is (output column name, bronze column to cast to double).
for target_name, source_name in (
    ("base_encounter_cost", "BASE_ENCOUNTER_COST"),
    ("total_claim_cost", "TOTAL_CLAIM_COST"),
    ("payer_coverage", "PAYER_COVERAGE"),
):
    encounters_silver = encounters_silver.withColumn(
        target_name, col(source_name).cast("double")
    )

(
    encounters_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("healthcare_analytics.silver_encounters")
)

In [0]:
from pyspark.sql.functions import (
    col,
    regexp_extract,
    regexp_replace,
    to_timestamp,
    when,
)

# Silver observations: parse DATE into `event_ts` and split VALUE into a
# numeric column (`value_num`) and a free-text column (`value_text`), then
# overwrite the Delta table healthcare_analytics.silver_observations.
#
# A value counts as numeric only when the WHOLE string, ignoring surrounding
# whitespace, is a plain signed decimal. Stripping every non-numeric character
# before testing (the previous approach) turned text with embedded digits into
# bogus numbers -- "120/80" -> 12080.0, "Stage 2" -> 2.0, "COVID-19" -> -19.0 --
# and discarded the original text. The trade-off is that decorated numbers
# such as "1,234" or "82.7 kg" are now kept as text instead of being parsed.
#
# regexp_extract returns "" when the pattern does not match and NULL when
# VALUE is NULL; in both cases the `when` below has no match and yields NULL.
numeric_text = regexp_extract(col("VALUE"), r"^\s*(-?\d+(?:\.\d+)?)\s*$", 1)

(
    spark.table("healthcare_analytics.bronze_observations")
    .withColumn("event_ts", to_timestamp(col("DATE")))
    .withColumn("value_num", when(numeric_text != "", numeric_text.cast("double")))
    # Keep the raw text only for rows that did not parse as a number.
    .withColumn("value_text", when(col("value_num").isNull(), col("VALUE")))
    .drop("DATE", "VALUE")
    .write.mode("overwrite").format("delta")
    .saveAsTable("healthcare_analytics.silver_observations")
)

In [0]:
# Silver providers: copy the bronze table as-is (no transformations applied),
# overwriting the Delta table healthcare_analytics.silver_providers.
providers_bronze = spark.table("healthcare_analytics.bronze_providers")

(
    providers_bronze.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("healthcare_analytics.silver_providers")
)

In [0]:
# Silver conditions: replace START / STOP with parsed date columns
# (condition_start / condition_stop) and overwrite the Delta table
# healthcare_analytics.silver_conditions.
conditions_bronze = spark.table("healthcare_analytics.bronze_conditions")

conditions_silver = (
    conditions_bronze
    .withColumn("condition_start", to_date(col("START")))
    .withColumn("condition_stop", to_date(col("STOP")))
    # The raw columns are dropped once their parsed versions exist.
    .drop("START", "STOP")
)

(
    conditions_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("healthcare_analytics.silver_conditions")
)

In [0]:
# Sanity check: print the column names and types of the silver encounters table.
silver_encounters = spark.table("healthcare_analytics.silver_encounters")
silver_encounters.printSchema()

root
 |-- Id: string (nullable = true)
 |-- START: timestamp (nullable = true)
 |-- STOP: timestamp (nullable = true)
 |-- PATIENT: string (nullable = true)
 |-- ORGANIZATION: string (nullable = true)
 |-- PROVIDER: string (nullable = true)
 |-- PAYER: string (nullable = true)
 |-- ENCOUNTERCLASS: string (nullable = true)
 |-- CODE: long (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- base_encounter_cost: double (nullable = true)
 |-- total_claim_cost: double (nullable = true)
 |-- payer_coverage: double (nullable = true)
 |-- REASONCODE: long (nullable = true)
 |-- REASONDESCRIPTION: string (nullable = true)
 |-- ingest_ts: timestamp (nullable = true)
 |-- source_system: string (nullable = true)
 |-- start_ts: timestamp (nullable = true)
 |-- stop_ts: timestamp (nullable = true)
 |-- encounter_date: date (nullable = true)

