In [0]:
import dlt
import pyspark.sql.functions as F 

In [0]:
@dlt.table(
    name="bronze_addresses",
    comment="Raw address data from source cloud file",
    table_properties={"quality": "bronze"},
)
def create_bronze_addresses():
    """Stream raw address CSV files into the ``bronze_addresses`` table.

    Reads the landing directory with the ``cloudFiles`` streaming source
    (CSV format, column-type inference enabled) and keeps every source
    column as-is, appending two audit columns:

    * ``input_file_path`` - taken from ``_metadata.file_path``.
    * ``Ingest_timestamp`` - ``current_timestamp()`` evaluated at ingestion.

    Returns:
        A streaming DataFrame that DLT materialises as the bronze table.
    """
    landing_path = "/Volumes/circuitbox/landing/operational_data/addresses"

    raw_addresses = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.inferColumnTypes", "true")
        .load(landing_path)
    )

    # Audit columns are appended after all of the source columns.
    source_file = F.col("_metadata.file_path").alias("input_file_path")
    ingested_at = F.current_timestamp().alias("Ingest_timestamp")

    return raw_addresses.select("*", source_file, ingested_at)

In [0]:
@dlt.table(
    name="silver_addresses_clean",
    table_properties={"quality": "silver"},
    comment="Cleaned address data",
)
@dlt.expect_or_fail("valid_customer_id", "customer_id IS NOT NULL")
@dlt.expect_or_drop("valid_address", "address_line_1 IS NOT NULL")
@dlt.expect("valid_postcode", "LENGTH(postcode) = 5")
def create_silver_addresses_clean():
    """Project and validate address rows streamed from ``bronze_addresses``.

    Selects the address attributes from the bronze table and casts
    ``created_date`` to ``DATE``. Three expectations are attached to the
    resulting dataset:

    * ``valid_customer_id`` (``expect_or_fail``) - ``customer_id`` must not
      be NULL.
    * ``valid_address`` (``expect_or_drop``) - ``address_line_1`` must not
      be NULL.
    * ``valid_postcode`` (``expect``) - ``LENGTH(postcode) = 5``.

    Returns:
        A streaming DataFrame with columns ``customer_id``,
        ``address_line_1``, ``city``, ``state``, ``postcode`` and
        ``created_date``.
    """
    return (
        spark.readStream.table("LIVE.bronze_addresses")
        .select(
            "customer_id",
            "address_line_1",
            "city",
            "state",
            "postcode",
            # Alias explicitly: the apply_changes call in this notebook
            # sequences on "created_date", so the output column name should
            # not depend on Spark's implicit naming of cast expressions.
            F.col("created_date").cast("date").alias("created_date"),
        )
    )


In [0]:
# Declare the `silver_addresses` streaming table up front; the apply_changes
# flow defined in this notebook names it as its target.
dlt.create_streaming_table(
    name="silver_addresses",
    table_properties={"quality": "silver"},
    comment="SCD Type 2 history table for addresses",
)

In [0]:
# Change-data-capture flow from the cleaned addresses into the history table:
# rows are matched on `customer_id`, ordered by `created_date`, and stored
# as SCD Type 2.
# NOTE(review): `created_date` is cast to DATE in silver_addresses_clean, so
# several changes for one customer on the same day would share a sequence
# value - confirm the source never emits more than one change per day.
dlt.apply_changes(
    source="silver_addresses_clean",
    target="silver_addresses",
    keys=["customer_id"],
    sequence_by="created_date",
    stored_as_scd_type=2,
)