In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
@dlt.table(
    name="bronze_customers_dlt",
    comment="Bronze layer - Raw customer data with DLT"
)
def bronze_customers_dlt():
    """Bronze table: source customer rows tagged with ingestion metadata.

    Reads ``demo_project.bronze.source_customers`` and appends two columns:

    * ``dlt_ingestion_timestamp`` -- value of ``current_timestamp()``
    * ``dlt_ingestion_date`` -- value of ``current_date()``

    No rows are filtered and no source columns are removed.

    Returns:
        The source DataFrame with the two ingestion columns added.
    """
    # `spark` is not defined in this notebook; presumably the session object
    # is injected by the pipeline runtime -- confirm.
    source_customers = spark.table("demo_project.bronze.source_customers")
    stamped = source_customers.withColumn(
        "dlt_ingestion_timestamp", current_timestamp()
    )
    return stamped.withColumn("dlt_ingestion_date", current_date())

In [0]:
@dlt.table(
    name="silver_customers_dlt",
    comment="Silver layer - Clean data with quality checks"
)
@dlt.expect_or_drop("valid_customer_id", "customer_id IS NOT NULL")
@dlt.expect_or_drop("valid_email", "email RLIKE '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\\\.[A-Za-z]{2,}$'")
@dlt.expect("valid_plan", "subscription_plan IN ('Bronze', 'Silver', 'Gold')")
def silver_customers_dlt():
    """Silver table: projected customer rows, one per ``customer_id``.

    Reads ``bronze_customers_dlt``, keeps only the six business columns
    listed below (the ``dlt_ingestion_*`` columns are not selected), and
    de-duplicates on ``customer_id``.

    Quality rules declared on this table:

    * ``valid_customer_id`` (expect_or_drop): ``customer_id`` is not NULL.
    * ``valid_email`` (expect_or_drop): ``email`` matches the address pattern.
    * ``valid_plan`` (expect): ``subscription_plan`` is Bronze, Silver or Gold.

    Returns:
        DataFrame with columns ``customer_id``, ``full_name``, ``email``,
        ``city``, ``subscription_plan`` and ``account_status``.
    """
    kept_columns = [
        "customer_id",
        "full_name",
        "email",
        "city",
        "subscription_plan",
        "account_status",
    ]
    projected = dlt.read("bronze_customers_dlt").select(*kept_columns)
    # NOTE(review): the expectations above presumably apply to the rows
    # returned here, i.e. after de-duplication, and nothing in this function
    # fixes which duplicate survives. If a customer_id has both a valid and an
    # invalid row, the invalid one may be kept and then dropped -- confirm
    # this is acceptable.
    return projected.dropDuplicates(["customer_id"])

In [0]:
@dlt.table(
    name="gold_active_customers_dlt",
    comment="Gold layer - Active customers only"
)
def gold_active_customers_dlt():
    """Gold table: the subset of silver customers whose status is ACTIVE.

    Reads ``silver_customers_dlt`` and keeps rows satisfying the SQL
    predicate ``account_status = 'ACTIVE'``. The comparison is against the
    exact literal ``'ACTIVE'``; columns are passed through unchanged.

    Returns:
        DataFrame of silver customer rows with ``account_status = 'ACTIVE'``.
    """
    silver_customers = dlt.read("silver_customers_dlt")
    # `where` is an alias of `filter`; the predicate string is unchanged.
    return silver_customers.where("account_status = 'ACTIVE'")

In [0]:
@dlt.table(
    name="gold_plan_metrics_dlt",
    comment="Gold layer - Plan analytics"
)
def gold_plan_metrics_dlt():
    """Gold table: customer counts per subscription plan.

    Groups ``silver_customers_dlt`` by ``subscription_plan`` and computes:

    * ``total_customers`` -- number of rows in the group.
    * ``active_customers`` -- number of rows in the group whose
      ``account_status`` equals ``"ACTIVE"``.

    Returns:
        DataFrame with columns ``subscription_plan``, ``total_customers``
        and ``active_customers``.
    """
    # 1 for rows whose status equals "ACTIVE", 0 for every other row.
    active_flag = when(col("account_status") == "ACTIVE", 1).otherwise(0)

    per_plan = dlt.read("silver_customers_dlt").groupBy("subscription_plan")
    # `count` and `sum` are the Spark column functions brought in by the
    # wildcard import of pyspark.sql.functions, not the Python builtins.
    return per_plan.agg(
        count("*").alias("total_customers"),
        sum(active_flag).alias("active_customers"),
    )