# DLT pipeline

This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/data_quality_demo_pipeline.yml.

In [0]:
import dlt
import pyspark.sql.functions as F
from pyspark.sql.functions import expr

In [None]:
@dlt.view
def bronze_data_quality():
    """Expose the bronze data-quality table as a streaming DLT view.

    Returns:
        The result of a streaming read of
        ``data_quality_demo.bronze_data_quality``.
    """
    # `spark` is not defined in this file; it is expected to be supplied by
    # the runtime that executes the pipeline.
    bronze_table = "data_quality_demo.bronze_data_quality"
    stream_reader = spark.readStream
    return stream_reader.table(bronze_table)

In [None]:
# Key columns must be present: these expectations use the "or drop" variant.
@dlt.expect_all_or_drop(
    {
        "valid_sales_id": "sales_id IS NOT NULL",
        "valid_client_id": "client_id IS NOT NULL",
        "valid_product_id": "product_id IS NOT NULL",
    }
)
# Softer checks: declared with the plain (non-dropping) expectation variant.
@dlt.expect_all(
    {
        "valid_quantity": "quantity > 0",
        "valid_address": "address in ('New York', 'California', 'Texas', 'Florida', 'Illinois')",
        "valid_date": "date between '2020-01-01' and '2025-01-01'",
    }
)
# NOTE(review): the table comment below mentions /databricks-datasets, but the
# function body reads from the "bronze_data_quality" view -- confirm wording.
@dlt.table(
    name="_temp_bronze_clean",
    comment="The raw sales orders, ingested from /databricks-datasets.",
    table_properties={
        "myCompanyPipeline.quality": "bronze",
        "pipelines.autoOptimize.managed": "true",
    },
    temporary=True,
)
def temp_silver():
    """Build the temporary ``_temp_bronze_clean`` table.

    Reads the ``bronze_data_quality`` dataset as a stream and appends a
    ``process_timestamp`` column holding ``current_timestamp()``.

    Returns:
        The streaming DataFrame with the extra ``process_timestamp`` column.
    """
    bronze_stream = dlt.readStream("bronze_data_quality")
    return bronze_stream.withColumn("process_timestamp", F.current_timestamp())

In [None]:
def create_silver(table_name, cols, ids, scd_type=1):
    """Declare a silver streaming table and its change feed.

    Creates the streaming table ``silver_<table_name>_dlt`` and registers an
    ``apply_changes`` flow into it from ``_temp_bronze_clean``, sequenced by
    the ``process_timestamp`` column.

    Args:
        table_name: Suffix used to build the target table name and comment.
        cols: Column names passed to ``apply_changes`` as ``column_list``.
        ids: Column names passed to ``apply_changes`` as ``keys``.
        scd_type: Value passed as ``stored_as_scd_type``; defaults to 1.
    """
    target_table = f"silver_{table_name}_dlt"
    description = "Clean table: " + table_name

    dlt.create_streaming_table(name=target_table, comment=description)

    # Only the options below are set; `apply_as_deletes` and
    # `except_column_list` are intentionally left unset here.
    dlt.apply_changes(
        target=target_table,
        source="_temp_bronze_clean",
        keys=ids,
        sequence_by=F.col("process_timestamp"),
        column_list=cols,
        stored_as_scd_type=scd_type,
    )

In [None]:
# Configuration for the clients silver table.
client_silver_table = "clients"
# Columns carried into the clients table.
client_cols = [
    "client_id",
    "first_name",
    "last_name",
    "gender",
    "age",
    "email",
    "signup_date",
    "address",
]
# Key column(s) identifying a client record.
client_ids = ["client_id"]

In [None]:
# Clients are the only table registered with SCD type 2 in this notebook.
create_silver(
    table_name=client_silver_table,
    cols=client_cols,
    ids=client_ids,
    scd_type=2,
)

In [None]:
# Configuration for the products silver table.
product_silver_table = "products"
# Columns carried into the products table.
product_cols = [
    "product_id",
    "product_name",
    "category",
    "price",
]
# Key column(s) identifying a product record.
product_ids = ["product_id"]

In [None]:
# Uses create_silver's default scd_type.
create_silver(
    table_name=product_silver_table,
    cols=product_cols,
    ids=product_ids,
)

In [None]:
# Configuration for the sales silver table.
sales_silver_table = "sales"
# Columns carried into the sales table.
sales_cols = [
    "date",
    "sales_id",
    "client_id",
    "product_id",
    "quantity",
    "sale_amount",
]
# Key column(s) identifying a sales record.
sales_ids = ["sales_id"]

In [None]:
# Uses create_silver's default scd_type.
create_silver(
    table_name=sales_silver_table,
    cols=sales_cols,
    ids=sales_ids,
)