In [0]:
import dlt
from pyspark.sql.functions import col, expr, count, current_timestamp

In [0]:
# --- 1. DIMENSION: GEOGRAPHY ---
@dlt.table(
    name="dim_geography",
    comment="Cleaned lookup for city coordinates."
)
@dlt.expect_or_drop("valid_city_name", "city_name IS NOT NULL")
@dlt.expect_or_fail("valid_coords", "lat IS NOT NULL AND lon IS NOT NULL")
def dim_geography():
    """Build the geography dimension from the ``bronze_geographic`` dataset.

    Projects four columns and de-duplicates the result. ``city_prepositional``
    is exposed under the name ``geo_id`` and serves as the join key.

    The expectations declared on this table drop rows whose ``city_name`` is
    NULL and fail the update when ``lat`` or ``lon`` is NULL.

    Returns:
        DataFrame with the distinct (geo_id, city_name, lat, lon) rows.
    """
    # Columns kept in the dimension; only the join key is renamed.
    geo_columns = [
        col("city_prepositional").alias("geo_id"),
        col("city_name"),
        col("lat"),
        col("lon"),
    ]

    bronze_geo = dlt.read("bronze_geographic")
    return bronze_geo.select(*geo_columns).distinct()

In [0]:

# --- 2. DIMENSION: CAR CONTENT ---
@dlt.table(
    name="dim_car_content",
    comment="Enriched car descriptions and photo counts."
)
def dim_car_content():
    """Build the car-content dimension: one description plus a photo count.

    Reads ``bronze_texts`` and left-joins it, on ``id``, with the number of
    non-NULL ``photo_url`` values per ``id`` found in ``bronze_photos``.

    Returns:
        DataFrame with columns ``car_id``, ``description`` and ``photo_count``.
        ``photo_count`` is 0 (not NULL) for ids that have no photo rows.
    """
    df_text = dlt.read("bronze_texts")

    # count() skips NULL photo_url values, so only real URLs are counted.
    df_photos = (dlt.read("bronze_photos")
                 .groupBy("id")
                 .agg(count("photo_url").alias("photo_count")))

    return (df_text
            .join(df_photos, on="id", how="left")
            .select(
                col("id").alias("car_id"),
                col("text").alias("description"),
                col("photo_count")
            )
            # The left join yields NULL for ids absent from bronze_photos;
            # a missing photo set means zero photos, so report 0 explicitly.
            .fillna(0, subset=["photo_count"]))


In [0]:
# --- 3. DIMENSION: TECHNICAL SPECS (Optional/Derived) ---
@dlt.table(
    name="dim_technical_specs",
    comment="Standardized technical details for cars."
)
def dim_technical_specs():
    """Build the technical-specs dimension.

    Returns:
        DataFrame holding the distinct combinations of ``marka``, ``model``,
        ``engine``, ``transmission`` and ``power`` found in
        ``bronze_transactions_unified``.
    """
    # Attributes that together identify one technical configuration.
    spec_columns = ["marka", "model", "engine", "transmission", "power"]

    transactions = dlt.read("bronze_transactions_unified")
    return transactions.select(*spec_columns).distinct()

In [0]:
# Shared validation predicate (SQL text) for transaction rows.
# A row counts as valid only when this expression evaluates to TRUE;
# the clean and quarantine tables both build their filters from it.
valid_rules = (
    "(transaction_id IS NOT NULL)"
    " AND (cost > 1000.0)"
)

# --- 4a. FACT: TRANSACTIONS (CLEAN DATA) ---
@dlt.table(
    name="fact_transactions",
    comment="Clean transactional car sales data."
)
@dlt.expect_or_fail("valid_schema", "transaction_id IS NOT NULL")
def fact_transactions():
    """Build the fact table of transactions that satisfy ``valid_rules``.

    Projects the transaction columns from ``bronze_transactions_unified``
    (renaming ``id`` to ``transaction_id`` and ``place`` to ``geo_id``) and
    keeps only the rows for which the shared ``valid_rules`` predicate is TRUE.

    Returns:
        DataFrame of transaction rows that pass the validation rules.
    """
    bronze = dlt.read("bronze_transactions_unified")

    # Rename the keys first so the rule text can refer to transaction_id.
    projected = bronze.select(
        col("id").alias("transaction_id"),
        col("place").alias("geo_id"),
        "marka",
        "model",
        "year",
        "cost",
        "currency",
        "has_license",
        "probeg",
        "load_timestamp",
    )

    # Rows where the predicate is FALSE or NULL are excluded here.
    return projected.filter(valid_rules)

# --- 4b. QUARANTINE: TRANSACTIONS (FAILED DATA) ---
@dlt.table(
    name="quarantine_transactions",
    comment="Data that failed quality checks for cost, year, or ID."
)
def quarantine_transactions():
    """Build the quarantine table: every transaction NOT accepted as valid.

    Applies the same projection as the clean fact table to
    ``bronze_transactions_unified`` and keeps the rows for which the shared
    ``valid_rules`` predicate is FALSE *or NULL*, tagging each with a
    ``quarantine_reason``.

    Returns:
        DataFrame of rejected transaction rows plus ``quarantine_reason``.
    """
    # NOTE(review): the table comment mentions a year check, but valid_rules
    # only tests transaction_id and cost -- confirm which one is intended.
    return (
        dlt.read("bronze_transactions_unified")
        .select(
            col("id").alias("transaction_id"),
            col("place").alias("geo_id"),
            "marka", "model", "year", "cost", "currency",
            "has_license", "probeg", "load_timestamp"
        )
        # Keep records that do not satisfy the rules. A plain NOT (...) is
        # NULL when the rules evaluate to NULL (e.g. cost IS NULL), and a
        # filter drops NULL; such rows would then appear in neither the clean
        # table nor here. coalesce(..., false) treats "unknown" as "failed".
        .filter(f"NOT coalesce(({valid_rules}), false)")
        # Same NULL-safety for the reason: a NULL or uncomparable cost is
        # reported as an invalid price instead of leaving the reason NULL.
        .withColumn("quarantine_reason", expr("""
            CASE 
                WHEN transaction_id IS NULL THEN 'Missing ID'
                WHEN NOT coalesce(cost > 1000.0, false) THEN 'Price too low or invalid'
            END
        """))
    )