# Geography Transformation

In [0]:
import dlt
from pyspark.sql.functions import col, expr, count, current_timestamp, trim
# --- 1. Silver: GEOGRAPHY ---
@dlt.table(name="silver_geography", comment="Cleaned lookup for city coordinates.")
@dlt.expect_or_drop("valid_city_name", "city_name IS NOT NULL")
@dlt.expect_or_fail("valid_coords", "lat IS NOT NULL AND lon IS NOT NULL")
def silver_geography():
    """Build the city lookup keyed by ``geo_id``.

    Reads ``bronze_geographic``, exposes ``city_prepositional`` under the
    name ``geo_id`` and keeps the city name and coordinates alongside it.
    Duplicate ``geo_id`` values are collapsed to a single row.

    Returns:
        A DataFrame with columns ``geo_id``, ``city_name``, ``lat``, ``lon``.
    """
    # city_prepositional is the value other tables join on, hence the rename.
    join_key = col("city_prepositional").alias("geo_id")
    city_lookup = dlt.read("bronze_geographic").select(
        join_key, "city_name", "lat", "lon"
    )
    # NOTE(review): dropDuplicates keeps an arbitrary row per geo_id; if the
    # duplicates can differ in lat/lon, the surviving row is not deterministic.
    return city_lookup.dropDuplicates(["geo_id"])

# Car Content (Enrichment)

In [0]:
# --- 2. Silver: CAR CONTENT ---
@dlt.table(
    name="silver_car_content",
    comment="Enriched car descriptions and photo counts."
)
def silver_car_content():
    """Attach a photo count to every car description.

    Photos from ``bronze_photos`` are aggregated per ``id`` (counting
    non-null ``photo_url`` values) and left-joined onto ``bronze_texts`` so
    that every description row is kept even when the car has no photos.

    Returns:
        A DataFrame with columns ``car_id``, ``description`` and
        ``photo_count``; ``photo_count`` is 0 for cars without any photo rows.
    """
    descriptions = dlt.read("bronze_texts")
    photos_per_car = (
        dlt.read("bronze_photos")
        .groupBy("id")
        .agg(count("photo_url").alias("photo_count"))
    )

    return (
        descriptions
        .join(photos_per_car, on="id", how="left")
        .select(
            col("id").alias("car_id"),
            col("text").alias("description"),
            # The left join yields NULL for cars that have no row in
            # bronze_photos; report an explicit 0 instead of an unknown count.
            expr("coalesce(photo_count, 0)").alias("photo_count"),
        )
    )


# Technical Specs (Normalization)

In [0]:
# --- 3. Silver: TECHNICAL SPECS ---
@dlt.table(
    name="silver_technical_specs",
    comment="Standardized technical details for cars."
)
def silver_technical_specs():
    """Return the distinct technical-spec combinations found in transactions.

    Projects ``bronze_transactions_unified`` down to the spec columns and
    removes exact duplicate rows.

    Returns:
        A DataFrame with columns ``marka``, ``model``, ``engine``,
        ``transmission`` and ``power``, one row per distinct combination.
    """
    spec_columns = ["marka", "model", "engine", "transmission", "power"]
    unified = dlt.read("bronze_transactions_unified")
    return unified.select(*spec_columns).distinct()

# Transaction Orchestration & Quarantine

In [0]:
# Shared validation predicate, written as a Spark SQL boolean expression.
# A row is considered valid only when every condition below is TRUE; the
# clean and quarantine transaction tables both build their filters from it.
valid_rules = " AND ".join([
    "(transaction_id IS NOT NULL)",
    "(cost > 1000.0)",
])

# --- 4a. Silver: TRANSACTIONS (CLEAN DATA) ---
@dlt.table(name="silver_transactions", comment="Clean transactional car sales data.")
@dlt.expect_or_fail("valid_schema", "transaction_id IS NOT NULL")
def silver_transactions():
    """Return the transactions that satisfy the shared validation rules.

    Reads ``bronze_transactions_unified``, renames ``id`` to
    ``transaction_id`` and ``place`` to ``geo_id``, and keeps only rows for
    which ``valid_rules`` evaluates to TRUE.

    Returns:
        A DataFrame with columns ``transaction_id``, ``geo_id``, ``marka``,
        ``model``, ``year``, ``cost``, ``currency``, ``has_license``,
        ``probeg`` and ``load_timestamp``.
    """
    projected = dlt.read("bronze_transactions_unified").select(
        col("id").alias("transaction_id"),
        col("place").alias("geo_id"),
        "marka", "model", "year", "cost", "currency",
        "has_license", "probeg", "load_timestamp",
    )
    # The predicate must be TRUE to pass: rows where it evaluates to FALSE or
    # to NULL (e.g. a NULL cost) are both excluded from the clean table.
    return projected.where(valid_rules)

In [0]:
# --- 4b. QUARANTINE: TRANSACTIONS (FAILED DATA) ---
# NOTE(review): the table comment below mentions "year", but valid_rules only
# checks transaction_id and cost -- confirm whether a year rule is missing.
@dlt.table(
    name="quarantine_transactions",
    comment="Data that failed quality checks for cost, year, or ID."
)
def quarantine_transactions():
    """Return every transaction that does NOT satisfy the shared rules.

    This is the complement of the clean-table filter: a row lands here when
    ``valid_rules`` evaluates to FALSE *or* to NULL. The NULL case matters
    because of SQL three-valued logic -- with a NULL ``cost`` the predicate
    is NULL, and a plain ``NOT (...)`` would also be NULL, silently dropping
    the row from both the clean and the quarantine tables.

    Returns:
        A DataFrame with the same columns as the clean transactions table
        plus ``quarantine_reason``, a short text naming the first failed rule.
    """
    return (
        dlt.read("bronze_transactions_unified")
        .select(
            col("id").alias("transaction_id"),
            col("place").alias("geo_id"),
            "marka", "model", "year", "cost", "currency",
            "has_license", "probeg", "load_timestamp"
        )
        # Treat an unknown (NULL) rule result as a failure so that the clean
        # and quarantine tables together cover every source row.
        .filter(f"NOT coalesce(({valid_rules}), false)")
        .withColumn("quarantine_reason", expr("""
            CASE
                WHEN transaction_id IS NULL THEN 'Missing ID'
                WHEN cost IS NULL OR cost <= 1000.0 THEN 'Price too low or invalid'
            END
        """))
    )

# Catalog Dimension

In [0]:
@dlt.table(
    name="silver_catalogs",
    comment="Cleaned and deduplicated catalog dimension with mandatory quality checks."
)
# Hard requirement: rows missing either part of the (marka, model) key are dropped.
@dlt.expect_or_drop("valid_marka", "marka IS NOT NULL")
@dlt.expect_or_drop("valid_model", "model IS NOT NULL")
# Soft requirement: a missing body_type is only recorded, the row is kept.
@dlt.expect("valid_body_type", "body_type IS NOT NULL")
def silver_catalogs():
    """Build the catalog dimension from ``bronze_catalogs``.

    ``marka`` and ``model`` are whitespace-trimmed so that joins on them are
    not broken by stray padding; the remaining columns are passed through
    unchanged. Rows are then deduplicated on
    (``marka``, ``model``, ``generation``, ``version``).

    Returns:
        A DataFrame with columns ``marka``, ``model``, ``generation``,
        ``version``, ``body_type``, ``source_file`` and ``load_timestamp``.
    """
    dimension_key = ["marka", "model", "generation", "version"]
    passthrough_columns = [
        "generation", "version", "body_type", "source_file", "load_timestamp",
    ]

    catalog = dlt.read("bronze_catalogs").select(
        trim(col("marka")).alias("marka"),
        trim(col("model")).alias("model"),
        *passthrough_columns,
    )
    # One row per dimension key, as expected of a star-schema dimension.
    return catalog.dropDuplicates(dimension_key)