In [0]:
import dlt
from pyspark.sql.functions import col, count, avg, round, sum, max, min, desc

In [0]:
@dlt.table(
    name="gold_sales_by_geography",
    comment="Aggregate KPIs for car sales volume and value by city.",
    table_properties={"pipelines.autoOptimize.zOrderCols": "city_name"},
)
def gold_sales_by_geography():
    """Build per-city listing KPIs from transactions joined to geography.

    Reads ``fact_transactions`` and ``dim_geography``, inner-joins them on
    ``geo_id`` and aggregates by ``city_name``.

    Returns:
        A DataFrame with one row per ``city_name`` containing:
        ``total_listings`` (count of ``transaction_id``),
        ``avg_car_price`` (mean ``cost`` rounded to 2 decimals),
        ``total_market_value`` (sum of ``cost``) and
        ``avg_mileage`` (mean ``probeg`` rounded to 0 decimals).
    """
    transactions = dlt.read("fact_transactions")
    geography = dlt.read("dim_geography")

    # Inner join: transactions whose geo_id has no match in the dimension
    # are excluded from every KPI below.
    located = transactions.join(geography, "geo_id", "inner")

    # NOTE: round/sum here are the pyspark.sql.functions versions imported at
    # the top of the file, not the Python builtins.
    city_kpis = [
        count("transaction_id").alias("total_listings"),
        round(avg("cost"), 2).alias("avg_car_price"),
        sum("cost").alias("total_market_value"),
        round(avg("probeg"), 0).alias("avg_mileage"),
    ]
    return located.groupBy("city_name").agg(*city_kpis)

In [0]:
@dlt.table(
    name="gold_content_impact_analysis",
    comment="KPIs showing how photo counts and descriptions relate to pricing.",
    partition_cols=["marka"],
)
def gold_content_impact_analysis():
    """Aggregate listing price and photo KPIs by ``marka`` and photo presence.

    Reads ``fact_transactions`` and ``dim_car_content``, inner-joins them where
    the fact's ``transaction_id`` equals the content's ``car_id``, derives a
    boolean ``has_photos`` flag and aggregates by (``marka``, ``has_photos``).

    Returns:
        A DataFrame with one row per (``marka``, ``has_photos``) containing:
        ``listing_count`` (count of ``transaction_id``),
        ``avg_listing_price`` (mean ``cost`` rounded to 2 decimals) and
        ``avg_photo_qty`` (mean ``photo_count``, unrounded).
    """
    listings = dlt.read("fact_transactions")
    content = dlt.read("dim_car_content")

    # The key columns are named differently on each side, so the join uses an
    # explicit equality condition rather than a shared column name.
    same_car = listings["transaction_id"] == content["car_id"]
    enriched = listings.join(content, same_car, "inner")

    # NOTE(review): a NULL photo_count makes this comparison NULL, so such
    # rows end up in a separate has_photos = NULL group -- confirm intended.
    flagged = enriched.withColumn("has_photos", col("photo_count") > 0)

    content_kpis = [
        count("transaction_id").alias("listing_count"),
        round(avg("cost"), 2).alias("avg_listing_price"),
        avg("photo_count").alias("avg_photo_qty"),
    ]
    return flagged.groupBy("marka", "has_photos").agg(*content_kpis)

In [0]:
@dlt.table(
    name="gold_technical_valuation",
    comment="Valuation metrics categorized by engine and power specs.",
    table_properties={"pipelines.autoOptimize.zOrderCols": "engine"},
)
def gold_technical_valuation():
    """Aggregate price and power KPIs by engine and transmission.

    Reads ``fact_transactions`` and ``dim_technical_specs``, inner-joins them
    on the (``marka``, ``model``) pair, aggregates by
    (``engine``, ``transmission``) and keeps only groups with more than five
    listings.

    Returns:
        A DataFrame with one row per retained (``engine``, ``transmission``)
        containing: ``vol_listings`` (count of ``transaction_id``),
        ``avg_horsepower`` (mean ``power`` rounded to 0 decimals) and
        ``avg_market_price`` (mean ``cost`` rounded to 2 decimals).
    """
    transactions = dlt.read("fact_transactions")
    tech_specs = dlt.read("dim_technical_specs")

    with_specs = transactions.join(tech_specs, on=["marka", "model"], how="inner")

    valuation_kpis = [
        count("transaction_id").alias("vol_listings"),
        round(avg("power"), 0).alias("avg_horsepower"),
        round(avg("cost"), 2).alias("avg_market_price"),
    ]
    per_spec = with_specs.groupBy("engine", "transmission").agg(*valuation_kpis)

    # Applied after aggregation: drops groups with five or fewer listings.
    return per_spec.filter("vol_listings > 5")

In [0]:
@dlt.table(
    name="gold_sales_liquid_benchmark",
    comment="Liquid Clustering challenger table for performance benchmarking.",
    cluster_by=["city_name"],  # clustering key passed via the decorator argument
)
def gold_sales_liquid_benchmark():
    """Build per-city listing KPIs for the clustering benchmark table.

    Reads ``fact_transactions`` and ``dim_geography``, inner-joins them on
    ``geo_id`` and aggregates by ``city_name``. The table is declared with
    ``cluster_by=["city_name"]`` instead of table properties.

    Returns:
        A DataFrame with one row per ``city_name`` containing:
        ``total_listings`` (count of ``transaction_id``),
        ``avg_car_price`` (mean ``cost`` rounded to 2 decimals),
        ``total_market_value`` (sum of ``cost``) and
        ``avg_mileage`` (mean ``probeg`` rounded to 0 decimals).
    """
    fact_rows = dlt.read("fact_transactions")
    geo_rows = dlt.read("dim_geography")

    joined = fact_rows.join(geo_rows, "geo_id", "inner")
    by_city = joined.groupBy("city_name")

    listing_total = count("transaction_id").alias("total_listings")
    mean_price = round(avg("cost"), 2).alias("avg_car_price")
    market_value = sum("cost").alias("total_market_value")
    # The mileage figure is computed from the 'probeg' column.
    mean_mileage = round(avg("probeg"), 0).alias("avg_mileage")

    return by_city.agg(listing_total, mean_price, market_value, mean_mileage)