In [0]:
# 03_gold_aggregate

from pyspark.sql import Window
from pyspark.sql import functions as F

# ---- Table locations, written as catalog.schema.table ----
CATALOG = "workspace"

# Input table for the aggregates in this notebook.
SILVER_LISTINGS_TABLE = f"{CATALOG}.airbnb_silver.listings"

# Output tables, all in the airbnb_gold schema.
GOLD_CITY_DAILY = f"{CATALOG}.airbnb_gold.city_daily_metrics"
GOLD_ROOM_TYPE = f"{CATALOG}.airbnb_gold.room_type_distribution"
GOLD_NEIGHBORHOOD = f"{CATALOG}.airbnb_gold.neighborhood_metrics"
GOLD_COMPARISON = f"{CATALOG}.airbnb_gold.cross_city_comparison"

# Create the gold schema if it is missing, before any table is saved into it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.airbnb_gold")

# Shared DataFrame handle on the silver listings table.
silver_listings_all = spark.table(SILVER_LISTINGS_TABLE)

# City-level metrics, one row per (city, last_scraped_dt).
# Built from the shared `silver_listings_all` handle, like the room-type and
# neighbourhood aggregates, rather than opening the silver table a second time.
city_daily_metrics = (
    silver_listings_all
    .groupBy("city", "last_scraped_dt")
    .agg(
        F.countDistinct("listing_id").alias("total_listings"),
        F.avg("price_clean").alias("avg_price_per_night"),
        F.avg("review_scores_rating_dbl").alias("avg_review_score"),
        # Occupancy proxy: 1 - availability_30_int / 30.
        # NOTE(review): presumably availability_30_int is the number of bookable
        # days in the next 30 - confirm against the silver schema.
        F.avg(
            1 - (F.col("availability_30_int") / F.lit(30.0))
        ).alias("occupancy_rate_estimate"),
        F.countDistinct("host_id").alias("unique_hosts"),
    )
)

# Full refresh of the gold table; overwriteSchema lets the stored schema be
# replaced together with the data.
(
    city_daily_metrics
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_CITY_DAILY)
)

# Distinct listings per room type within each (city, scrape date).
room_type_dist = (
    silver_listings_all
    .groupBy("city", "last_scraped_dt", "room_type")
    .agg(F.countDistinct("listing_id").alias("cnt"))
)

# Attach the per-(city, scrape date) total with a window sum instead of
# aggregating again and self-joining. An inner equi-join never matches NULL
# keys, so groups with a NULL city or scrape date used to vanish from this
# table while the other gold tables kept them; the window keeps every row
# and avoids the extra join.
city_snapshot_window = Window.partitionBy("city", "last_scraped_dt")

room_type_final = (
    room_type_dist
    .withColumn("total_city_cnt", F.sum("cnt").over(city_snapshot_window))
    .withColumn(
        "share_of_city_listings",
        F.col("cnt") / F.col("total_city_cnt"),
    )
)

# Full refresh of the room-type distribution table.
(
    room_type_final
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_ROOM_TYPE)
)

# Neighbourhood-level metrics, one row per
# (city, last_scraped_dt, neighbourhood_cleansed).
neighborhood_group_cols = ["city", "last_scraped_dt", "neighbourhood_cleansed"]

# Per-listing occupancy proxy, averaged inside each group below.
neighborhood_occupancy = 1 - (F.col("availability_30_int") / F.lit(30.0))

neighborhood_aggs = [
    F.countDistinct("listing_id").alias("listings_count"),
    F.avg("price_clean").alias("avg_price"),
    F.avg(neighborhood_occupancy).alias("occupancy_rate_estimate"),
]

neighborhood_metrics = (
    silver_listings_all
    .groupBy(*neighborhood_group_cols)
    .agg(*neighborhood_aggs)
)

# Full refresh of the neighbourhood gold table.
neighborhood_writer = (
    neighborhood_metrics.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
neighborhood_writer.saveAsTable(GOLD_NEIGHBORHOOD)

# Cross-city comparison table: same rows as the city/day metrics.
# Read them back from the gold table written earlier in this cell rather than
# writing the lazy `city_daily_metrics` plan again. A second write of that plan
# would re-run the whole silver aggregation, and the two gold tables could
# disagree if silver changed between the two writes.
(
    spark.table(GOLD_CITY_DAILY)
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_COMPARISON)
)

# Quick check on the table just written: largest listing count seen per city.
max_listings_by_city = (
    spark.table(GOLD_CITY_DAILY)
    .groupBy("city")
    .agg(F.max("total_listings").alias("max_listings_seen"))
    .orderBy("city")
)
display(max_listings_by_city)


In [0]:
from pyspark.sql import functions as F

gold_city_daily = spark.table("workspace.airbnb_gold.city_daily_metrics")

# One summary row per city across all scrape dates: max for the count columns,
# plain mean of the daily values for the averaged metrics.
city_summary_aggs = [
    F.max("total_listings").alias("total_listings"),
    F.avg("avg_price_per_night").alias("avg_price_per_night"),
    F.avg("occupancy_rate_estimate").alias("avg_occupancy_rate"),
    F.avg("avg_review_score").alias("avg_review_score"),
    F.max("unique_hosts").alias("host_count"),
]

city_summary = gold_city_daily.groupBy("city").agg(*city_summary_aggs).orderBy("city")
display(city_summary)


In [0]:
# Mean of the daily average price per city and scrape month ("yyyy-MM").
price_month = F.date_format("last_scraped_dt", "yyyy-MM")

monthly_price = (
    gold_city_daily
    .withColumn("month", price_month)
    .groupBy("city", "month")
    .agg(F.avg("avg_price_per_night").alias("avg_monthly_price"))
    .orderBy("city", "month")
)
display(monthly_price)


In [0]:
gold_room_type = spark.table("workspace.airbnb_gold.room_type_distribution")

# Mean share of each room type per city, largest share first within a city.
room_type_share = (
    gold_room_type
    .groupBy("city", "room_type")
    .agg(F.avg("share_of_city_listings").alias("avg_share"))
    .orderBy(F.col("city").asc(), F.col("avg_share").desc())
)
display(room_type_share)


In [0]:
# Mean of the daily occupancy estimate per city and scrape month ("yyyy-MM").
occupancy_month = F.date_format("last_scraped_dt", "yyyy-MM")

monthly_occupancy = (
    gold_city_daily
    .withColumn("month", occupancy_month)
    .groupBy("city", "month")
    .agg(F.avg("occupancy_rate_estimate").alias("avg_monthly_occupancy"))
    .orderBy("city", "month")
)
display(monthly_occupancy)


In [0]:
gold_neighborhood = spark.table("workspace.airbnb_gold.neighborhood_metrics")

# Per-neighbourhood means across scrape dates, priciest first within each city.
neighborhood_summary_aggs = [
    F.avg("avg_price").alias("avg_price"),
    F.avg("occupancy_rate_estimate").alias("avg_occupancy"),
]

neighborhood_summary = (
    gold_neighborhood
    .groupBy("city", "neighbourhood_cleansed")
    .agg(*neighborhood_summary_aggs)
    .orderBy(F.col("city").asc(), F.col("avg_price").desc())
)
display(neighborhood_summary)


In [0]:
silver_listings = spark.table("workspace.airbnb_silver.listings")

# Rating distribution per city: exact 10th/50th/90th percentiles plus the mean,
# computed on the listing-level silver data.
# Sorted by city like every other report cell, so the row order is the same
# from run to run instead of following the shuffle.
display(
    silver_listings
    .groupBy("city")
    .agg(
        F.expr("percentile(review_scores_rating_dbl, array(0.1,0.5,0.9))")
         .alias("rating_percentiles"),
        F.avg("review_scores_rating_dbl").alias("avg_rating")
    )
    .orderBy("city")
)


In [0]:
cross = spark.table("workspace.airbnb_gold.cross_city_comparison")

# One comparison row per city, collapsed over all scrape dates.
cross_city_aggs = [
    F.max("total_listings").alias("total_listings"),
    F.avg("avg_price_per_night").alias("avg_price_per_night"),
    F.avg("avg_review_score").alias("avg_review_score"),
    F.avg("occupancy_rate_estimate").alias("avg_occupancy_rate"),
    F.max("unique_hosts").alias("host_count"),
]

cross_city_summary = cross.groupBy("city").agg(*cross_city_aggs).orderBy("city")
display(cross_city_summary)


In [0]:
# Per-city headline metrics, then pulled to the driver as a pandas DataFrame.
headline_aggs = [
    F.max("total_listings").alias("total_listings"),
    F.avg("avg_price_per_night").alias("avg_price"),
    F.avg("occupancy_rate_estimate").alias("avg_occupancy"),
    F.avg("avg_review_score").alias("avg_rating"),
    F.max("unique_hosts").alias("host_count"),
]

metrics = gold_city_daily.groupBy("city").agg(*headline_aggs).orderBy("city")

# Collect the per-city rows to pandas; the bare name renders the frame.
metrics_pd = metrics.toPandas()
metrics_pd


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

gold_neighborhood = spark.table("workspace.airbnb_gold.neighborhood_metrics")

# Rank rows by avg_price within each city, highest first. The extra sort keys
# break ties, so row_number() - and with it the <= 10 cut - is reproducible
# from run to run.
# NOTE(review): neighborhood_metrics holds one row per
# (city, last_scraped_dt, neighbourhood), so a neighbourhood scraped on several
# dates can take more than one of the ten slots. Confirm whether the ranking
# should be per scrape date or on values averaged across dates.
price_rank_window = Window.partitionBy("city").orderBy(
    F.col("avg_price").desc(),
    F.col("neighbourhood_cleansed").asc(),
    F.col("last_scraped_dt").asc(),
)

top_neighborhoods = (
    gold_neighborhood
    .withColumn("rank_price", F.row_number().over(price_rank_window))
    .where(F.col("rank_price") <= 10)
    # Sort while rank_price is still a column, then project it away; the
    # previous order (select, then orderBy on a dropped column) only worked
    # because the analyzer re-adds missing sort columns behind the scenes.
    .orderBy("city", "rank_price")
    .select(
        "city",
        "neighbourhood_cleansed",
        "avg_price",
        "occupancy_rate_estimate",
        "listings_count",
        "last_scraped_dt"
    )
)

display(top_neighborhoods)


In [0]:
from pyspark.sql import functions as F

# Presentation copy of the city/day metrics: same rows, rounded values and
# renamed occupancy/host columns.
city_overview_columns = [
    F.col("city"),
    F.col("last_scraped_dt"),
    F.col("total_listings"),
    F.round("avg_price_per_night", 2).alias("avg_price_per_night"),
    F.round("occupancy_rate_estimate", 3).alias("avg_occupancy_rate"),
    F.round("avg_review_score", 2).alias("avg_review_score"),
    F.col("unique_hosts").alias("host_count"),
]

city_overview = (
    spark.table("workspace.airbnb_gold.city_daily_metrics")
    .select(*city_overview_columns)
)

# Full refresh of the overview table.
city_overview_writer = (
    city_overview.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
city_overview_writer.saveAsTable("workspace.airbnb_gold.city_overview")
