In [0]:
from pyspark.sql import functions as F

# Fully-qualified table locations (catalog.schema.table).
# Every Gold table name is derived from GOLD_SCHEMA, so the schema that gets
# created and the schema the tables are written to cannot drift apart.
CATALOG = "workspace"
SILVER_LISTINGS_TABLE = f"{CATALOG}.airbnb_silver.listings"

GOLD_SCHEMA        = f"{CATALOG}.airbnb_gold"
GOLD_CITY_DAILY    = f"{GOLD_SCHEMA}.city_daily_metrics"
GOLD_ROOM_TYPE     = f"{GOLD_SCHEMA}.room_type_distribution"
GOLD_NEIGHBORHOOD  = f"{GOLD_SCHEMA}.neighborhood_metrics"
GOLD_COMPARISON    = f"{GOLD_SCHEMA}.cross_city_comparison"

# Make sure the Gold schema exists; IF NOT EXISTS keeps re-runs harmless.
spark.sql("CREATE SCHEMA IF NOT EXISTS {}".format(GOLD_SCHEMA))

# The Silver listings table is the input for everything built below.
silver = spark.table(SILVER_LISTINGS_TABLE)

def ensure_cols(df, cols_types):
    """Return ``df`` with every column named in ``cols_types`` present.

    Columns that already exist are left untouched. Each missing column is
    appended as an all-NULL column cast to the requested type.

    Args:
        df: Input DataFrame.
        cols_types: Mapping of column name -> Spark type string (e.g. "double").

    Returns:
        The DataFrame with any missing columns added; ``df`` itself when
        nothing is missing.
    """
    result = df
    for col_name in cols_types:
        if col_name in result.columns:
            continue
        null_col = F.lit(None).cast(cols_types[col_name])
        result = result.withColumn(col_name, null_col)
    return result

# Guarantee that every column the Gold aggregations read exists, so the
# notebook still runs when Silver lacks some of them (missing ones become NULL).
silver = ensure_cols(
    silver,
    dict(
        pk_listing_id="string",
        last_scraped_dt="date",
        last_scraped="string",
        calendar_last_scraped="string",
        price_clean="double",
        review_scores_rating_dbl="double",
        availability_30_int="int",
        room_type="string",
        neighbourhood_cleansed="string",
        host_id="string",
        city="string",
    ),
)

def parse_any_date(col):
    """Build a date expression that tries several text layouts in order.

    The explicit patterns are attempted first, followed by Spark's default
    ``to_date`` conversion; the attempts are chained with ``coalesce`` so the
    first non-NULL result is kept. Because ``MM/dd/yyyy`` is tried before
    ``dd/MM/yyyy``, an ambiguous value such as ``03/04/2024`` resolves
    month-first.

    Args:
        col: Column holding the raw date text.

    Returns:
        Column expression yielding the first successfully parsed date.

    NOTE(review): this relies on a failed parse producing NULL rather than an
    error; with ANSI mode enabled Spark may raise instead -- confirm the
    cluster configuration.
    """
    patterns = ("yyyy-MM-dd", "MM/dd/yyyy", "dd/MM/yyyy", "yyyy/MM/dd")
    attempts = [F.to_date(F.to_timestamp(col, pattern)) for pattern in patterns]
    attempts.append(F.to_date(col))

    parsed = attempts[0]
    for attempt in attempts[1:]:
        parsed = F.coalesce(parsed, attempt)
    return parsed

# Best available scrape date per row: the typed column wins, then the two raw
# string columns are parsed as fallbacks.
scrape_dt_raw = F.coalesce(
    F.col("last_scraped_dt"),
    parse_any_date(F.col("last_scraped")),
    parse_any_date(F.col("calendar_last_scraped")),
)

silver = silver.withColumn("scrape_dt_raw", scrape_dt_raw)

# Latest scrape date seen for each city. A city with no usable date at all
# falls back to today's date so that it still receives a non-NULL date.
latest_per_city = (
    silver.groupBy("city")
          .agg(F.max("scrape_dt_raw").alias("scrape_dt_latest"))
          .withColumn(
              "scrape_dt_latest",
              F.coalesce(F.col("scrape_dt_latest"), F.current_date())
          )
)

# Pin every listing to its city's latest date.
# The match is NULL-safe (<=>): a plain equi-join on "city" never matches NULL
# keys, so rows with a NULL city (which ensure_cols can produce when the column
# is absent) would end up with a NULL date despite the fallback above.
# The lookup key is renamed so both sides can be referenced by name.
silver_fixed = (
    silver.join(
              latest_per_city.withColumnRenamed("city", "_latest_city_key"),
              F.col("city").eqNullSafe(F.col("_latest_city_key")),
              "left",
          )
          .drop("_latest_city_key")
          .withColumn("scrape_dt_fixed", F.col("scrape_dt_latest").cast("date"))
)

# City Daily Metrics
# One row per (city, pinned scrape date): listing volume, average price,
# average review score, estimated occupancy and distinct host count.
city_daily_metrics = (
    silver_fixed
    .groupBy("city", "scrape_dt_fixed")
    .agg(
        F.countDistinct(F.col("pk_listing_id")).alias("total_listings"),
        F.round(F.avg(F.col("price_clean")), 2).alias("avg_price_per_night"),
        F.round(F.avg(F.col("review_scores_rating_dbl")), 2).alias("avg_review_score"),
        # Occupancy proxy: 1 - availability_30_int / 30, averaged over listings.
        F.round(
            F.avg(F.lit(1) - F.col("availability_30_int") / F.lit(30.0)),
            4,
        ).alias("occupancy_rate_estimate"),
        F.countDistinct(F.col("host_id")).alias("unique_hosts"),
    )
    # Expose the pinned date under its final column name.
    .withColumnRenamed("scrape_dt_fixed", "scrape_dt")
)

# Replace the Gold table (and its schema) with the freshly computed metrics.
city_daily_metrics.write.saveAsTable(
    GOLD_CITY_DAILY,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Room Type Distribution
# Distinct listings per (city, date, room type).
room_type_dist = (
    silver_fixed
    .groupBy("city", "scrape_dt_fixed", "room_type")
    .agg(F.countDistinct(F.col("pk_listing_id")).alias("cnt"))
    .withColumnRenamed("scrape_dt_fixed", "scrape_dt")
)

# Denominator: sum of the per-room-type counts for each (city, date).
room_type_total = room_type_dist.groupBy("city", "scrape_dt").agg(
    F.sum(F.col("cnt")).alias("total_city_cnt")
)

# Attach the denominator and compute each room type's share; the share is
# left NULL when the total is missing or not positive.
room_type_final = room_type_dist.join(
    room_type_total, on=["city", "scrape_dt"], how="left"
).withColumn(
    "share_of_city_listings",
    F.when(
        F.col("total_city_cnt") > 0,
        F.col("cnt") / F.col("total_city_cnt"),
    ).otherwise(F.lit(None)),
)

room_type_final.write.saveAsTable(
    GOLD_ROOM_TYPE,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Neighborhood Metrics
# Listing count, average price and estimated occupancy for each
# (city, date, neighbourhood).
neighborhood_metrics = (
    silver_fixed
    .groupBy("city", "scrape_dt_fixed", "neighbourhood_cleansed")
    .agg(
        F.countDistinct(F.col("pk_listing_id")).alias("listings_count"),
        F.round(F.avg(F.col("price_clean")), 2).alias("avg_price"),
        # Same occupancy proxy as the city-level table.
        F.round(
            F.avg(F.lit(1) - F.col("availability_30_int") / F.lit(30.0)),
            4,
        ).alias("occupancy_rate_estimate"),
    )
    .withColumnRenamed("scrape_dt_fixed", "scrape_dt")
)

neighborhood_metrics.write.saveAsTable(
    GOLD_NEIGHBORHOOD,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Cross-City Comparison
# Persists the same DataFrame as the city daily metrics under a second table name.
city_daily_metrics.write.saveAsTable(
    GOLD_COMPARISON,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Checks
# Show which date each city was pinned to.
print("\nLatest date per city used (scrape_dt_latest):")
display(latest_per_city.sort("city"))

# Distinct listings in Silver next to the largest total_listings written to Gold.
print("\nCompare Silver unique listings vs Gold total_listings at pinned date:")
silver_counts = (
    silver
    .groupBy("city")
    .agg(F.countDistinct(F.col("pk_listing_id")).alias("silver_unique_listings"))
)
gold_counts = (
    spark.table(GOLD_CITY_DAILY)
    .groupBy("city")
    .agg(F.max(F.col("total_listings")).alias("gold_total_listings"))
)
display(
    silver_counts
    .join(gold_counts, on="city", how="left")
    .sort("city")
)


In [0]:
from pyspark.sql import functions as F

# Per-city headline figures read back from the Gold city table.
gold_city_daily = spark.table("workspace.airbnb_gold.city_daily_metrics")

display(
    gold_city_daily.groupBy("city").agg(
        F.max(F.col("total_listings")).alias("total_listings"),
        F.avg(F.col("avg_price_per_night")).alias("avg_price_per_night"),
        F.avg(F.col("occupancy_rate_estimate")).alias("avg_occupancy_rate"),
        F.avg(F.col("avg_review_score")).alias("avg_review_score"),
        F.max(F.col("unique_hosts")).alias("host_count"),
    ).sort("city")
)


In [0]:
# Average nightly price per city and calendar month (yyyy-MM of scrape_dt).
display(
    gold_city_daily
    .withColumn("month", F.date_format(F.col("scrape_dt"), "yyyy-MM"))
    .groupBy("city", "month")
    .agg(F.avg(F.col("avg_price_per_night")).alias("avg_monthly_price"))
    .sort("city", "month")
)


In [0]:
# Average share of listings per room type, largest share first within each city.
gold_room_type = spark.table("workspace.airbnb_gold.room_type_distribution")

display(
    gold_room_type
    .groupBy("city", "room_type")
    .agg(F.avg(F.col("share_of_city_listings")).alias("avg_share"))
    .orderBy(F.col("city").asc(), F.col("avg_share").desc())
)


In [0]:
# Average estimated occupancy per city and calendar month (yyyy-MM of scrape_dt).
display(
    gold_city_daily
    .withColumn("month", F.date_format(F.col("scrape_dt"), "yyyy-MM"))
    .groupBy("city", "month")
    .agg(F.avg(F.col("occupancy_rate_estimate")).alias("avg_monthly_occupancy"))
    .sort("city", "month")
)


In [0]:
# Average price and occupancy per neighbourhood, most expensive first within each city.
gold_neighborhood = spark.table("workspace.airbnb_gold.neighborhood_metrics")

display(
    gold_neighborhood
    .groupBy("city", "neighbourhood_cleansed")
    .agg(
        F.avg(F.col("avg_price")).alias("avg_price"),
        F.avg(F.col("occupancy_rate_estimate")).alias("avg_occupancy"),
    )
    .orderBy(F.col("city").asc(), F.col("avg_price").desc())
)


In [0]:
# Rating distribution per city: 10th / 50th / 90th percentiles plus the mean,
# computed directly on the Silver listings.
silver_listings = spark.table("workspace.airbnb_silver.listings")

display(
    silver_listings.groupBy("city").agg(
        F.expr(
            "percentile(review_scores_rating_dbl, array(0.1,0.5,0.9))"
        ).alias("rating_percentiles"),
        F.avg(F.col("review_scores_rating_dbl")).alias("avg_rating"),
    )
)


In [0]:
# Per-city summary read from the cross-city comparison table.
cross = spark.table("workspace.airbnb_gold.cross_city_comparison")

display(
    cross.groupBy("city").agg(
        F.max(F.col("total_listings")).alias("total_listings"),
        F.avg(F.col("avg_price_per_night")).alias("avg_price_per_night"),
        F.avg(F.col("avg_review_score")).alias("avg_review_score"),
        F.avg(F.col("occupancy_rate_estimate")).alias("avg_occupancy_rate"),
        F.max(F.col("unique_hosts")).alias("host_count"),
    ).sort("city")
)


In [0]:
# Per-city summary collected to the driver as a pandas DataFrame.
metrics = gold_city_daily.groupBy("city").agg(
    F.max(F.col("total_listings")).alias("total_listings"),
    F.avg(F.col("avg_price_per_night")).alias("avg_price"),
    F.avg(F.col("occupancy_rate_estimate")).alias("avg_occupancy"),
    F.avg(F.col("avg_review_score")).alias("avg_rating"),
    F.max(F.col("unique_hosts")).alias("host_count"),
).sort("city")

metrics_pd = metrics.toPandas()
# Last expression of the cell, so the frame is rendered as the cell output.
metrics_pd


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

gold_neighborhood = spark.table("workspace.airbnb_gold.neighborhood_metrics")

# Ten most expensive neighbourhoods per city.
top_neighborhoods = (
    gold_neighborhood
    # Rank neighbourhoods by avg_price within each city. The neighbourhood
    # name is a tie-breaker: row_number() alone picks an arbitrary order among
    # equal prices, so the top 10 could change from one run to the next.
    .withColumn(
        "rank_price",
        F.row_number().over(
            Window.partitionBy("city").orderBy(
                F.col("avg_price").desc(),
                F.col("neighbourhood_cleansed").asc(),
            )
        )
    )
    .where(F.col("rank_price") <= 10)
    # Sort while rank_price is still a column, then project it away; sorting
    # after the select would reference a column the select has removed.
    .orderBy("city", "rank_price")
    .select(
        "city",
        "neighbourhood_cleansed",
        "avg_price",
        "occupancy_rate_estimate",
        "listings_count",
        "scrape_dt"
    )
)

display(top_neighborhoods)


In [0]:
from pyspark.sql import functions as F

# Presentation-friendly copy of the city metrics: rounded values and the
# column names used in the overview (avg_occupancy_rate, host_count).
city_overview = spark.table("workspace.airbnb_gold.city_daily_metrics").select(
    F.col("city"),
    F.col("scrape_dt"),
    F.col("total_listings"),
    F.round(F.col("avg_price_per_night"), 2).alias("avg_price_per_night"),
    F.round(F.col("occupancy_rate_estimate"), 3).alias("avg_occupancy_rate"),
    F.round(F.col("avg_review_score"), 2).alias("avg_review_score"),
    F.col("unique_hosts").alias("host_count"),
)

# Replace the overview table (and its schema) on every run.
city_overview.write.saveAsTable(
    "workspace.airbnb_gold.city_overview",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)
