In [0]:
# Create the bronze, silver and gold schemas in the `workspace` catalog if they are missing.
for layer_name in ("bronze", "silver", "gold"):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS workspace.airbnb_{layer_name}")


In [0]:
# Number of raw Bronze listing rows per city.
bronze_rows_per_city = (
    spark.table("workspace.airbnb_bronze.listings_raw").groupBy("city").count()
)
display(bronze_rows_per_city)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Fully qualified table names used by the Silver build.
CATALOG = "workspace"
BRONZE_TABLE = f"{CATALOG}.airbnb_bronze.listings_raw"
SILVER_LISTINGS_TABLE = f"{CATALOG}.airbnb_silver.listings"
SILVER_HOSTS_TABLE = f"{CATALOG}.airbnb_silver.hosts"

# The city to build comes from the "city" widget; if reading it fails,
# the widget is created with the default "Paris" and read again.
try:
    city_name = dbutils.widgets.get("city")
except Exception:
    dbutils.widgets.text("city", "Paris")
    city_name = dbutils.widgets.get("city")
print(f"[silver] Build for city = {city_name}")

# Restrict Bronze to the requested city and stop early when nothing matches.
bronze_city = spark.table(BRONZE_TABLE).where(F.col("city") == city_name)
bronze_has_rows = bronze_city.limit(1).count() > 0
if not bronze_has_rows:
    raise ValueError(f"No Bronze rows for city='{city_name}'.")

# Cleaning helper: string columns that hold an empty list (e.g. "[]" or "['']") are converted to NULL.
def clean_empty_list_strings(df_in, cols):
    """Replace "empty" values in the given string columns with NULL.

    A value is treated as empty when it is NULL, when its trimmed text is
    "", "[]" or "['']", or when its untrimmed length is at most 4 characters.

    Args:
        df_in: Input DataFrame.
        cols: Column names to clean; names not present in ``df_in`` are skipped.

    Returns:
        A DataFrame with the same columns, where empty values in ``cols``
        have been replaced by NULL.
    """
    def _nullify_empty(col_name):
        # Build the NULL-or-original expression for one column.
        raw = F.col(col_name)
        trimmed = F.trim(raw)
        looks_empty = (
            raw.isNull()
            | (trimmed == "")
            | (trimmed == "[]")
            | (trimmed == "['']")
            | (F.length(raw) <= 4)
        )
        return F.when(looks_empty, F.lit(None)).otherwise(raw)

    cleaned = df_in
    for col_name in (c for c in cols if c in df_in.columns):
        cleaned = cleaned.withColumn(col_name, _nullify_empty(col_name))
    return cleaned

# Null out empty list strings, then derive the string primary key from `id`.
df = (
    clean_empty_list_strings(bronze_city, ["host_verifications", "amenities"])
    .withColumn("pk_listing_id", F.col("id").cast("string"))
)

# Bronze rows without an ingestion timestamp get the current timestamp instead.
has_ingestion_ts = "ingestion_timestamp" in df.columns
if not has_ingestion_ts:
    df = df.withColumn("ingestion_timestamp", F.current_timestamp())

# First integer or decimal number found in a string (capture group 1).
num_regex = r'(\d+(\.\d+)?)'

# Getting bathrooms count per listing
if "bathrooms_text" in df.columns:
    bathrooms_txt = F.col("bathrooms_text")
    null_double = F.lit(None).cast("double")
    # "half bath", "halfbath" and the hyphenated "half-bath" all count as 0.5.
    # The hyphen must be allowed explicitly: `half\s*bath` alone does not match
    # "Half-bath", and such text has no digit to fall back on, so it became NULL.
    # NOTE(review): hyphenated spelling assumed from Airbnb exports; confirm in Bronze.
    half_bath_regex = r"(?i)(half[\s-]*bath|shared\s*half)"
    df = df.withColumn(
        "bathrooms_clean",
        F.when(bathrooms_txt.rlike(half_bath_regex), F.lit(0.5))
        .when(bathrooms_txt.isNull() | (F.trim(bathrooms_txt) == ""), null_double)
        .when(
            bathrooms_txt.rlike(num_regex),
            F.regexp_extract("bathrooms_text", num_regex, 1).cast("double")
        )
        .otherwise(null_double)
    )
else:
    # No text column: fall back to a numeric `bathrooms` column when present, else NULL.
    df = df.withColumn(
        "bathrooms_clean",
        F.col("bathrooms").cast("double") if "bathrooms" in df.columns else F.lit(None).cast("double")
    )

# price_clean: parse the textual price into a double.
if "price" in df.columns:
    # A comma followed by exactly three digits is a thousands separator and is
    # removed. Turning every comma into a dot made "$1,250.00" -> "1.250.00",
    # which was then parsed as 1.25.
    price_txt = F.regexp_replace(F.col("price"), r",(?=\d{3}(?!\d))", "")
    # Any comma left is taken as a decimal separator ("85,50" -> "85.50").
    price_txt = F.regexp_replace(price_txt, ",", ".")
    # Strip currency symbols and anything else that is not a digit or a dot.
    price_txt = F.regexp_replace(price_txt, r"[^0-9\.]", "")
    df = df.withColumn("price_number_str", F.regexp_extract(price_txt, num_regex, 1))
    # An empty extraction casts to NULL.
    df = df.withColumn("price_clean", F.col("price_number_str").cast("double"))
else:
    df = df.withColumn("price_clean", F.lit(None).cast("double"))

# Dates: parse the scrape date and the host sign-up date.
df = (
    df.withColumn("last_scraped_dt", F.to_date("last_scraped"))
      .withColumn("host_since_dt", F.to_date("host_since"))
)

# Numerics: integer copies of whichever availability columns are present.
availability_cols = ("availability_30", "availability_60", "availability_90", "availability_365")
for availability_col in [name for name in availability_cols if name in df.columns]:
    df = df.withColumn(f"{availability_col}_int", F.col(availability_col).cast("int"))

if "review_scores_rating" in df.columns:
    df = df.withColumn("review_scores_rating_dbl", F.col("review_scores_rating").cast("double"))

# Geo: temporary double coordinates; a missing source column yields NULL.
if "latitude" in df.columns:
    lat_expr = F.col("latitude").cast("double")
else:
    lat_expr = F.lit(None).cast("double")
df = df.withColumn("lat_tmp", lat_expr)

if "longitude" in df.columns:
    lon_expr = F.col("longitude").cast("double")
else:
    lon_expr = F.lit(None).cast("double")
df = df.withColumn("lon_tmp", lon_expr)

# Keep one row per listing: newest scrape date first, ties broken by newest
# ingestion timestamp, NULLs sorted last in both cases.
recency_window = Window.partitionBy("pk_listing_id").orderBy(
    F.col("last_scraped_dt").desc_nulls_last(),
    F.col("ingestion_timestamp").desc_nulls_last(),
)
latest = (
    df.withColumn("rn", F.row_number().over(recency_window))
      .where(F.col("rn") == 1)
      .drop("rn")
)

# Quality filters. Each filter is applied only when at least one row carries
# the value it checks, so a city without prices/coordinates is not emptied.
price_present = F.col("price_clean").isNotNull()
coords_present = F.col("lat_tmp").isNotNull() & F.col("lon_tmp").isNotNull()

stage = latest
any_price = stage.where(price_present).limit(1).count() > 0
if any_price:
    # Keep prices strictly between 0 and 5000; rows with a NULL price drop out too.
    stage = stage.where((F.col("price_clean") > 0) & (F.col("price_clean") < 5000))
any_coords = stage.where(coords_present).limit(1).count() > 0
if any_coords:
    stage = stage.where(coords_present)

# Helper columns are not part of the Silver output.
final_listings = stage.drop("price_number_str", "lat_tmp", "lon_tmp")

# One row per (city, host_id), built from the filtered listings.
# Stays None when the listings carry no `host_id` column.
hosts_city = None
if "host_id" in final_listings.columns:
    host_aggregates = [
        F.min("host_since_dt").alias("host_since_dt"),
        F.countDistinct("pk_listing_id").alias("listings_count"),
        F.first("host_name", ignorenulls=True).alias("host_name"),
        F.first("host_is_superhost", ignorenulls=True).alias("host_is_superhost"),
    ]
    listings_with_host = final_listings.filter(F.col("host_id").isNotNull())
    hosts_per_city = listings_with_host.groupBy("city", "host_id").agg(*host_aggregates)
    hosts_city = hosts_per_city.withColumn("host_since_year", F.year("host_since_dt"))

# Expose the results as temp views for the SQL cells that follow.
final_listings.createOrReplaceTempView("silver_this_city_listings")
if hosts_city is not None:
    hosts_city.createOrReplaceTempView("silver_this_city_hosts")


In [0]:
def _ensure_partitioned_delta_table(table_name, seed_view, partition_col="city"):
    """Make sure ``table_name`` exists as a Delta table partitioned by ``partition_col``.

    If the table does not exist, it is created empty with the schema of
    ``seed_view`` (CTAS with ``WHERE 1=0``). If it exists but is not
    partitioned by ``partition_col``, it is rebuilt in place from its own
    rows with the partitioning applied.

    Args:
        table_name: Fully qualified name of the target table.
        seed_view: Table or view whose schema seeds a newly created table.
        partition_col: Column the table must be partitioned by.
    """
    if not spark.catalog.tableExists(table_name):
        spark.sql(f"""
            CREATE TABLE {table_name}
            USING DELTA
            PARTITIONED BY ({partition_col})
            AS SELECT * FROM {seed_view} WHERE 1=0
        """)
        return

    # Read partitioning from DESCRIBE DETAIL's `partitionColumns` field.
    # Matching "Partition Columns" in DESCRIBE TABLE rows depends on the output
    # layout, and a missed match rebuilds the whole table on every run.
    detail_row = spark.sql(f"DESCRIBE DETAIL {table_name}").first()
    current_partition_cols = list(detail_row["partitionColumns"] or [])
    if partition_col not in current_partition_cols:
        # Rebuild as partitioned, preserving current data.
        spark.sql(f"""
            CREATE OR REPLACE TABLE {table_name}
            USING DELTA
            PARTITIONED BY ({partition_col})
            AS SELECT * FROM {table_name}
        """)


_ensure_partitioned_delta_table(SILVER_LISTINGS_TABLE, "silver_this_city_listings")

# The hosts table is only managed when this run produced a hosts aggregate.
if hosts_city is not None:
    _ensure_partitioned_delta_table(SILVER_HOSTS_TABLE, "silver_this_city_hosts")


In [0]:
from pyspark.sql import functions as F

def align_to_target(df_src, target_table, partition_col="city"):
    """Project ``df_src`` onto the schema of ``target_table``.

    Every target column is emitted with the target's data type: columns present
    in ``df_src`` are cast, columns missing from it are filled with NULL, and
    source columns unknown to the target are dropped. The partition column is
    placed last.

    The projection is a single ``select``. One ``withColumn`` call per column
    adds a projection to the plan each time, which grows quickly on a wide table.

    Args:
        df_src: Source DataFrame to align.
        target_table: Name of the table whose schema is the reference.
        partition_col: Name of the target's partition column.

    Returns:
        Tuple ``(aligned_df, nonpart_cols)``: the aligned DataFrame, and the
        target's non-partition column names in target schema order.

    Raises:
        ValueError: If ``partition_col`` is not a column of ``target_table``.
    """
    t_schema = spark.table(target_table).schema

    pfield = next((f for f in t_schema if f.name == partition_col), None)
    if pfield is None:
        raise ValueError(f"Partition column '{partition_col}' not found in target table {target_table}.")

    nonpart_fields = [f for f in t_schema if f.name != partition_col]
    nonpart_cols = [f.name for f in nonpart_fields]
    src_cols = set(df_src.columns)

    def _aligned_column(field):
        # Existing source column, or NULL when absent, cast to the target type.
        base = F.col(field.name) if field.name in src_cols else F.lit(None)
        return base.cast(field.dataType).alias(field.name)

    # Selecting only target fields also drops source-only columns.
    aligned_df = df_src.select(*[_aligned_column(f) for f in nonpart_fields + [pfield]])
    return aligned_df, nonpart_cols
# listings
aligned_listings_df, listings_nonpart_cols = align_to_target(
    final_listings, SILVER_LISTINGS_TABLE, partition_col="city"
)
aligned_listings_df.createOrReplaceTempView("silver_this_city_listings_aligned")

# hosts
# Gate on this run's `hosts_city`, as the other cells do. A `silver_this_city_hosts`
# temp view can outlive the run that created it, so finding it via listTables()
# does not prove the current run produced hosts.
hosts_nonpart_cols = None
if hosts_city is not None and spark.catalog.tableExists(SILVER_HOSTS_TABLE):
    aligned_hosts_df, hosts_nonpart_cols = align_to_target(
        hosts_city, SILVER_HOSTS_TABLE, partition_col="city"
    )
    aligned_hosts_df.createOrReplaceTempView("silver_this_city_hosts_aligned")


In [0]:
def _sql_string_literal(value):
    """Return ``value`` as a single-quoted Spark SQL string literal with ``\\`` and ``'`` backslash-escaped."""
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _sql_identifier(name):
    """Return ``name`` wrapped in backticks, doubling any backtick it contains."""
    return "`" + name.replace("`", "``") + "`"


def _overwrite_city_partition(target_table, source_view, value_cols, city):
    """Replace the ``city`` partition of ``target_table`` with that city's rows from ``source_view``.

    Args:
        target_table: Table partitioned by ``city`` to write into.
        source_view: Table or view to read from.
        value_cols: Non-partition column names, in the target's column order.
        city: Partition value to overwrite. It comes from a notebook widget, so
            it is escaped before being embedded in SQL; a name containing an
            apostrophe would otherwise break the statement.
    """
    city_literal = _sql_string_literal(city)
    select_list = ", ".join(_sql_identifier(c) for c in value_cols)
    spark.sql(f"""
        INSERT OVERWRITE TABLE {target_table}
        PARTITION (city = {city_literal})
        SELECT {select_list}
        FROM {source_view}
        WHERE city = {city_literal}
    """)


_overwrite_city_partition(
    SILVER_LISTINGS_TABLE, "silver_this_city_listings_aligned", listings_nonpart_cols, city_name
)

# Hosts are written only when the alignment cell produced a hosts frame.
if hosts_nonpart_cols is not None:
    _overwrite_city_partition(
        SILVER_HOSTS_TABLE, "silver_this_city_hosts_aligned", hosts_nonpart_cols, city_name
    )


In [0]:
# Number of Silver listing rows per city, sorted by city name.
print("Row count by city per listing:")
silver_listings = spark.table(SILVER_LISTINGS_TABLE)
city_counts_df = silver_listings.groupBy("city").count().orderBy("city")
display(city_counts_df)


In [0]:
from pyspark.sql import functions as F

silver = spark.table("workspace.airbnb_silver.listings")

# Per city, count the rows whose list-like columns are NULL.
null_count_exprs = [
    F.sum(F.col(col_name).isNull().cast("int")).alias(f"empty_{col_name}")
    for col_name in ("host_verifications", "amenities")
]
display(silver.groupBy("city").agg(*null_count_exprs))
