In [0]:
# Make sure the three medallion-layer schemas (bronze, silver, gold) exist in
# the `workspace` catalog. IF NOT EXISTS keeps this cell safe to re-run.
schema_statements = (
    "CREATE SCHEMA IF NOT EXISTS workspace.airbnb_bronze",
    "CREATE SCHEMA IF NOT EXISTS workspace.airbnb_silver",
    "CREATE SCHEMA IF NOT EXISTS workspace.airbnb_gold",
)
for statement in schema_statements:
    spark.sql(statement)

In [0]:
# Raw Paris listings export stored in a Unity Catalog volume.
path = "/Volumes/airbnb/airbnb_city_raw/city_data_volume/listings_paris.csv"

# CSV parsing options:
#   header                   - first line supplies the column names
#   multiLine                - a quoted value may contain embedded newlines
#   quote / escape           - both are '"', so a doubled quote inside a quoted
#                              value is read as a literal quote character
#   ignore*WhiteSpace        - strip whitespace around values
# No schema inference is requested, so Spark's default column typing applies.
df_paris = spark.read.options(
    header="true",
    multiLine="true",
    quote='"',
    escape='"',
    ignoreLeadingWhiteSpace="true",
    ignoreTrailingWhiteSpace="true",
).csv(path)

# Preview the loaded rows in the notebook.
display(df_paris)


In [0]:
# Raw Venice listings export stored in a Unity Catalog volume.
path = "/Volumes/airbnb/airbnb_city_raw/city_data_volume/listings_venice.csv"

# CSV parsing options:
#   header                   - first line supplies the column names
#   multiLine                - a quoted value may contain embedded newlines
#   quote / escape           - both are '"', so a doubled quote inside a quoted
#                              value is read as a literal quote character
#   ignore*WhiteSpace        - strip whitespace around values
# No schema inference is requested, so Spark's default column typing applies.
df_venice = spark.read.options(
    header="true",
    multiLine="true",
    quote='"',
    escape='"',
    ignoreLeadingWhiteSpace="true",
    ignoreTrailingWhiteSpace="true",
).csv(path)

# Preview the loaded rows in the notebook.
display(df_venice)


In [0]:
from pyspark.sql import functions as F

# Fully qualified name of the Bronze Delta table written and verified below.
bronze_table = "workspace.airbnb_bronze.listings_raw"

# Step 1: tag each source frame with its city. A frame that already carries a
# `city` column is left exactly as it is.
df_paris = (
    df_paris
    if "city" in df_paris.columns
    else df_paris.withColumn("city", F.lit("Paris"))
)
df_venice = (
    df_venice
    if "city" in df_venice.columns
    else df_venice.withColumn("city", F.lit("Venice"))
)

# Step 2: stack the two frames, matching columns by name. With
# allowMissingColumns=True a column present in only one frame is kept and
# null-filled for the other frame's rows.
bronze_df = df_paris.unionByName(df_venice, allowMissingColumns=True)

# Step 3: canonicalise the spelling of the two known cities (case-insensitive
# match); any other value passes through unchanged. The result is cast to
# string, then fully identical rows are dropped.
city_lowercase = F.lower(F.col("city"))
city_normalized = (
    F.when(city_lowercase == "paris", F.lit("Paris"))
    .when(city_lowercase == "venice", F.lit("Venice"))
    .otherwise(F.col("city"))
    .cast("string")
)
bronze_cleaned = bronze_df.withColumn("city", city_normalized).dropDuplicates()

# Step 4: replace the Bronze Delta table, partitioned by city.
# mode("overwrite") swaps out the existing data (switch to "append" for
# incremental loads); overwriteSchema lets the table schema be replaced too.
bronze_writer = (
    bronze_cleaned.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("city")
)
bronze_writer.saveAsTable(bronze_table)

print("✅ Bronze table updated from preloaded DataFrames.")

# Sanity check: row count per city, read back from the table just written.
rows_per_city = spark.table(bronze_table).groupBy("city").count().orderBy("city")
display(rows_per_city)


In [0]:
%sql
-- Show the column definitions and extended table metadata of the Bronze
-- listings table. The catalog is spelled out so the name matches the one used
-- when the table is written (workspace.airbnb_bronze.listings_raw) and does
-- not depend on the session's current catalog.
DESCRIBE EXTENDED workspace.airbnb_bronze.listings_raw


In [0]:
%sql
-- Two ways to inspect the Bronze listings table. Both use the fully qualified
-- three-level name so they target the same table that the write step creates,
-- regardless of the session's current catalog.

-- Delta table details:
DESCRIBE DETAIL workspace.airbnb_bronze.listings_raw;
-- or, for column definitions plus extended table metadata:
DESCRIBE EXTENDED workspace.airbnb_bronze.listings_raw;


In [0]:
# from pyspark.sql import functions as F

# bronze_df = spark.table("airbnb_bronze.listings_raw")

# print("Before cleanup:")
# display(bronze_df.groupBy("city").count().orderBy("city"))

# bronze_cleaned = (
#     bronze_df
#     .withColumn(
#         "city",
#         F.when(F.lower("city") == "paris",  F.lit("Paris"))
#          .when(F.lower("city") == "venice", F.lit("Venice"))
#          .otherwise(F.col("city"))
#          .cast("string")
#     )
# )


# bronze_dedup = bronze_cleaned.dropDuplicates()

# (
#     bronze_dedup
#     .write
#     .format("delta")
#     .mode("overwrite")
#     .option("overwriteSchema", "true")
#     .partitionBy("city")
#     .saveAsTable("airbnb_bronze.listings_raw")
# )

# print("✅ Bronze table cleaned and normalized.")
# print("New partition breakdown:")
# display(
#     spark.table("airbnb_bronze.listings_raw")
#          .groupBy("city")
#          .count()
#          .orderBy("city")
# )
