In [0]:
from pyspark.sql import functions as F
from functools import reduce

# Null-value profile of each listed table: print the table name, report how
# many rows are null in every column, and display a per-column summary of
# missing values computed over the remaining (not entirely null) rows.
for table_name in [
    "stock_calendar",
    "stock_listing",
    "stock_detailed_listing",
    "stock_detailed_reviews",
    "berlin_calendar",
    "berlin_listing",
    "berlin_detailed_listing",
    "berlin_detailed_reviews",
]:
    print(table_name)
    df = spark.table(
        "some_catalog.airbnb_data." + table_name
    )

    # True only for rows in which every single column is null.
    all_null_condition = reduce(
        lambda a, b: a & b,
        [F.col(c).isNull() for c in df.columns]
    )
    has_data = ~all_null_condition

    # Compute every count in ONE aggregation (a single Spark job) instead of
    # launching a separate filter().count() job per column.
    # F.when(...) without otherwise() yields null when the condition is false
    # and F.count skips nulls, so each expression counts the matching rows.
    counts = df.agg(
        F.count(F.when(all_null_condition, 1)).alias("all_null_rows"),
        F.count(F.when(has_data, 1)).alias("total_rows"),
        *[
            F.count(
                F.when(has_data & F.col(c).isNull(), 1)
            ).alias(f"missing_{position}")  # positional alias: always unique
            for position, c in enumerate(df.columns)
        ],
    ).first()
    # A global aggregation always returns exactly one row; its fields come
    # back in the order the expressions were listed above.
    all_null_count, total_rows, *missing_counts = counts

    summary = []
    for c, nan_count in zip(df.columns, missing_counts):
        # 0.0 (not 0) keeps the column a float even when the table is empty,
        # so the inferred type of "missing_percent" is always double.
        percent_missing = (
            (nan_count / total_rows) * 100 if total_rows > 0 else 0.0
        )
        summary.append(
            (c, total_rows, nan_count, percent_missing)
        )
    summary_df = spark.createDataFrame(
        summary,
        ["column", "total_rows", "missing_count", "missing_percent"]
    )
    print(f"Rows with all columns null: {all_null_count}")
    display(summary_df)

In [0]:
from pyspark.sql import functions as F

def _mean_price(listing_df):
    """Return the average of the non-null ``price`` values in ``listing_df``."""
    priced_rows = listing_df.where(F.col("price").isNotNull())
    averaged = priced_rows.select(F.avg("price").alias("avg_price"))
    # A global aggregate yields exactly one row; read its single value.
    return averaged.first()["avg_price"]


# Load both listing tables and compare their average price.
berlin_listing_df = spark.table("some_catalog.airbnb_data.berlin_listing")
stock_listing_df = spark.table("some_catalog.airbnb_data.stock_listing")

berlin_avg_price = _mean_price(berlin_listing_df)
stock_avg_price = _mean_price(stock_listing_df)

print(f"Berlin listing average price: {berlin_avg_price}")
print(f"Stock listing average price: {stock_avg_price}")

In [0]:
from pyspark.sql import functions as F

def _average_occupancy(calendar_df):
    """Return a one-row DataFrame with the mean ``occupied`` flag.

    ``available == "t"`` maps to occupied 0 and ``available == "f"`` to
    occupied 1. Any other value becomes null, which ``F.avg`` ignores.
    """
    available = F.col("available")
    availability_flag = (
        F.when(available == "t", 1)
        .when(available == "f", 0)
        .otherwise(None)
        .cast("int")
    )
    occupied = (1 - availability_flag).alias("occupied")
    return calendar_df.select(occupied).agg(
        F.avg("occupied").alias("avg_occupancy")
    )


# Load both calendar tables and show their average occupancy.
berlin_calendar_df = spark.table("some_catalog.airbnb_data.berlin_calendar")
stock_calendar_df = spark.table("some_catalog.airbnb_data.stock_calendar")

berlin_occupancy_avg = _average_occupancy(berlin_calendar_df)
stock_occupancy_avg = _average_occupancy(stock_calendar_df)

display(berlin_occupancy_avg)
display(stock_occupancy_avg)