In [1]:
import os
import sys
import glob
import logging
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Hadoop user name exported into this process's environment; this cell sits
# ahead of the one that builds the Spark session.
# NOTE(review): presumably required for write access to the Hive warehouse
# path on HDFS -- confirm that "root" is intended outside local development.
hadoop_user = "root"
os.environ["HADOOP_USER_NAME"] = hadoop_user

In [3]:
# Root logging setup for the notebook: INFO and above, written to stdout,
# formatted as "<timestamp> | <level> | <message>".
LOG_FORMAT = "%(asctime)s | %(levelname)s | %(message)s"
stdout_handler = logging.StreamHandler(sys.stdout)

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, handlers=[stdout_handler])

# Logger used by every cell below.
logger = logging.getLogger(__name__)

In [4]:
# Announce the job, then obtain the Spark session used by the rest of the
# notebook. The session is named "bronze-spark-upload-hdfs" and is built with
# Hive support enabled; getOrCreate() returns an existing session if there is one.
logger.info("Starting Spark Bronze Ingestion Job")

session_builder = SparkSession.builder.appName("bronze-spark-upload-hdfs")
spark = session_builder.enableHiveSupport().getOrCreate()

logger.info("Spark session initialized")

2026-01-01 03:10:33,502 | INFO | Starting Spark Bronze Ingestion Job
2026-01-01 03:10:40,444 | INFO | Spark session initialized


In [5]:
# Read the listings parquet dump from the local bronze directory.
local_path = "file:///home/jovyan/work/bronze_layer/listings"
logger.info(f"Reading parquet data from {local_path}")

# mergeSchema asks Spark to reconcile the schemas of the individual parquet
# files instead of taking the schema from a single file.
listings_reader = spark.read.option("mergeSchema", "true")
df_listings = listings_reader.parquet(local_path)

# count() runs a Spark job; the result is only used for the log line.
row_count = df_listings.count()
logger.info(f"Loaded {row_count} rows")

2026-01-01 03:12:30,918 | INFO | Reading parquet data from file:///home/jovyan/work/bronze_layer/listings
2026-01-01 03:12:38,880 | INFO | Loaded 104652 rows


In [5]:
# local_path = "file:///home/jovyan/work/bronze_layer/listings/city=bristol"
# logger.info(f"Reading parquet data for Bristol from {local_path}")

# bristol_listings = (
#     spark.read
#     .parquet(local_path)
# )

# row_count = bristol_listings.count()
# logger.info(f"Loaded {row_count} rows")

2025-12-31 05:38:28,513 | INFO | Reading parquet data for Brsitol from file:///home/jovyan/work/bronze_layer/listings/city=bristol
2025-12-31 05:38:36,844 | INFO | Loaded 8496 rows


In [6]:
# bristol_listings.select("extraction_date").distinct().show()

+---------------+
|extraction_date|
+---------------+
|     2025-03-19|
|     2025-06-24|
|     2025-09-26|
+---------------+



In [7]:
# local_path = "file:///home/jovyan/work/bronze_layer/listings/city=edinburgh"
# logger.info(f"Reading parquet data for Edinburgh from {local_path}")

# edinburgh_listings = (
#     spark.read
#     .parquet(local_path)
# )

# row_count = edinburgh_listings.count()
# logger.info(f"Loaded {row_count} rows")

2025-12-31 05:38:41,899 | INFO | Reading parquet data for Edinburgh from file:///home/jovyan/work/bronze_layer/listings/city=edinburgh
2025-12-31 05:38:42,756 | INFO | Loaded 16538 rows


In [8]:
# edinburgh_listings.select("extraction_date").distinct().show()

+---------------+
|extraction_date|
+---------------+
|     2025-06-15|
|     2025-03-08|
|     2025-09-21|
+---------------+



In [9]:
# local_path = "file:///home/jovyan/work/bronze_layer/listings/city=london"
# logger.info(f"Reading parquet data for London from {local_path}")

# london_listings = (
#     spark.read
#     .parquet(local_path)
# )

# row_count = london_listings.count()
# logger.info(f"Loaded {row_count} rows")

2025-12-31 05:38:45,589 | INFO | Reading parquet data for London from file:///home/jovyan/work/bronze_layer/listings/city=london
2025-12-31 05:38:46,663 | INFO | Loaded 288081 rows


In [10]:
# london_listings.select("extraction_date").distinct().show()

+---------------+
|extraction_date|
+---------------+
|     2025-03-04|
|     2025-06-10|
|     2025-09-14|
+---------------+



In [11]:
# print("Bristol columns:", len(bristol_listings.columns))
# print("Edinburgh columns:", len(edinburgh_listings.columns))
# print("London columns:", len(london_listings.columns))

Bristol columns: 80
Edinburgh columns: 80
London columns: 80


In [12]:
# bristol_listings = bristol_listings.withColumn("city", F.lit("bristol"))
# edinburgh_listings = edinburgh_listings.withColumn("city", F.lit("edinburgh"))
# london_listings = london_listings.withColumn("city", F.lit("london"))

In [13]:
# df_listings = (
#     bristol_listings
#     .unionByName(edinburgh_listings)
#     .unionByName(london_listings)
# )

In [14]:
# df_listings.count()

313115

In [15]:
# df_listings.groupBy("city").count().show()

+---------+------+
|     city| count|
+---------+------+
|  bristol|  8496|
|edinburgh| 16538|
|   london|288081|
+---------+------+



In [16]:
# df_listings.groupBy("extraction_date").count().show()

+---------------+-----+
|extraction_date|count|
+---------------+-----+
|     2025-03-19| 2772|
|     2025-06-24| 2879|
|     2025-09-26| 2845|
|     2025-06-15| 5936|
|     2025-03-08| 5666|
|     2025-09-21| 4936|
|     2025-03-04|94559|
|     2025-06-10|96651|
|     2025-09-14|96871|
+---------------+-----+



In [6]:
# Add a load-timestamp column to the listings frame. withColumn returns a new
# DataFrame, which is bound back to the same name.
# NOTE(review): the "_utc_0" suffix suggests UTC -- confirm the Spark session
# time zone matches that expectation.
logger.info("Adding updated_at_utc_0 column")

load_timestamp = F.current_timestamp()
df_listings = df_listings.withColumn("updated_at_utc_0", load_timestamp)

2026-01-01 03:13:06,712 | INFO | Adding updated_at_utc_0 column


In [25]:
# df_listings.printSchema()

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: long (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: lo

In [19]:
# columns_to_cast = [
#     "minimum_minimum_nights",
#     "maximum_minimum_nights",
#     "minimum_maximum_nights",
#     "maximum_maximum_nights",
#     "host_listings_count"
# ]

# for c in columns_to_cast:
#     df_listings = df_listings.withColumn(c, F.col(c).cast("long"))

In [22]:
# df_listings = df_listings.withColumn("host_listings_count", F.col("host_listings_count").cast("long"))

In [15]:
# Write the listings frame to HDFS as parquet, partitioned by city and
# extraction_date. Mode "overwrite" replaces data already present at the
# destination path.
hdfs_destination = "hdfs://namenode:9000/user/hive/warehouse/airbnb.db/bilal/bronze/listings"
logger.info(f"Writing data to HDFS at {hdfs_destination}")

listings_writer = (
    df_listings.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("city", "extraction_date")
)
listings_writer.save(hdfs_destination)

logger.info("HDFS write completed successfully")

2026-01-01 03:28:25,456 | INFO | Writing data to HDFS at hdfs://namenode:9000/user/hive/warehouse/airbnb.db/bilal/bronze/listings
2026-01-01 03:28:31,187 | INFO | HDFS write completed successfully


In [9]:
# Read the neighbourhoods parquet dump from the local bronze directory.
local_path = "file:///home/jovyan/work/bronze_layer/neighbourhoods"
logger.info(f"Reading parquet data from {local_path}")

# mergeSchema asks Spark to reconcile the schemas of the individual parquet
# files instead of taking the schema from a single file.
neighbourhoods_reader = spark.read.option("mergeSchema", "true")
df_neighbourhoods = neighbourhoods_reader.parquet(local_path)

# count() runs a Spark job; the result is only used for the log line.
row_count = df_neighbourhoods.count()
logger.info(f"Loaded {row_count} rows")

2026-01-01 03:19:22,776 | INFO | Reading parquet data from file:///home/jovyan/work/bronze_layer/neighbourhoods
2026-01-01 03:19:23,290 | INFO | Loaded 178 rows


In [11]:
# Add a load-timestamp column to the neighbourhoods frame. withColumn returns
# a new DataFrame, which is bound back to the same name.
# NOTE(review): the "_utc_0" suffix suggests UTC -- confirm the Spark session
# time zone matches that expectation.
logger.info("Adding updated_at_utc_0 column")

load_timestamp = F.current_timestamp()
df_neighbourhoods = df_neighbourhoods.withColumn("updated_at_utc_0", load_timestamp)

2026-01-01 03:20:45,941 | INFO | Adding updated_at_utc_0 column


In [16]:
# Write the neighbourhoods frame to HDFS as parquet, partitioned by city.
# Mode "overwrite" replaces data already present at the destination path.
hdfs_destination = "hdfs://namenode:9000/user/hive/warehouse/airbnb.db/bilal/bronze/neighbourhoods"
logger.info(f"Writing data to HDFS at {hdfs_destination}")

neighbourhoods_writer = (
    df_neighbourhoods.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("city")
)
neighbourhoods_writer.save(hdfs_destination)

logger.info("HDFS write completed successfully")

2026-01-01 03:29:49,472 | INFO | Writing data to HDFS at hdfs://namenode:9000/user/hive/warehouse/airbnb.db/bilal/bronze/neighbourhoods
2026-01-01 03:29:50,418 | INFO | HDFS write completed successfully


In [18]:
# Create the bronze Hive database if it is not already registered.
# NOTE(review): the parquet data above is written under
# .../warehouse/airbnb.db/bilal/bronze/, while the database created here is
# named airbnb_bilal_bronze -- confirm the two are meant to line up.
logger.info("Ensuring Hive database exists")

create_database_sql = "CREATE DATABASE IF NOT EXISTS airbnb_bilal_bronze"
spark.sql(create_database_sql)

2026-01-01 04:33:23,896 | INFO | Ensuring Hive database exists


AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient