In [1]:
from pyspark.sql import SparkSession
import os 
from pyspark.sql import functions as F

# Run Hadoop/HDFS operations as 'root'. Set before the session is built so the
# value is already in the environment when Spark starts.
os.environ['HADOOP_USER_NAME'] = 'root'

# Hive-enabled session; driver host/bind address are pinned explicitly
# (presumably so executors can reach the notebook container -- confirm
# against the cluster's network setup).
session_builder = (
    SparkSession.builder
    .appName("bronze-spark-upload-hdfs")
    .config("spark.driver.host", "spark-notebook")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .enableHiveSupport()
)
spark = session_builder.getOrCreate()

In [2]:
import os
import pandas as pd
from pyspark.sql import SparkSession
import glob

# Recursive glob pattern matching every parquet file under the local
# bronze-layer listings directory.
path = "/home/jovyan/work/bronze_layer/listings/**/*.parquet"
# Sorted so the report below is printed in a stable order between runs.
files = sorted(glob.glob(path, recursive=True))
print(files)

# Sanity check: read each file on its own and report its column count and the
# type of the 'price' column, to spot schema drift before a merged read.
for f in files:
    # glob already returns the full path of each match, so it is used as-is.
    # (Joining it onto `path` -- a glob pattern, not a directory -- only worked
    # because os.path.join discards earlier parts when given an absolute path.)
    temp_df = spark.read.parquet(f"file://{f}")
    col_count = len(temp_df.columns)
    # dtypes is a list of (column, type) pairs; .get returns None if the file
    # has no 'price' column at all.
    price_type = dict(temp_df.dtypes).get('price')

    print(f"File: {f} | Columns: {col_count} | Price Type: {price_type}")

# Spot-check the raw 'price' values of a single partition file.
df_test = spark.read.parquet('file:///home/jovyan/work/bronze_layer/listings/city=london/extraction_date=2025-09-14/listings.parquet')
df_test.select('price').show()

['/home/jovyan/work/bronze_layer/listings/city=bristol/extraction_date=2025-09-26/listings.parquet', '/home/jovyan/work/bronze_layer/listings/city=edinburgh/extraction_date=2025-09-21/listings.parquet', '/home/jovyan/work/bronze_layer/listings/city=london/extraction_date=2025-09-14/listings.parquet']
File: /home/jovyan/work/bronze_layer/listings/city=bristol/extraction_date=2025-09-26/listings.parquet | Columns: 79 | Price Type: string
File: /home/jovyan/work/bronze_layer/listings/city=edinburgh/extraction_date=2025-09-21/listings.parquet | Columns: 79 | Price Type: string
File: /home/jovyan/work/bronze_layer/listings/city=london/extraction_date=2025-09-14/listings.parquet | Columns: 79 | Price Type: string
+-------+
|  price|
+-------+
| $70.00|
|$149.00|
|$411.00|
|   NULL|
|$210.00|
|$280.00|
| $90.00|
| $61.00|
|$340.00|
| $49.00|
|   NULL|
|$213.00|
|   NULL|
| $96.00|
|   NULL|
| $71.00|
|   NULL|
| $48.00|
|   NULL|
| $76.00|
+-------+
only showing top 20 rows



In [3]:
# Root of the local bronze-layer listings dataset (contains the
# city=.../extraction_date=... sub-directories).
local_path = "file:///home/jovyan/work/bronze_layer/listings"

print(f"Reading data from {local_path}...")
# Read from `local_path` itself so the message above always matches the actual
# source. Reader options such as mergeSchema have to be set on the reader
# before .parquet() is called; mergeSchema reconciles per-file schemas.
df_listings = (
    spark.read
    .option("mergeSchema", "true")
    .parquet(local_path)
)


Reading data from file:///home/jovyan/work/bronze_layer/listings...


In [4]:
# Quick visual check of a few identifying columns together with the
# city / extraction_date columns of the combined dataset.
preview_columns = ["id", "name", "price", "city", "extraction_date"]
df_listings.select(*preview_columns).show()

+-----+--------------------+-------+------+---------------+
|   id|                name|  price|  city|extraction_date|
+-----+--------------------+-------+------+---------------+
|13913|Holiday London DB...| $70.00|london|     2025-09-14|
|15400|Bright Chelsea  A...|$149.00|london|     2025-09-14|
|17402|Very Central Mode...|$411.00|london|     2025-09-14|
|24328|Battersea live/wo...|   NULL|london|     2025-09-14|
|36274|Bright 1 bedroom ...|$210.00|london|     2025-09-14|
|36299|Kew Gardens 3BR h...|$280.00|london|     2025-09-14|
|36660|You are GUARANTEE...| $90.00|london|     2025-09-14|
|38605|SUNNY ROOM PRIVAT...| $61.00|london|     2025-09-14|
|38610|     Short Term Home|$340.00|london|     2025-09-14|
|38995|SPACIOUS ROOM IN ...| $49.00|london|     2025-09-14|
|39387|Stylish bedsit in...|   NULL|london|     2025-09-14|
|41445|2 Double bed apar...|$213.00|london|     2025-09-14|
|41509|Room in maisonett...|   NULL|london|     2025-09-14|
|41712|Room with a view,...| $96.00|lond

In [5]:
# Stamp every row with the load time as an audit column.
# NOTE(review): the column name contains '+', so it will need backtick-quoting
# wherever it is referenced in SQL -- confirm this name is intentional.
load_timestamp = F.current_timestamp()
df_listings = df_listings.withColumn('updated_at_utc+0', load_timestamp)

In [6]:
# Target directory on HDFS for the bronze listings data.
hdfs_destination = "hdfs://namenode:9000/user/hive/warehouse/airbnb.db/bronze/listings"

# Overwrite the destination with parquet files laid out as
# extraction_date=<...>/city=<...>/ directories.
listings_writer = (
    df_listings.write
    .mode("overwrite")
    .partitionBy("extraction_date", "city")
    .format("parquet")
)
listings_writer.save(hdfs_destination)

print(f"✅ Ingestion complete. Data stored at: {hdfs_destination}")

✅ Ingestion complete. Data stored at: hdfs://namenode:9000/user/hive/warehouse/airbnb.db/bronze/listings


In [7]:
# 1. Create the Database
spark.sql("CREATE DATABASE IF NOT EXISTS airbnb_bronze")

# 2. Create the Table
# External table over the parquet files written to `hdfs_destination`.
# LOCATION is interpolated from that same variable (a fully-qualified hdfs://
# URI) so the table always points at the directory that was actually written,
# instead of a scheme-less path that depends on the cluster's default
# filesystem. The statement carries no trailing ';' -- spark.sql() takes a
# single statement and does not need a terminator.
# The partition columns are declared in the same order used by partitionBy()
# when writing, which is what lets partition discovery line up.
# NOTE(review): the written data also has an 'updated_at_utc+0' column that is
# not declared here, so it will not be visible through this table -- confirm
# whether that is intended.
spark.sql(f'''
CREATE EXTERNAL TABLE IF NOT EXISTS airbnb_bronze.listings (
    id BIGINT,
    listing_url STRING,
    scrape_id BIGINT,
    last_scraped STRING,
    source STRING,
    name STRING,
    description STRING,
    neighborhood_overview STRING,
    picture_url STRING,
    host_id BIGINT,
    host_url STRING,
    host_name STRING,
    host_since STRING,
    host_location STRING,
    host_about STRING,
    host_response_time STRING,
    host_response_rate STRING,
    host_acceptance_rate STRING,
    host_is_superhost STRING,
    host_thumbnail_url STRING,
    host_picture_url STRING,
    host_neighbourhood STRING,
    host_listings_count DOUBLE,
    host_total_listings_count DOUBLE,
    host_verifications STRING,
    host_has_profile_pic STRING,
    host_identity_verified STRING,
    neighbourhood STRING,
    neighbourhood_cleansed STRING,
    neighbourhood_group_cleansed DOUBLE,
    latitude DOUBLE,
    longitude DOUBLE,
    property_type STRING,
    room_type STRING,
    accommodates BIGINT,
    bathrooms DOUBLE,
    bathrooms_text STRING,
    bedrooms DOUBLE,
    beds DOUBLE,
    amenities STRING,
    price STRING,
    minimum_nights BIGINT,
    maximum_nights BIGINT,
    minimum_minimum_nights DOUBLE,
    maximum_minimum_nights DOUBLE,
    minimum_maximum_nights DOUBLE,
    maximum_maximum_nights DOUBLE,
    minimum_nights_avg_ntm DOUBLE,
    maximum_nights_avg_ntm DOUBLE,
    calendar_updated DOUBLE,
    has_availability STRING,
    availability_30 BIGINT,
    availability_60 BIGINT,
    availability_90 BIGINT,
    availability_365 BIGINT,
    calendar_last_scraped STRING,
    number_of_reviews BIGINT,
    number_of_reviews_ltm BIGINT,
    number_of_reviews_l30d BIGINT,
    availability_eoy BIGINT,
    number_of_reviews_ly BIGINT,
    estimated_occupancy_l365d BIGINT,
    estimated_revenue_l365d DOUBLE,
    first_review STRING,
    last_review STRING,
    review_scores_rating DOUBLE,
    review_scores_accuracy DOUBLE,
    review_scores_cleanliness DOUBLE,
    review_scores_checkin DOUBLE,
    review_scores_communication DOUBLE,
    review_scores_location DOUBLE,
    review_scores_value DOUBLE,
    license STRING,
    instant_bookable STRING,
    calculated_host_listings_count BIGINT,
    calculated_host_listings_count_entire_homes BIGINT,
    calculated_host_listings_count_private_rooms BIGINT,
    calculated_host_listings_count_shared_rooms BIGINT,
    reviews_per_month DOUBLE
)
-- city and extraction_date move here!
PARTITIONED BY (extraction_date DATE, city STRING)
STORED AS PARQUET
LOCATION '{hdfs_destination}'
''')



AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient

In [7]:
# 3. Register the partitions (Crucial!)
# The table was created over directories that already exist, so the metastore
# has no partition entries until the repair scans the table location.
repair_statement = "MSCK REPAIR TABLE airbnb_bronze.listings"
spark.sql(repair_statement)

DataFrame[]