In [None]:
from pyspark.sql import SparkSession
import os 
import pandas as pd

# Remove pandas' display limits: print all rows and all columns, and keep
# wide frames on one line instead of wrapping them.
for _option, _setting in {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.expand_frame_repr': False,
}.items():
    pd.set_option(_option, _setting)

# Exported before the session is built; presumably this is the user name
# Hadoop/Hive calls are made under — confirm against the cluster setup.
os.environ['HADOOP_USER_NAME'] = 'root'

# Create (or reuse) the Hive-enabled Spark session for this notebook.
# The driver is advertised under the host name "spark-notebook" and is
# bound to 0.0.0.0.
spark = (
    SparkSession.builder
    .appName("gold-dim_host")
    .config("spark.driver.host", "spark-notebook")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Build the host dimension from the silver listings staging table: the
# distinct combinations of host attributes, stamped with the load time.
#
# The statement carries no trailing ';' — spark.sql() takes exactly one
# statement, and some older Spark parsers reject the terminator.
#
# NOTE(review): DISTINCT is applied to the whole attribute tuple, so a
# host_id can appear more than once if any attribute differs between that
# host's listings — confirm host_id uniqueness before using it as a key.
# NOTE(review): updated_at_utc8 is filled from now(); presumably the Spark
# session time zone is expected to be UTC+8 — confirm.
df_dim_host = spark.sql(
    '''
    select distinct
        host_id,
        host_name,
        DATE(host_since) as host_since,
        host_location,
        host_is_superhost,
        host_neighbourhood,
        host_listings_count,
        host_total_listings_count,
        host_verifications,
        host_has_profile_pic,
        host_identity_verified,
        now() as updated_at_utc8
    from airbnb_silver.stg_listings
    '''
)

In [None]:
# Print the column names, types and nullability of the host dimension frame
# as a sanity check before it is written out.
df_dim_host.printSchema()

In [None]:
# Load the host dimension into ClickHouse (database airbnb_gold, table
# dim_hosts) over JDBC.

# Credentials can be overridden through the environment; the fallbacks
# reproduce the values this notebook previously hardcoded so existing
# setups keep working.
# NOTE(review): the fallback password is still a secret in source control —
# provision CLICKHOUSE_PASSWORD in the environment and drop the default.
ch_user = os.environ.get("CLICKHOUSE_USER", "spark_admin")
ch_password = os.environ.get("CLICKHOUSE_PASSWORD", "spark_123")

# Credentials travel as connection properties instead of URL query
# parameters, so the URL itself no longer contains the password.
ch_url = "jdbc:ch://analytics-clickhouse:8123/airbnb_gold"

ch_properties = {
    "driver": "com.clickhouse.jdbc.ClickHouseDriver",
    "user": ch_user,
    "password": ch_password,
    # Used by Spark when it has to create the target table.
    "createTableOptions": "ENGINE = MergeTree() ORDER BY (host_id)"
}

print(f"Attempting write with {ch_user} user...")
try:
    # mode="overwrite" replaces whatever dim_hosts currently holds.
    df_dim_host.write.jdbc(
        url=ch_url,
        table="dim_hosts",
        mode="overwrite",
        properties=ch_properties
    )
except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed load fails the cell (and any scheduler running
    # the notebook) instead of being reported only as printed text.
    raise
else:
    print("✅ Data loaded into ClickHouse.")