In [1]:
from pyspark.sql import SparkSession
import os

# Act as the "root" Hadoop user (intended to sidestep HDFS permission
# errors). This is assigned before the session below is created.
os.environ["HADOOP_USER_NAME"] = "root"

# Build (or reuse) a Hive-enabled session. The driver is advertised under the
# host name "spark-notebook" and binds to 0.0.0.0.
spark = (
    SparkSession.builder
    .appName("Hive-to-ClickHouse-Pipeline")
    .config("spark.driver.host", "spark-notebook")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .enableHiveSupport()
    .getOrCreate()
)

# --- VERIFICATION ---
# Both checks are smoke tests: a failure is printed rather than raised, so
# the second check still runs when the first one fails.

# Check 1: list the databases through Spark SQL.
print("1. Testing Hive...")
try:
    spark.sql("SHOW DATABASES").show()
    print("✅ Hive Metastore is connected!")
except Exception as exc:
    print(f"❌ Hive Error: {exc}")

# Check 2: ask the JVM to load the ClickHouse JDBC driver class by name.
print("\n2. Testing ClickHouse Driver...")
try:
    spark._jvm.Class.forName("com.clickhouse.jdbc.ClickHouseDriver")
    print("✅ ClickHouse Driver is active!")
except Exception as exc:
    print(f"❌ ClickHouse Error: {exc}")

1. Testing Hive...
+---------+
|namespace|
+---------+
|  default|
|  test_db|
+---------+

✅ Hive Metastore is connected!

2. Testing ClickHouse Driver...
✅ ClickHouse Driver is active!
