In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, current_timestamp, concat, lit
import os

# Ensure we use the root user to avoid HDFS permission errors
os.environ['HADOOP_USER_NAME'] = 'root'

spark = SparkSession.builder \
    .appName("Hive-to-ClickHouse-Pipeline") \
    .config("spark.driver.host", "spark-notebook") \
    .config("spark.driver.bindAddress", "0.0.0.0") \
    .enableHiveSupport() \
    .getOrCreate()

# --- VERIFICATION ---
print("1. Testing Hive...")
try:
    spark.sql("SHOW DATABASES").show()
    print("✅ Hive Metastore is connected!")
except Exception as e:
    print(f"❌ Hive Error: {e}")

print("\n2. Testing ClickHouse Driver...")
try:
    spark._jvm.Class.forName("com.clickhouse.jdbc.ClickHouseDriver")
    print("✅ ClickHouse Driver is active!")
except Exception as e:
    print(f"❌ ClickHouse Error: {e}")

1. Testing Hive...
+---------+
|namespace|
+---------+
|  default|
|  test_db|
+---------+

✅ Hive Metastore is connected!

2. Testing ClickHouse Driver...
✅ ClickHouse Driver is active!


In [7]:
# 1. Generate Random Data (10,000 rows)
print("Generating random data...")
df = spark.range(0, 10000) \
    .withColumn("sensor_id", (rand() * 100).cast("int")) \
    .withColumn("reading_value", rand() * 50.0) \
    .withColumn("timestamp", current_timestamp()) \
    .withColumn("status", concat(lit("Status_"), (rand() * 5).cast("int")))

ch_url = "jdbc:ch://default:@analytics-clickhouse:8123/default"

# 2. We keep ONLY the driver and table options in the properties
ch_properties = {
    "driver": "com.clickhouse.jdbc.ClickHouseDriver",
    "createTableOptions": "ENGINE = MergeTree() ORDER BY (timestamp, sensor_id)"
}

print("Attempting write with Direct URL credentials...")
try:
    df.write.jdbc(
        url=ch_url, 
        table="random_sensor_data", 
        mode="overwrite", 
        properties=ch_properties
    )
    print("✅ Success! Data written to ClickHouse.")
except Exception as e:
    print(f"❌ Write failed again. Let's check the error code one more time:\n{e}")

Generating random data...
Attempting write with Direct URL credentials...
❌ Write failed again. Let's check the error code one more time:
An error occurred while calling o163.jdbc.
: java.sql.SQLException: Code: 516. DB::Exception: default: Authentication failed: password is incorrect, or there is no user with such name. If you have installed ClickHouse and forgot password you can reset it in the configuration file. The password for default user is typically located at /etc/clickhouse-server/users.d/default-password.xml and deleting this file will reset the password. See also /etc/clickhouse-server/users.xml on the server where ClickHouse is installed. . (AUTHENTICATION_FAILED) (version 24.8.14.39 (official build)) 
	at com.clickhouse.jdbc.internal.ExceptionUtils.toSqlState(ExceptionUtils.java:67)
	at com.clickhouse.jdbc.internal.ExceptionUtils.toSqlState(ExceptionUtils.java:42)
	at com.clickhouse.jdbc.StatementImpl.executeUpdateImpl(StatementImpl.java:225)
	at com.clickhouse.jdbc.Stat