In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rand, current_timestamp, concat, lit
import os

# Act as the 'root' Hadoop user to sidestep HDFS permission errors.
# This is set before the session below is built.
os.environ['HADOOP_USER_NAME'] = 'root'

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session whose driver
# advertises itself as "spark-notebook" and binds on all interfaces.
# NOTE(review): "pipelline" in the app name looks like a typo; it is kept
# unchanged here in case external tooling keys on the exact name — confirm.
spark = (
    SparkSession.builder
    .appName("spark-pipelline-template")
    .config("spark.driver.host", "spark-notebook")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .enableHiveSupport()
    .getOrCreate()
)

# --- VERIFICATION ---
# Each probe below reports its failure with a message instead of raising,
# so the second probe still runs when the first one fails.

# Probe 1: list the databases known to the session's catalog.
print("1. Testing Hive...")
try:
    spark.sql("SHOW DATABASES").show()
    print("✅ Hive Metastore is connected!")
except Exception as hive_error:
    print(f"❌ Hive Error: {hive_error}")

# Probe 2: ask the JVM behind the session to load the ClickHouse JDBC
# driver class by name; an unloadable class surfaces as an exception.
print("\n2. Testing ClickHouse Driver...")
try:
    spark._jvm.Class.forName("com.clickhouse.jdbc.ClickHouseDriver")
    print("✅ ClickHouse Driver is active!")
except Exception as clickhouse_error:
    print(f"❌ ClickHouse Error: {clickhouse_error}")

1. Testing Hive...
+---------+
|namespace|
+---------+
|  default|
|  test_db|
+---------+

✅ Hive Metastore is connected!

2. Testing ClickHouse Driver...
✅ ClickHouse Driver is active!


In [20]:
# Load every column of test_db.people, keeping only rows whose id is set.
df = spark.sql(
    'SELECT * FROM test_db.people WHERE id IS NOT NULL;'
)

In [26]:
from pyspark.sql import functions as F

# Rename `name` to `first_name`, then add `score_int`: `score` rounded and
# cast to int. Written as a single chained expression.
df = (
    df
    .withColumnRenamed("name", "first_name")
    .withColumn("score_int", F.round(F.col("score")).cast("int"))
)

In [27]:
# Preview the transformed frame: at most 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

+---+----------+---+-----+-------------+---------+
| id|first_name|age|score|         city|score_int|
+---+----------+---+-----+-------------+---------+
|  1|     Alice| 24| 88.5| Kuala Lumpur|       89|
|  2|       Bob| 31| 72.0|       Penang|       72|
|  3|   Charlie| 27| 91.2|  Johor Bahru|       91|
|  4|     Diana| 22| 65.4|       Melaka|       65|
|  5|     Ethan| 35| 78.9|         Ipoh|       79|
|  6|     Fiona| 29| 84.1|     Seremban|       84|
|  7|    George| 41| 69.8|      Kuantan|       70|
|  8|    Hannah| 26| 92.3|    Shah Alam|       92|
|  9|      Ivan| 33| 75.6|Petaling Jaya|       76|
+---+----------+---+-----+-------------+---------+

