In [1]:
import os

import findspark
findspark.init()  # must run before the pyspark imports below
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Constants for the ClickHouse connection.
# Host, port, database and credentials are read from environment variables of
# the same name so that real secrets never have to be committed with the
# notebook; the literals are only fallbacks for a local demo setup.
JDBC_CLICKHOUSE_JAR_FILENAME = 'clickhouse-jdbc-0.6.0-patch3-all'
CLICKHOUSE_HOST_NAME = os.environ.get('CLICKHOUSE_HOST_NAME', 'localhost')
# Environment values are strings; keep the port an int as before.
CLICKHOUSE_PORT = int(os.environ.get('CLICKHOUSE_PORT', '8123'))
CLICKHOUSE_DBNAME = os.environ.get('CLICKHOUSE_DBNAME', 'default')
CLICKHOUSE_USERNAME = os.environ.get('CLICKHOUSE_USERNAME', 'clickhouse_user')
CLICKHOUSE_PASSWORD = os.environ.get('CLICKHOUSE_PASSWORD', 'clickhouse_password')
# NOTE(review): the write/read cells visible in this notebook use the literal
# table name 'super' rather than TABLE_NAME — confirm which one is intended.
TABLE_NAME = 'test_table'

In [2]:
# Create (or reuse) the Spark session, running with the "local" master.
# The ClickHouse JDBC jar is registered through "spark.jars"; it is referenced
# by bare file name, with no directory component.
spark = (
    SparkSession.builder
    .appName("Python Spark ClickHouse basic example")
    .master("local")
    .config("spark.jars", f"{JDBC_CLICKHOUSE_JAR_FILENAME}.jar")
    .getOrCreate()
)

In [3]:
# JDBC URL for ClickHouse, assembled from the host, port and database constants
jdbc_url = f"jdbc:clickhouse://{CLICKHOUSE_HOST_NAME}:{CLICKHOUSE_PORT}/{CLICKHOUSE_DBNAME}"

# Driver class and credentials passed along with the URL
jdbc_properties = dict(
    driver="com.clickhouse.jdbc.ClickHouseDriver",
    user=CLICKHOUSE_USERNAME,
    password=CLICKHOUSE_PASSWORD,
)

# Build a two-row test DataFrame with the columns "id" and "name"
columns = ["id", "name"]
data = [
    (1, 'John Doe'),
    (2, 'Jane Doe'),
]
df = spark.createDataFrame(data, schema=columns)

In [4]:
# Save the DataFrame to the ClickHouse table 'super' over JDBC.
# The driver class and credentials are taken from jdbc_properties (the same
# dict the read cell uses) instead of being repeated here, so the write and
# read paths cannot drift apart.
# "createTableOptions" supplies the ENGINE / ORDER BY clause for the table
# that Spark creates; mode("overwrite") replaces any existing table contents.
(
    df.write
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", 'super')
    .options(**jdbc_properties)
    .option("createTableOptions", "ENGINE=MergeTree() ORDER BY id SETTINGS index_granularity=8192")
    .mode("overwrite")
    .save()
)

In [5]:
# Load the 'super' table back from ClickHouse over JDBC and print its rows
(
    spark.read
    .jdbc(url=jdbc_url, table="super", properties=jdbc_properties)
    .show()
)

+---+--------+
| id|    name|
+---+--------+
|  1|John Doe|
|  2|Jane Doe|
+---+--------+

