In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import (
    from_json, col, to_timestamp, to_date, current_timestamp
)

# Object-store connection settings. They are read from the environment so that
# real credentials never have to be committed with the notebook; the fallbacks
# are the previously hard-coded local-development values, so behaviour is
# unchanged when the variables are not set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Build (or reuse) the Spark session for the Kafka -> Iceberg ingestion:
#   * spark.hadoop.fs.s3a.*        - S3A filesystem settings (endpoint, keys,
#                                    path-style access, credentials provider)
#   * spark.sql.extensions         - Iceberg SQL extensions
#   * spark.sql.catalog.iceberg.*  - a catalog named "iceberg" of type "hive",
#                                    using S3FileIO and the s3a://warehouse/ root
spark = (
    SparkSession.builder
    .appName("spark_kafka")
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "1g")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", MINIO_ENDPOINT)
    .getOrCreate()
)

# Set the log level to WARN to reduce noise from Spark internals (optional).
spark.sparkContext.setLogLevel("WARN")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/16 08:59:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# DDL for the bronze table iceberg.bronze.user_event, partitioned by event_date.
# IF NOT EXISTS makes the statement a no-op when the table is already present.
# NOTE(review): kafka_offset and price are declared INT. Kafka offsets are
# presumably 64-bit and prices may be fractional - confirm whether BIGINT /
# DECIMAL would be more appropriate before the table holds real data.
create_user_event_table_sql = """
CREATE TABLE IF NOT EXISTS iceberg.bronze.user_event (
    event_time TIMESTAMP,
    event_date DATE,
    event_type STRING,
    user_id STRING,
    user_session STRING,
    product_id STRING,
    price INT,
    kafka_partition INT,
    kafka_offset INT,
    ingest_ts TIMESTAMP
)
PARTITIONED BY (event_date)
"""

spark.sql(create_user_event_table_sql)


DataFrame[]

In [8]:
# =========================
# 1. Schema Kafka message
# =========================
# Every field of the JSON payload is declared as a (nullable) string; the
# names below are listed in the order they appear in the resulting struct.
event_field_names = (
    "event_time",
    "event_type",
    "user_id",
    "user_session",
    "product_id",
    "price",
)

event_schema = StructType(
    [StructField(field_name, StringType()) for field_name in event_field_names]
)

# =========================
# 2. Read from Kafka
# =========================
# Source options for the streaming reader: broker address, the topic to
# subscribe to, and where to start when no checkpointed offsets are used.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka1:9092",
    "subscribe": "user_event",
    "startingOffsets": "earliest",
}

kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

# =========================
# 3. Parse + cast (RAW)
# =========================
# Step 1: decode the message value as a string, parse it as JSON with
# event_schema into a struct column "data", and keep the source's partition
# and offset columns under kafka_* names.
parsed_df = kafka_df.select(
    from_json(col("value").cast("string"), event_schema).alias("data"),
    col("partition").alias("kafka_partition"),
    col("offset").alias("kafka_offset"),
)

# The event timestamp expression is shared by event_time and event_date.
event_ts = to_timestamp(col("data.event_time"))

# Step 2: flatten the struct and cast to the column types of the bronze table.
# NOTE(review): price and kafka_offset are cast to int to match the table
# definition - confirm that neither fractional prices nor offsets beyond the
# 32-bit range can occur.
bronze_columns = [
    event_ts.alias("event_time"),
    to_date(event_ts).alias("event_date"),
    col("data.event_type"),
    col("data.user_id"),
    col("data.user_session"),
    col("data.product_id"),
    col("data.price").cast("int").alias("price"),
    col("kafka_partition").cast("int"),
    col("kafka_offset").cast("int"),
    current_timestamp().alias("ingest_ts"),
]

bronze_df = parsed_df.select(*bronze_columns)

# =========================
# 4. Write to Iceberg Bronze
# =========================
# Start a streaming append into iceberg.bronze.user_event. A micro-batch is
# triggered every minute and progress is tracked in the checkpoint location.
query = (
    bronze_df.writeStream
    .format("iceberg")
    .outputMode("append")
    .option(
        "checkpointLocation",
        "s3a://warehouse/checkpoints/bronze/user_events"
    )
    .trigger(processingTime='1 minute')
    .toTable("iceberg.bronze.user_event")
)

# Block the cell until the query terminates. Interrupting the cell only
# interrupts this Python-side wait, so stop the query explicitly on the way
# out; otherwise it would keep running in the background and a re-run of this
# cell would start a second writer on the same checkpoint.
try:
    query.awaitTermination()
finally:
    query.stop()

25/12/16 09:04:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/12/16 09:04:20 WARN S3FileIO: Unclosed S3FileIO instance created by:
	org.apache.iceberg.aws.s3.S3FileIO.initialize(S3FileIO.java:444)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:402)
	org.apache.iceberg.CatalogUtil.loadFileIO(CatalogUtil.java:349)
	org.apache.iceberg.hive.HiveCatalog.initialize(HiveCatalog.java:123)
	org.apache.iceberg.CatalogUtil.loadCatalog(CatalogUtil.java:277)
	org.apache.iceberg.CatalogUtil.buildIcebergCatalog(CatalogUtil.java:331)
	org.apache.iceberg.spark.SparkCatalog.buildIcebergCatalog(SparkCatalog.java:153)
	org.apache.iceberg.spark.SparkCatalog.initialize(SparkCatalog.java:752)
	org.apache.spark.sql.connector.catalog.Catalogs$.load(Catalogs.scala:65)
	org.apache.spark.sql.connector.catalog.CatalogManager.$anonfun$catalog$1(CatalogManager.scala:54)
	scala.collection.mutable.HashMap.getOrElseUpdate(

KeyboardInterrupt: 