In [1]:
# Cell 1 – Install Python deps (inside the container)
# Only needed if your image doesn't have these already.
# In the jupyter/pyspark-notebook image, pyspark is present;
# findspark is nice to have but optional.

!pip install -q pyspark findspark


In [2]:
# Cell 2 – Start SparkSession configured for Hudi
from pyspark.sql import SparkSession

HUDI_VERSION = "0.15.0"
SPARK_MAJOR = "3.5"   # Change to match your Spark (3.5, 3.4, 3.3, ...)

hudi_bundle = f"org.apache.hudi:hudi-spark{SPARK_MAJOR}-bundle_2.12:{HUDI_VERSION}"

spark = (
    SparkSession.builder
    .appName("Apache Hudi Chapter 4 Demo")
    .config("spark.jars.packages", hudi_bundle)
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config(
        "spark.sql.extensions",
        "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
    )
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
    )
    .getOrCreate()
)

spark


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 41326)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

In [3]:
# Cell 3 – Simple example DataFrame (Spark + Hudi, COW)
from pyspark.sql import Row

data = [
    Row(customer_id=1, first_name="John", last_name="Doe",
        email="john.doe@example.com", charges=150.75, state="CA"),
    Row(customer_id=2, first_name="Jane", last_name="Smith",
        email="jane.smith@example.com", charges=950.00, state="NY"),
    Row(customer_id=3, first_name="Tom", last_name="Lee",
        email="tom.lee@example.com", charges=1200.00, state="CA"),
]

df = spark.createDataFrame(data)
df.show()


+-----------+----------+---------+--------------------+-------+-----+
|customer_id|first_name|last_name|               email|charges|state|
+-----------+----------+---------+--------------------+-------+-----+
|          1|      John|      Doe|john.doe@example.com| 150.75|   CA|
|          2|      Jane|    Smith|jane.smith@exampl...|  950.0|   NY|
|          3|       Tom|      Lee| tom.lee@example.com| 1200.0|   CA|
+-----------+----------+---------+--------------------+-------+-----+



In [4]:
# Cell 4 – Write a basic Copy‑On‑Write Hudi table
base_path = "/tmp/hudi_customers_cow"
table_name = "hudi_customers_cow"

(
    df.write
    .format("hudi")
    .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE")
    .option("hoodie.table.name", table_name)
    .option("hoodie.datasource.write.recordkey.field", "customer_id")
    .option("hoodie.datasource.write.partitionpath.field", "state")
    .option("hoodie.datasource.write.precombine.field", "charges")
    .option("hoodie.datasource.write.hive_style_partitioning", "true")
    .mode("overwrite")
    .save(base_path)
)



In [5]:
# Cell 5 – Register table & basic snapshot query
spark.sql(f"DROP TABLE IF EXISTS customers_hudi_demo")

spark.sql(f"""
CREATE TABLE customers_hudi_demo (
  customer_id INT,
  first_name  STRING,
  last_name   STRING,
  email       STRING,
  charges     FLOAT,
  state       STRING
)
USING hudi
LOCATION '{base_path}'
PARTITIONED BY (state)
""")

spark.sql("SELECT * FROM customers_hudi_demo").show()



+-------------------+--------------------+------------------+----------------------+--------------------+-----------+----------+---------+--------------------+-------+-----+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|customer_id|first_name|last_name|               email|charges|state|
+-------------------+--------------------+------------------+----------------------+--------------------+-----------+----------+---------+--------------------+-------+-----+
|  20251209052148125|20251209052148125...|                 3|              state=CA|1d9d92ae-0c92-481...|          3|       Tom|      Lee| tom.lee@example.com| 1200.0|   CA|
|  20251209052148125|20251209052148125...|                 1|              state=CA|1d9d92ae-0c92-481...|          1|      John|      Doe|john.doe@example.com| 150.75|   CA|
|  20251209052148125|20251209052148125...|                 2|              state=NY|274f850c-ff58-411...|          2|      Jane|  

In [6]:
# Cell 6 – DDL examples (CREATE, CTAS, ALTER, DROP)
# CREATE TABLE (already done above, so we just re-show for completeness)
spark.sql("""
CREATE TABLE IF NOT EXISTS customers_hudi_demo (
  customer_id INT,
  first_name STRING,
  last_name  STRING,
  email      STRING,
  charges    FLOAT,
  state      STRING
)
USING hudi
PARTITIONED BY (state)
""")

spark.sql("""
DROP TABLE IF EXISTS high_value_customers PURGE
""")

# CTAS example – high_value_customers
spark.sql("""
CREATE TABLE IF NOT EXISTS high_value_customers
USING hudi
PARTITIONED BY (state)
AS
SELECT
  customer_id, first_name, last_name, state, charges
FROM customers_hudi_demo
WHERE charges > 1000
""")

# ALTER TABLE – add / rename / drop column
spark.sql("""
ALTER TABLE customers_hudi_demo
ADD COLUMN phone_number STRING
""")

# This is not supported in Hudi
# spark.sql("""
# ALTER TABLE customers_hudi_demo
# DROP COLUMN phone_number
# """)
# Hudi/Spark SQL does NOT support ALTER TABLE ... RENAME COLUMN directly. Use CTAS workaround:
spark.sql("""
CREATE TABLE customers_hudi_demo_v2
USING hudi
PARTITIONED BY (state)
AS
SELECT customer_id, first_name, last_name, email, charges AS total_spent, phone_number, state
FROM customers_hudi_demo
""")

# Optionally drop old table and rename new one to maintain continuity
spark.sql("DROP TABLE customers_hudi_demo")
spark.sql("ALTER TABLE customers_hudi_demo_v2 RENAME TO customers_hudi_demo")


# DROP TABLE (catalog only)
spark.sql("""
DROP TABLE IF EXISTS high_value_customers PURGE
""")

# DROP TABLE and delete data – the PURGE syntax in the chapter
spark.sql("""
DROP TABLE IF EXISTS customers_hudi_demo PURGE
""")



DataFrame[]

In [7]:
# Cell 7 – DML: INSERT, MERGE, INSERT OVERWRITE, DELETE, UPDATE
# Recreate customers table for DML demo
spark.sql(f"""
CREATE TABLE customers_hudi_demo (
  customer_id INT,
  first_name  STRING,
  last_name   STRING,
  email       STRING,
  charges     FLOAT,
  state       STRING
)
USING hudi
PARTITIONED BY (state)
""")

spark.sql("""
INSERT INTO customers_hudi_demo
VALUES (1, 'John', 'Doe', 'john.doe@example.com', 150.75, 'CA')
""")

spark.sql("""
INSERT INTO customers_hudi_demo
VALUES (2, 'Jane', 'Smith', 'jane.smith@example.com', 250.00, 'NY')
""")

spark.sql("SELECT * FROM customers_hudi_demo").show()

# MERGE INTO (upsert)
spark.sql("""
CREATE OR REPLACE TEMP VIEW updates AS
SELECT 1 AS customer_id, 'John' AS first_name, 'Doe' AS last_name,
       'john.new@example.com' AS email, 200.00 AS charges, 'CA' AS state
UNION ALL
SELECT 3, 'Alice', 'Brown', 'alice@example.com', 300.00, 'TX'
""")

spark.sql("""
MERGE INTO customers_hudi_demo AS target
USING updates AS source
ON target.customer_id = source.customer_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *
""")

spark.sql("SELECT * FROM customers_hudi_demo").show()

# INSERT OVERWRITE (static & dynamic)
# Static overwrite of a single partition (CA)
spark.sql("""
CREATE OR REPLACE TEMP VIEW staging_updates AS
SELECT 1 AS customer_id, 'John', 'Doe', 'john.ca@example.com', 220.0, 'CA'
""")

spark.sql("""
INSERT OVERWRITE customers_hudi_demo
PARTITION (state = 'CA')
SELECT customer_id, first_name, last_name, email, charges, state
FROM staging_updates
WHERE state = 'CA'
""")

# Dynamic overwrite for multiple partitions
spark.sql("""
INSERT OVERWRITE customers_hudi_demo
SELECT customer_id, first_name, last_name, email, charges, state
FROM staging_updates
WHERE state IN ('CA')
GROUP BY customer_id, first_name, last_name, email, charges, state
""")

# DELETE
# Row-level delete
spark.sql("""
DELETE FROM customers_hudi_demo
WHERE customer_id = 1
""")

# Partition-level delete (metadata-only if partitioned by state)
spark.sql("""
DELETE FROM customers_hudi_demo
WHERE state = 'CA'
""")

# UPDATE
spark.sql("""
UPDATE customers_hudi_demo
SET charges = charges * 1.1
WHERE state = 'NY'
""")



+-------------------+--------------------+--------------------+----------------------+--------------------+-----------+----------+---------+--------------------+-------+-----+
|_hoodie_commit_time|_hoodie_commit_seqno|  _hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|customer_id|first_name|last_name|               email|charges|state|
+-------------------+--------------------+--------------------+----------------------+--------------------+-----------+----------+---------+--------------------+-------+-----+
|  20251209052153180|20251209052153180...|20251209052153180...|              state=NY|74d82a41-9353-440...|          2|      Jane|    Smith|jane.smith@exampl...|  250.0|   NY|
|  20251209052152483|20251209052152483...|20251209052152483...|              state=CA|93064a8a-def8-4f4...|          1|      John|      Doe|john.doe@example.com| 150.75|   CA|
+-------------------+--------------------+--------------------+----------------------+--------------------+-----------+-

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `state` cannot be resolved. Did you mean one of the following? [`CA`, `Doe`, `John`, `220.0`, `customer_id`].; line 6 pos 6;
'InsertIntoStatement Relation spark_catalog.default.customers_hudi_demo[customer_id#738,first_name#739,last_name#740,email#741,charges#742,state#743] parquet, [state=Some(CA)], true, false, false
+- 'Project ['customer_id, 'first_name, 'last_name, 'email, 'charges, 'state]
   +- 'Filter ('state = CA)
      +- SubqueryAlias staging_updates
         +- View (`staging_updates`, [customer_id#808,John#809,Doe#810,john.ca@example.com#811,220.0#812,CA#813])
            +- Project [cast(customer_id#807 as int) AS customer_id#808, cast(John#814 as string) AS John#809, cast(Doe#815 as string) AS Doe#810, cast(john.ca@example.com#816 as string) AS john.ca@example.com#811, cast(220.0#817 as decimal(4,1)) AS 220.0#812, cast(CA#818 as string) AS CA#813]
               +- Project [1 AS customer_id#807, John AS John#814, Doe AS Doe#815, john.ca@example.com AS john.ca@example.com#816, 220.0 AS 220.0#817, CA AS CA#818]
                  +- OneRowRelation


In [None]:
# Cell 8 – Read queries: snapshot, time travel, CDC, incremental
# Snapshot queries with metadata + record index
spark.sql("SET hoodie.enable.data.skipping=true")
spark.sql("SET hoodie.metadata.column.stats.enable=true")
spark.sql("SET hoodie.metadata.enable=true")

spark.sql("""
SELECT *
FROM customers_hudi_demo
WHERE charges > 1.0 AND charges < 1000.0
""").show()

spark.sql("SET hoodie.metadata.record.index.enable=true")
spark.sql("""
SELECT *
FROM customers_hudi_demo
WHERE customer_id = 2
""").show()

# Time travel query (syntax template)
time_travel_sql = """
SELECT *
FROM customers_hudi_demo
TIMESTAMP AS OF '2025-01-01 00:00:00.000'
WHERE charges > 100.0
"""
print(time_travel_sql)
# spark.sql(time_travel_sql).show()  # Uncomment when you have a valid timestamp

# CDC / incremental queries – use the hudi_table_changes TVF
cdc_template = """
SELECT *
FROM hudi_table_changes(
  'customers_hudi_demo',
  'cdc',
  'earliest',  -- or starting commit time
  NULL         -- optional end time
)
"""

latest_state_template = """
SELECT *
FROM hudi_table_changes(
  'customers_hudi_demo',
  'latest_state',
  'earliest',  -- or starting commit time
  NULL         -- optional end time
)
"""
print("-- CDC template:\n", cdc_template)
print("\n-- Incremental latest_state template:\n", latest_state_template)



In [None]:
# Cell 9 – Common Hudi config “cheat sheet” from the chapter
hudi_config_cheatsheet = {
    # Schema evolution
    "hoodie.write.set.null.for.missing.columns": "true",
    "hoodie.schema.on.read.enable": "true",

    # CDC logging mode
    "hoodie.table.cdc.supplemental.logging.mode": "op_key,op_old,op_new",

    # Compaction (MoR)
    "hoodie.compact.inline.max.delta.commits": "10",
    "hoodie.datasource.compaction.async.enable": "true",
    "hoodie.compact.inline": "true",

    # Cleaning / retention
    "hoodie.clean.automatic": "true",
    "hoodie.cleaner.commits.retained": "10",
    "hoodie.clean.async": "true",

    # File sizing
    "hoodie.parquet.small.file.limit": "104857600",   # 100 MB
    "hoodie.parquet.max.file.size": "125829120",      # 120 MB
    "hoodie.copyonwrite.record.size.estimate": "1024",
    "hoodie.merge.small.file.group.candidates.limit": "5",
    "hoodie.logfile.max.size": "1073741824",          # 1 GB

    # Clustering
    "hoodie.clustering.plan.strategy.small.file.limit": "134217728",
    "hoodie.clustering.plan.strategy.target.file.max.bytes": "134217728",

    # Misc retention
    "hoodie.keep.max.commits": "20",
    "hoodie.cleaner.fileversions.retained": "20",
}

hudi_config_cheatsheet


In [None]:
# Cell 10 – Stop Spark when done
spark.stop()



# Section 4: Flink + Hudi code snippets from the chapter

The following commands are intended for a shell or Flink SQL, not Python:

## 4.1 Environment setup
```bash
export FLINK_VERSION=1.17
export HUDI_VERSION=0.15.0
export HADOOP_HOME=/path/to/hadoop
export HADOOP_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
$FLINK_HOME/bin/start-cluster.sh
wget \
  "https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink${FLINK_VERSION}-bundle/${HUDI_VERSION}/hudi-flink${FLINK_VERSION}-bundle-${HUDI_VERSION}.jar" \
  -P "$FLINK_HOME/lib/"
$FLINK_HOME/bin/sql-client.sh embedded \
  -j "lib/hudi-flink${FLINK_VERSION}-bundle-${HUDI_VERSION}.jar" \
  shell
```

## 4.2 Flink SQL Examples
```sql
CREATE CATALOG hudi_catalog
WITH (
  'type' = 'hudi',
  'catalog.path' = 'file:///tmp/hudi_catalog',
  'hive.conf.dir' = '/path/to/hive/conf',
  'mode' = 'hms'
);
USE CATALOG hudi_catalog;
CREATE DATABASE db;
USE db;
CREATE TABLE product_daily_price (
  id   BIGINT PRIMARY KEY NOT ENFORCED,
  name STRING,
  price DOUBLE,
  ts   BIGINT,
  dt   STRING
)
PARTITIONED BY (dt)
WITH (
  'connector' = 'hudi',
  'path' = 'file:///tmp/hudi_table',
  'table.type' = 'MERGE_ON_READ',
  'precombine.field' = 'ts',
  'hoodie.cleaner.fileversions.retained' = '20',
  'hoodie.keep.max.commits' = '20',
  'hoodie.datasource.write.hive_style_partitioning' = 'true'
);
INSERT INTO product_daily_price
SELECT 1, 'Lakehouse Book', 50, 1732256367, '2024-11-21';
INSERT INTO product_daily_price + OPTIONS('write.operation' = 'upsert')
SELECT 1, 'Lakehouse Book', 60, 1732256367, '2024-11-21';
UPDATE product_daily_price
SET price = price * 2, ts = 1732258867
WHERE id = 1;
DELETE FROM product_daily_price
WHERE price < 50;
INSERT INTO product_daily_price + OPTIONS('hoodie.keep.max.commits' = '10')
SELECT 2, 'Another Book', 40, 1732256367, '2024-11-21';
```

