# Configure Iceberg + Apache Spark + Hadoop Catalog

In [1]:
# Define the AWS env variables if you are using AWS Auth.
# NOTE: `key` and `secret` below are placeholders; substitute your own values
# and never commit real credentials (or this cell's echoed output) to version control.
%env AWS_REGION= us-east-2
%env AWS_ACCESS_KEY_ID= key
%env AWS_SECRET_ACCESS_KEY= secret

env: AWS_REGION=us-east-2
env: AWS_ACCESS_KEY_ID=key
env: AWS_SECRET_ACCESS_KEY=secret


In [2]:
import pyspark
from pyspark.sql import SparkSession
import os


# Maven coordinates Spark resolves at startup: hadoop-aws (s3a filesystem),
# the Iceberg runtime for Spark 3.3 / Scala 2.12, and the AWS SDK v2 jars.
spark_packages = ','.join([
    'org.apache.hadoop:hadoop-aws:3.3.4',
    'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.4.3',
    'software.amazon.awssdk:bundle:2.17.178',
    'software.amazon.awssdk:url-connection-client:2.17.178',
])

# Register an Iceberg catalog named `hdfs_catalog` (type `hadoop`) whose
# warehouse lives under the S3 path below. Every table in this notebook is
# addressed as `hdfs_catalog.<table>`.
conf = (
    pyspark.SparkConf()
        .setAppName('app_name')
        .set('spark.jars.packages', spark_packages)
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
        .set('spark.sql.catalog.hdfs_catalog', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.hdfs_catalog.type', 'hadoop')
        .set('spark.sql.catalog.hdfs_catalog.warehouse', 's3a://diplakehouse/test_iceberg_book/')
        .set('spark.sql.catalog.hdfs_catalog.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)

# Take the S3A credentials from the environment (set by the %env cell above)
# rather than hardcoding secrets in the notebook. If either variable is
# missing, leave the S3A keys unset so Hadoop can use its own credential
# provider chain (NOTE(review): confirm the chain available in your deployment).
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
if aws_access_key_id and aws_secret_access_key:
    conf.set('spark.hadoop.fs.s3a.access.key', aws_access_key_id)
    conf.set('spark.hadoop.fs.s3a.secret.key', aws_secret_access_key)

## Start Spark Session
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark Running")


:: loading settings :: url = jar:file:/home/docker/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/docker/.ivy2/cache
The jars for the packages stored in: /home/docker/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.3_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-aadf4fc2-2a29-4947-9c1b-f45b3d576f79;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.4.3 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-client;2.17.178 in central
	found software.amazon.awssdk#utils;2.17.178 in central
	found org.reactiv

25/01/09 00:15:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/09 00:15:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
Spark Running


# Create Table Customers using Spark SQL

In [3]:
# Create the Iceberg table `customers` in `hdfs_catalog`, partitioned by `state`.
# The statement has no IF NOT EXISTS clause, so re-running this cell once the
# table exists is expected to fail.
spark.sql("""
    CREATE TABLE hdfs_catalog.customers (
        customer_id INT,
        first_name STRING,
        last_name STRING,
        email STRING,
        charges FLOAT,
        state STRING)
    USING iceberg
    PARTITIONED BY (state)
""")

25/01/08 22:41:31 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


DataFrame[]

# Create Table Customers using Spark DataFrame API

In [22]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType 

from pyspark.sql.functions import col 

# Column layout of the new table as (name, Spark type) pairs; it mirrors the
# columns of the SQL-created `customers` table.
customer_fields = [
    ("customer_id", IntegerType()),
    ("first_name", StringType()),
    ("last_name", StringType()),
    ("email", StringType()),
    ("charges", FloatType()),
    ("state", StringType()),
]

# Every column is declared nullable (third StructField argument is True).
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in customer_fields]
)

# An empty DataFrame carries only the schema, so create() produces an empty
# table `customers_new` partitioned by `state`.
df = spark.createDataFrame([], schema)
df.writeTo("hdfs_catalog.customers_new").partitionedBy(col("state")).create()

                                                                                

# Create Table As (CTAS)

In [6]:
# CTAS: create `high_value_customers` from the rows of `customers` whose
# charges exceed 100, keeping five columns and partitioning by `state`.
spark.sql(""" 
    CREATE TABLE hdfs_catalog.high_value_customers 
    USING iceberg
    PARTITIONED BY (state)
    AS SELECT customer_id, first_name, last_name, state, charges
    FROM hdfs_catalog.customers 
    WHERE charges > 100
""") 

                                                                                

DataFrame[]

# Create Table As (CTAS) using DataFrame API

In [None]:
# DataFrame-API equivalent of the SQL CTAS for `high_value_customers`:
# same source table, same five columns, same partitioning, and the same
# `charges > 100` threshold as the SQL version.
df_ctas = spark.read.table("hdfs_catalog.customers")

# The chain is wrapped in parentheses instead of backslash continuations:
# a backslash followed by trailing whitespace is a SyntaxError in Python.
# NOTE: create() raises if the target table already exists, e.g. when the
# SQL CTAS cell was run first; drop the table before running this cell.
(
    df_ctas.filter(df_ctas.charges > 100)
    .select("customer_id", "first_name", "last_name", "state", "charges")
    .writeTo("hdfs_catalog.high_value_customers")
    .partitionedBy("state")
    .create()
)

# Drop a table without deleting its data files

In [None]:
# Drop `customers` if it exists (no PURGE clause).
# NOTE: later cells in this notebook still use hdfs_catalog.customers;
# skip this cell if you intend to run them.
spark.sql("DROP TABLE IF EXISTS hdfs_catalog.customers")

# Drop a table and delete its data files

In [None]:
# Drop `customers` if it exists, with the PURGE clause.
# NOTE: later cells in this notebook still use hdfs_catalog.customers;
# skip this cell if you intend to run them.
spark.sql("DROP TABLE IF EXISTS hdfs_catalog.customers PURGE") 

# Alter Table

## Add Column:

In [23]:
# Schema evolution: add a STRING column `phone_number` to `customers`.
spark.sql(""" 
    ALTER TABLE hdfs_catalog.customers
    ADD COLUMN phone_number STRING
""") 

DataFrame[]

## Rename Column:

In [None]:
# Schema evolution: rename column `charges` to `total_spent`.
# NOTE: other cells in this notebook (CTAS, UPDATE, overwrite) still refer to
# `charges`; they would need adjusting if this rename is actually applied.
spark.sql("""
    ALTER TABLE hdfs_catalog.customers
    RENAME COLUMN charges TO total_spent
""")

## Drop Column:

In [24]:
# Schema evolution: drop the `phone_number` column added in the
# "Add Column" cell.
spark.sql("""
    ALTER TABLE hdfs_catalog.customers
    DROP COLUMN phone_number
""")

DataFrame[]

## Add Partition Field:

In [None]:
# Partition evolution: add a 16-bucket transform on `customer_id` as an
# additional partition field of `customers`.
spark.sql("""
    ALTER TABLE hdfs_catalog.customers
    ADD PARTITION FIELD bucket(16, customer_id)
""")

## Create Branch:

In [31]:
# Create a branch named `dev_branch` on `customers` (no AS OF or retention clauses).
spark.sql("ALTER TABLE hdfs_catalog.customers CREATE BRANCH dev_branch")

DataFrame[]

## Create a Branch at snapshot version 1234, retaining the branch for 30 days and keeping at least the latest 3 snapshots plus any snapshots created in the past 2 days:

In [None]:
# Create `audit_branch` at snapshot version 1234 with explicit retention:
# RETAIN 30 DAYS applies to the branch itself; SNAPSHOT RETENTION sets
# 3 snapshots / 2 days for the snapshots on it.
# 1234 is an example value; use a snapshot ID that exists in your table.
spark.sql("""
    ALTER TABLE hdfs_catalog.customers CREATE BRANCH audit_branch 
    AS OF VERSION 1234 
    RETAIN 30 DAYS 
    WITH SNAPSHOT RETENTION 3 SNAPSHOTS 2 DAYS
""")

## Create Tags:

In [33]:
# Create a tag named `EOY_tag` on `customers` (no AS OF clause).
spark.sql("""
    ALTER TABLE hdfs_catalog.customers
    CREATE TAG EOY_tag
""")

DataFrame[]

## Create Tag at snapshot ID 1234, to retain a specific historical view of the table for analysis purposes:

In [None]:
# Create `historical_tag` pinned to snapshot version 1234.
# 1234 is an example value; use a snapshot ID that exists in your table.
spark.sql("""
    ALTER TABLE hdfs_catalog.customers CREATE TAG historical_tag 
    AS OF VERSION 1234
""")

## Drop a Branch and Tag:

In [None]:
# Remove the branch `dev_branch` from `customers`.
spark.sql("""
    ALTER TABLE hdfs_catalog.customers DROP BRANCH dev_branch 
""")

# Remove the tag `EOY_tag` from `customers`.
spark.sql("""
    ALTER TABLE hdfs_catalog.customers DROP TAG EOY_tag  
""")

# Insert Records

In [4]:
# Seed `customers` with five sample rows. Values are positional and follow the
# table's column order: customer_id, first_name, last_name, email, charges, state.
spark.sql("""
    INSERT INTO hdfs_catalog.customers VALUES
        (1, 'John', 'Doe', 'john.doe@fakemail.co', 123.45, 'CA'),
        (2, 'Jane', 'Smith', 'jane.smith@mockmail.org', 89.99, 'NY'),
        (3, 'Alice', 'Johnson', 'alice.j@samplemail.net', 150.75, 'TX'),
        (4, 'Bob', 'Brown', 'bob_brown@myemail.biz', 200.00, 'FL'),
        (5, 'Eve', 'Davis', 'eve.davis@demoemail.com', 75.50, 'WA')
""")

                                                                                

DataFrame[]

# Merge Into/Upserts

In [10]:
## Make sure to create the `updates` table first.
## It has the same six columns as `customers` but is not partitioned; it is
## the source side of the MERGE INTO below.
spark.sql("""
    CREATE TABLE hdfs_catalog.updates (
        customer_id INT,
        first_name STRING,
        last_name STRING,
        email STRING,
        charges FLOAT,
        state STRING
    )
    USING iceberg
""")

DataFrame[]

In [11]:
## Insert records into `updates`:
## customer_id 1 also exists in the `customers` seed data (with a different
## charges value); customer_ids 6 and 7 do not.
spark.sql("""
    INSERT INTO hdfs_catalog.updates VALUES
        (1, 'John', 'Doe', 'john.doe@fakemail.co', 130.00, 'CA'), 
        (6, 'Chris', 'Evans', 'chris.evans@hollywood.com', 300.00, 'CA'),
        (7, 'Natasha', 'Romanoff', 'natasha.r@spyworld.com', 180.50, 'NY')
""")

                                                                                

DataFrame[]

In [None]:
## Run the Upsert:
## rows are matched on customer_id; matched target rows take all column
## values from the source row, and unmatched source rows are inserted.
spark.sql("""
MERGE INTO hdfs_catalog.customers AS target 
USING hdfs_catalog.updates AS source 
ON target.customer_id = source.customer_id 
WHEN MATCHED THEN 
  UPDATE SET * 
WHEN NOT MATCHED THEN 
  INSERT * 
""")

# Insert Overwrite

In [None]:
# Static Overwrite Mode: overwrites only the "CA" partition.
#
# The PARTITION clause fixes `state`, so the SELECT must supply the remaining
# columns in table order: customer_id, first_name, last_name, email, charges.
# GROUP BY customer_id collapses the result to one row per customer, so every
# other column has to be aggregated; first() picks one value per group.
spark.sql("""
INSERT OVERWRITE hdfs_catalog.customers
PARTITION (state = 'CA')
SELECT
    customer_id,
    first(first_name) AS first_name,
    first(last_name)  AS last_name,
    first(email)      AS email,
    first(charges)    AS charges
FROM hdfs_catalog.customers
WHERE state = 'CA'
GROUP BY customer_id
""")

In [None]:
# Dynamic Overwrite Mode: overwrites only the partitions that receive rows
# from the query result (here: CA and NY).
#
# Spark's default partition overwrite mode is static, under which an
# INSERT OVERWRITE without a PARTITION clause replaces the whole table.
# Dynamic mode is therefore enabled for this statement only, and the previous
# setting is restored afterwards so other cells are unaffected.
overwrite_mode_key = "spark.sql.sources.partitionOverwriteMode"
previous_overwrite_mode = spark.conf.get(overwrite_mode_key, "static")
spark.conf.set(overwrite_mode_key, "dynamic")
try:
    # With no PARTITION clause the SELECT must produce all six table columns,
    # including the partition column `state`, in table order. GROUP BY
    # customer_id yields one row per customer; first() picks each group's value.
    spark.sql("""
    INSERT OVERWRITE hdfs_catalog.customers
    SELECT
        customer_id,
        first(first_name) AS first_name,
        first(last_name)  AS last_name,
        first(email)      AS email,
        first(charges)    AS charges,
        first(state)      AS state
    FROM hdfs_catalog.customers
    WHERE state IN ('CA', 'NY')
    GROUP BY customer_id
    """)
finally:
    spark.conf.set(overwrite_mode_key, previous_overwrite_mode)

# Deletes

## Row-Level Delete:

In [35]:
# Row-level delete: the predicate is on `customer_id`, which is not the
# table's partition column, so it targets individual rows.
spark.sql("""
DELETE FROM hdfs_catalog.customers 
WHERE customer_id = 1
""")

25/01/09 00:49:59 WARN S3InputStream: Unclosed input stream created by:
	org.apache.iceberg.aws.s3.S3InputStream.<init>(S3InputStream.java:74)
	org.apache.iceberg.aws.s3.S3InputFile.newStream(S3InputFile.java:85)
	org.apache.iceberg.avro.AvroIterable.newFileReader(AvroIterable.java:100)
	org.apache.iceberg.avro.AvroIterable.iterator(AvroIterable.java:76)
	org.apache.iceberg.io.CloseableIterable$7$1.<init>(CloseableIterable.java:188)
	org.apache.iceberg.io.CloseableIterable$7.iterator(CloseableIterable.java:187)
	org.apache.iceberg.io.CloseableIterable.lambda$filter$0(CloseableIterable.java:109)
	org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)
	org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)
	org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.java:72)
	org.apache.iceberg.io.CloseableIterable.lambda$filter$1(CloseableIterable.java:136)
	org.apache.iceberg.io.CloseableIterable$2.iterator(CloseableIterable.

                                                                                

DataFrame[]

## Partition-Level Delete:

In [None]:
# Partition-level delete: the predicate is on the partition column `state`,
# so it selects every row of the 'WA' partition.
spark.sql("""
DELETE FROM hdfs_catalog.customers 
WHERE state = 'WA'  
""")

# Updates

In [12]:
# Multiply `charges` by 1.1 for every row in state 'CA'.
# Not idempotent: each run of this cell applies the multiplier again.
spark.sql("""
UPDATE hdfs_catalog.customers 
SET charges = charges * 1.1 
WHERE state = 'CA'  
""")

                                                                                

DataFrame[]

# Read Query using Spark SQL

In [5]:
# Read all columns of `customers`; show() prints the first rows as a text table.
spark.sql("SELECT * FROM hdfs_catalog.customers;").show()

                                                                                

+-----------+----------+---------+--------------------+-------+-----+
|customer_id|first_name|last_name|               email|charges|state|
+-----------+----------+---------+--------------------+-------+-----+
|          1|      John|      Doe|john.doe@fakemail.co| 123.45|   CA|
|          5|       Eve|    Davis|eve.davis@demoema...|   75.5|   WA|
|          2|      Jane|    Smith|jane.smith@mockma...|  89.99|   NY|
|          3|     Alice|  Johnson|alice.j@samplemai...| 150.75|   TX|
|          4|       Bob|    Brown|bob_brown@myemail...|  200.0|   FL|
+-----------+----------+---------+--------------------+-------+-----+



In [7]:
# Read the table produced by the CTAS cell (rows with charges > 100).
spark.sql("SELECT * FROM hdfs_catalog.high_value_customers").show()

                                                                                

+-----------+----------+---------+-----+-------+
|customer_id|first_name|last_name|state|charges|
+-----------+----------+---------+-----+-------+
|          1|      John|      Doe|   CA| 123.45|
|          3|     Alice|  Johnson|   TX| 150.75|
|          4|       Bob|    Brown|   FL|  200.0|
+-----------+----------+---------+-----+-------+



In [36]:
# Same full-table read as above, run again to inspect `customers` after the
# merge, update and delete cells.
spark.sql("SELECT * FROM hdfs_catalog.customers;").show()

                                                                                

+-----------+----------+---------+--------------------+-------+-----+
|customer_id|first_name|last_name|               email|charges|state|
+-----------+----------+---------+--------------------+-------+-----+
|          6|     Chris|    Evans|chris.evans@holly...|  330.0|   CA|
|          5|       Eve|    Davis|eve.davis@demoema...|   75.5|   WA|
|          2|      Jane|    Smith|jane.smith@mockma...|  89.99|   NY|
|          3|     Alice|  Johnson|alice.j@samplemai...| 150.75|   TX|
|          4|       Bob|    Brown|bob_brown@myemail...|  200.0|   FL|
|          7|   Natasha| Romanoff|natasha.r@spyworl...|  180.5|   NY|
+-----------+----------+---------+--------------------+-------+-----+



# Read Query using PySpark DataFrame API

In [None]:
# DataFrame-API read of the CA customers.
# The catalog configured for this session is `hdfs_catalog`; the original
# `hadoop_catalog` name is not registered and would fail to resolve.
df = spark.table("hdfs_catalog.customers").filter("state = 'CA'")
df.show()

# Time Travel Reads

In [None]:
# Time-travel reads against the session's `hdfs_catalog` catalog (the original
# `hadoop_catalog` name is not registered). show() is called on each query so
# both results are actually displayed.
# The timestamp and snapshot ID are example values; replace them with ones
# that exist in your table's history.

# Table state as of a point in time.
spark.sql("SELECT * FROM hdfs_catalog.customers TIMESTAMP AS OF '2024-10-26 15:30:00'").show()

# Table state as of a specific snapshot ID.
spark.sql("SELECT * FROM hdfs_catalog.customers VERSION AS OF 12345678901234").show()

# Iceberg Procedures

## Expire Snapshot:

In [21]:
# Call the expire_snapshots procedure on `customers` with
# older_than = 2023-08-01 00:00:00 and retain_last = 5.
# The returned DataFrame reports counts of deleted files.
spark.sql("CALL hdfs_catalog.system.expire_snapshots(table => 'hdfs_catalog.customers', older_than => TIMESTAMP '2023-08-01 00:00:00', retain_last => 5 )")

                                                                                

DataFrame[deleted_data_files_count: bigint, deleted_position_delete_files_count: bigint, deleted_equality_delete_files_count: bigint, deleted_manifest_files_count: bigint, deleted_manifest_lists_count: bigint, deleted_statistics_files_count: bigint]

## Rollback to Snapshot:

In [None]:
# Roll `customers` back to the given snapshot.
# The snapshot_id is specific to the author's table; replace it with an ID
# from your own table's history.
spark.sql("CALL hdfs_catalog.system.rollback_to_snapshot(table => 'hdfs_catalog.customers', snapshot_id => 2711640443788239783 )")

## Remove Orphan Files:

In [None]:
# Dry Run
spark.sql("CALL hdfs_catalog.system.remove_orphan_files(table => 'hdfs_catalog.customers', dry_run => true)")

# Run Procedure
spark.sql("CALL hdfs_catalog.system.remove_orphan_files(table => 'hdfs_catalog.customers',location => 's3a://diplakehouse/iceberg_book/customers/data' )")

## Rewrite Data Files:

In [20]:
# Compact the data files of `customers` using the 'binpack' strategy.
# The returned DataFrame reports rewritten/added file counts and rewritten bytes.
spark.sql(" CALL hdfs_catalog.system.rewrite_data_files(table => 'hdfs_catalog.customers', strategy => 'binpack')")

DataFrame[rewritten_data_files_count: int, added_data_files_count: int, rewritten_bytes_count: bigint]

## Add Files:

In [None]:
# Register existing data files with `customers` without rewriting them.
# add_files takes `source_table`, not `path`; a directory of files is
# referenced as `<format>`.`<path>`.
# NOTE(review): `parquet` is assumed as the file format and the bucket is a
# placeholder; adjust both to your data. The s3a:// scheme is used because it
# is the only S3 filesystem this notebook configures credentials for.
spark.sql("CALL hdfs_catalog.system.add_files(table => 'hdfs_catalog.customers', source_table => '`parquet`.`s3a://my-bucket/new_data/`')")