In [1]:
import pandas as pd

# Build a three-row sample table (integer id, string name) and persist it as
# a Parquet file without the pandas index.
# NOTE(review): cell In [3] reads "s3a://raw/sample.parquet"; the step that
# places this local file into that bucket is not shown in this notebook --
# confirm how /app/sample.parquet gets there.
sample_rows = [
    (1, "Alice"),
    (2, "Bob"),
    (3, "Charlie"),
]
df = pd.DataFrame.from_records(sample_rows, columns=["id", "name"])

df.to_parquet("/app/sample.parquet", index=False)

In [2]:
import os
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by every cell below.
#
# The Iceberg catalog is registered under the name "hadoop_cat", but its type
# is "hive": it talks to the metastore at thrift://metastore:9083 and keeps
# its warehouse under the bucket named by the WAREHOUSE_BUCKET_NAME
# environment variable (a KeyError is raised if that variable is unset).
session_builder = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("IcebergExample")
)

# Settings that are plain literals, applied in this order.
literal_settings = {
    "spark.jars": "/home/spark/jars/*",
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.sql.catalog.hadoop_cat": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.hadoop_cat.type": "hive",
    "spark.sql.catalog.hadoop_cat.uri": "thrift://metastore:9083",
}
for setting_name, setting_value in literal_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# The warehouse location is the only setting derived from the environment.
warehouse_uri = f"s3a://{os.environ['WAREHOUSE_BUCKET_NAME']}/warehouse"
spark = (
    session_builder
    .config("spark.sql.catalog.hadoop_cat.warehouse", warehouse_uri)
    .getOrCreate()
)

In [3]:
# Read the sample Parquet file from the MinIO bucket "raw" (the bucket named
# in the s3a:// path below) into a Spark DataFrame.
# Note: this rebinds `df`, which held a pandas DataFrame after cell In [1];
# the later `df.writeTo(...)` cell relies on `df` being this Spark DataFrame.
df = spark.read.parquet("s3a://raw/sample.parquet")

# Print the rows as a quick check that the read worked.
df.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



In [None]:
# Create the "db" namespace in the hadoop_cat catalog; the IF NOT EXISTS
# clause lets this cell be re-run safely.
create_db_stmt = "CREATE DATABASE IF NOT EXISTS hadoop_cat.db"
spark.sql(create_db_stmt)

In [None]:
# List the databases visible through the hadoop_cat catalog.
catalog_databases = spark.sql("SHOW DATABASES IN hadoop_cat")
catalog_databases.show()

In [None]:
# Show the metadata Spark reports for hadoop_cat.db.
catalog_db_info = spark.sql("DESCRIBE DATABASE hadoop_cat.db")
catalog_db_info.show()

In [None]:
# Describe "db" WITHOUT a catalog prefix.
# NOTE(review): an unqualified name is resolved against the session's current
# catalog, not hadoop_cat -- presumably this checks whether `db` is also
# visible there. It may raise if that catalog has no `db`; confirm intent or
# qualify the name.
unqualified_db_info = spark.sql("DESCRIBE DATABASE db")
unqualified_db_info.show()

In [None]:
# Create (or replace, if it already exists) the table hadoop_cat.db.people
# from the Spark DataFrame `df`.
# NOTE(review): the data presumably lands under the warehouse location
# configured for hadoop_cat on the session -- confirm.
people_table_writer = df.writeTo("hadoop_cat.db.people")
people_table_writer.createOrReplace()

In [None]:
# Read the table back through SQL and print its rows.
people_rows = spark.sql("SELECT * FROM hadoop_cat.db.people")
people_rows.show()