In [None]:
spark.stop()

In [1]:
import os
print(os.environ.get("JAVA_HOME"))

/usr/lib/jvm/default-java


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder \
    .appName("test_notebook") \
    .config("spark.cores.max", "1") \
    .config("spark.executor.memory", "2g") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.iceberg.type", "hive") \
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083") \
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/") \
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000") \
    .getOrCreate()

spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/31 03:54:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
df = spark.read \
    .format('iceberg') \
    .load('iceberg.test.test_iceberg_v1_error')
df.printSchema()

root
 |-- run_id: string (nullable = true)
 |-- error_type: string (nullable = true)
 |-- expectation_type: string (nullable = true)
 |-- column_name: string (nullable = true)
 |-- run_time: timestamp_ntz (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [3]:
df = spark.read \
    .format('iceberg') \
    .load('iceberg.test.test_iceberg')
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [4]:
df.show()

                                                                                

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [5]:
df = spark.read.csv("s3a://linh-test/test.csv", header=True, inferSchema=True)
df.show()

+---+------------+---+-----------+------+
| id|        name|age|       city|salary|
+---+------------+---+-----------+------+
|  1|Nguyen Van A| 30|      Hanoi|   500|
|  2|  Tran Thi B| 25|Ho Chi Minh|   600|
|  3|    Le Van C| 28|    Da Nang|   550|
|  4|  Pham Thi D| 32|      Hanoi|   700|
|  5| Hoang Van E| 27|    Can Tho|   480|
+---+------------+---+-----------+------+



In [4]:
df = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:mariadb://mariadb:3306/database_raw")
    .option(
        "dbtable",
        """
        (
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'database_raw'
        ) t
        """
    ) \
    .option("user", "admin")
    .option("password", "admin")
    .option("driver", "org.mariadb.jdbc.Driver")
    .load()
)

df.printSchema()
df.show()

root
 |-- table_name: string (nullable = true)





+----------+
|table_name|
+----------+
+----------+



                                                                                

In [3]:
df = spark.readStream \
    .format('iceberg') \
    .load('iceberg.test.test_spark_kafka') 
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- user_id: long (nullable = true)
 |-- amount: double (nullable = true)
 |-- category: string (nullable = true)



In [7]:
df.writeStream \
  .format("console") \
  .outputMode("append") \
  .option(
      "checkpointLocation",
      "s3a://warehouse/checkpoints/debug_console_iceberg"
  ) \
  .start()

<pyspark.sql.streaming.query.StreamingQuery at 0x7f7c4fc36170>

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+--------------------+-------+------+--------+
|    id|          created_at|user_id|amount|category|
+------+--------------------+-------+------+--------+
| evt-0|2025-12-08 20:06:...|    710|855.85|purchase|
| evt-1|2025-12-08 20:06:...|    966|201.39|   login|
| evt-2|2025-12-08 20:06:...|    498|446.07|   click|
| evt-3|2025-12-08 20:06:...|    308|679.48|purchase|
| evt-4|2025-12-08 20:06:...|    921| 89.11|purchase|
| evt-5|2025-12-08 20:06:...|    153| 79.46|   click|
| evt-6|2025-12-08 20:06:...|    579|685.47|   login|
| evt-7|2025-12-08 20:06:...|    986|871.33|purchase|
| evt-8|2025-12-08 20:06:...|    103|523.12|purchase|
| evt-9|2025-12-08 20:06:...|    694|990.05|purchase|
|evt-10|2025-12-08 20:06:...|    697|377.69|   login|
|evt-11|2025-12-08 20:06:...|    387|980.67|purchase|
|evt-12|2025-12-08 20:06:...|    335|722.32|   login|
|evt-13|2025-12-08 20:06:...|    730| 3

In [5]:
df = spark.readStream \
    .format('iceberg') \
    .load('iceberg.test.test_spark_kafka') 
df.writeStream \
  .format("console") \
  .outputMode("append") \
  .option(
      "checkpointLocation",
      "s3a://warehouse/checkpoints/debug_console_iceberg"
  ) \
  .start()


<pyspark.sql.streaming.query.StreamingQuery at 0x7f80c3772c50>

In [3]:
# -- Chọn catalog 'iceberg', schema/database 'test', table 'test_spark_kafka'
sql = """CREATE TABLE IF NOT EXISTS iceberg.test.test_spark_kafka (
    id STRING,
    created_at TIMESTAMP,
    user_id BIGINT,
    amount DOUBLE,
    category STRING
)
USING iceberg
TBLPROPERTIES (
    'format-version'='2',        -- Iceberg format version 2
    'write.format.default'='parquet'  -- định dạng file, bạn có thể dùng parquet hoặc orc
);"""
spark.sql(sql)

DataFrame[]

In [5]:
df = spark.read \
    .format('csv') \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .load('s3a://linh-test/amazon-products.csv')
df.printSchema()

root
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- description: string (nullable = true)
 |-- initial_price: string (nullable = true)
 |-- final_price: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- reviews_count: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- buybox_seller: string (nullable = true)
 |-- number_of_sellers: string (nullable = true)
 |-- root_bs_rank: string (nullable = true)
 |-- answered_questions: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- images_count: string (nullable = true)
 |-- url: string (nullable = true)
 |-- video_count: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- item_weight: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- product_dimensions: stri

                                                                                

In [8]:
df.count()

1001

In [7]:
df.toPandas().head()

Unnamed: 0,timestamp,title,seller_name,brand,description,initial_price,final_price,currency,availability,reviews_count,...,root_bs_category,bs_category,bs_rank,badge,subcategory_rank,amazon_choice,images,product_details,prices_breakdown,country_of_origin
0,2023-08-08 00:00:00.000,Saucony Men's Kinvara 13 Running Shoe,Orv███tor███,Saucony,"When it comes to lightweight speed, nothing cr...",,"""""""57.79""""""",USD,In Stock,702,...,"""""name"""":""""12.5 White/Blck/Vizi""""}","{""""asin"""":""""B098PJCHNS""""","""""name"""":""""15 Vizigld/Vizired""""}","{""""asin"""":""""B098PFQNP2""""","""""name"""":""""10.5 Blue Raz/Black""""}","{""""asin"""":""""B098PJS8PN""""","""""name"""":""""12 Vizigld/Vizired""""}","{""""asin"""":""""B09WPN1WJZ""""","""""name"""":""""14 Composite""""}","{""""asin"""":""""B098PJS8PQ"""""
1,2023-08-09 00:00:00.000,Kishigo Premium Black Series Heavy Duty Unisex...,Ama███.co███,Kishigo,"""The Kishigo Premium Black Series Heavy Duty V...",with a clean and strong black trim. Made from...,this vest is designed for all day comfort and...,a right chest 2-tier pencil pocket,left chest heavy duty gusseted radio/utility ...,Kishigo has made it our mission to design and...,...,ATVPDKIKX0DER,"September 18, 2012",,1514 XL,Kishigo,Unisex,true,,false,Nice vest and comfort and great colors
2,2024-02-04 00:00:00.000,TWINSLUXES Solar Post Cap Lights Outdoor - Wat...,Twi███uxe███,TWINSLUXES,"""Solar Post Cap Lights Waterproof LED Fence Po...",vinyl posts or PVC posts The fence post lights...,deck and garden decoration practical and beau...,and during the day the solar panel atop of th...,charging the battery within so that energy ca...,and off at dawn. How to use solar post lights...,...,"""""Post Lights""""]""",B07V5LK5J3,Twinsluxes,1,38723,0,https://www.amazon.com/,1,https://www.amazon.com/Solar-Post-Cap-Lights-O...,0
3,2024-06-09 00:00:00.000,Accutire MS-4021B Digital Tire Pressure Gauge ...,Cit███ran███Dir██████,Accutire,About this item Heavy duty construction and ru...,1.795000000000000e+01,1.795000000000000e+01,USD,In Stock,8034,...,"""""Arrives before Father's Day""""]""","""[""""Heavy duty construction and rugged design ...","""""Angled head and rubber coated handle for eas...","""""Large",easy-to-read backlit LCD display; Easily swit...,bar,kPa,"kg/cm2""""","""""Equipped with an automatic shut off to conse...","""""Accurate to within 0.5 PSI"
4,2024-01-16 00:00:00.000,SAURA LIFE SCIENCE Adivasi Ayurvedic Neelgiri ...,PRA███ EN███PRI███,SAURA LIFE SCIENCE,This extraordinary fusion is designed to nouri...,"""""""1299""""""","""""""799""""""",INR,In stock,5,...,Amla,Aloevera,Tulsi,"Helps in controlling hair loss""""","""""No Side Effect.""""","""""Usage-Smoothens Hair",Repairs Hair,Hydrates Dry Hair,"Prevent Fizziness""""","""""Package Contains-250ML""""]"""


In [11]:
spark.sql('create namespace iceberg.gold')

DataFrame[]

In [12]:
df.write \
    .format('iceberg') \
    .saveAsTable('iceberg.bronze.amazon_products')

                                                                                

In [9]:
schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("order_date", DateType(), True),
    StructField("customer_id", StringType(), True),
    StructField("customer_name", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("category", StringType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("price", IntegerType(), True),
    StructField("total_amount", IntegerType(), True),
])

In [11]:
df = spark.read \
    .format('csv') \
    .option('header', 'true') \
    .option('inferSchema', 'false') \
    .schema(schema) \
    .load('./data/data_demo.csv')
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- total_amount: integer (nullable = true)



In [12]:
df.show()

+--------+----------+-----------+-------------+----------+--------------------+-----------+--------+--------+------------+
|order_id|order_date|customer_id|customer_name|product_id|        product_name|   category|quantity|   price|total_amount|
+--------+----------+-----------+-------------+----------+--------------------+-----------+--------+--------+------------+
|    1001|2025-01-05|       C001| Nguyen Van A|     P1001|       iPhone 15 Pro|Electronics|       1|28990000|    28990000|
|    1002|2025-01-05|       C002|   Tran Thi B|     P2001|      Macbook Air M2|Electronics|       1|26990000|    26990000|
|    1003|2025-01-06|       C003|     Le Van C|     P3001|         Samsung S24|Electronics|       2|19990000|    39980000|
|    1004|2025-01-06|       C004|   Pham Thi D|     P4001|       AirPods Pro 2|Accessories|       1| 5990000|     5990000|
|    1005|2025-01-07|       C005|  Hoang Van E|     P5001|Logitech MX Master 3|Accessories|       1| 2490000|     2490000|
|    1006|2025-0

In [13]:
df.write \
    .format('iceberg') \
    .saveAsTable('iceberg.test.data_demo')

                                                                                

In [14]:
df = spark.read \
    .format('iceberg') \
    .load('iceberg.test.data_demo')
df.show()

+--------+----------+-----------+-------------+----------+--------------------+-----------+--------+--------+------------+
|order_id|order_date|customer_id|customer_name|product_id|        product_name|   category|quantity|   price|total_amount|
+--------+----------+-----------+-------------+----------+--------------------+-----------+--------+--------+------------+
|    1001|2025-01-05|       C001| Nguyen Van A|     P1001|       iPhone 15 Pro|Electronics|       1|28990000|    28990000|
|    1002|2025-01-05|       C002|   Tran Thi B|     P2001|      Macbook Air M2|Electronics|       1|26990000|    26990000|
|    1003|2025-01-06|       C003|     Le Van C|     P3001|         Samsung S24|Electronics|       2|19990000|    39980000|
|    1004|2025-01-06|       C004|   Pham Thi D|     P4001|       AirPods Pro 2|Accessories|       1| 5990000|     5990000|
|    1005|2025-01-07|       C005|  Hoang Van E|     P5001|Logitech MX Master 3|Accessories|       1| 2490000|     2490000|
|    1006|2025-0

In [None]:
# spark = SparkSession.builder \
#     .appName("test_notebook") \
#     .config("spark.cores.max", "1") \
#     .config("spark.executor.memory", "2g") \
#     .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
#     .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \
#     .config("spark.sql.catalog.iceberg.type", "hive") \
#     .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083") \
#     .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \
#     .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/") \
#     .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000") \
#     .config("spark.spark.sql.catalog.iceberg.fs.s3a.endpoint", "http://minio:9000") \
#     .config("spark.sql.catalog.iceberg.fs.s3a.access.key", "minioadmin") \
#     .config("spark.sql.catalog.iceberg.fs.s3a.secret.key", "minioadmin") \
#     .config("spark.sql.catalog.iceberg.fs.s3a.path.style.access", "true") \
#     .getOrCreate()

# spark.sparkContext.setLogLevel('ERROR')

In [15]:
spark

In [4]:
# hadoop_conf = spark._jsc.hadoopConfiguration()
# hadoop_conf.set("fs.s3a.endpoint", "http://minio:9000")  # endpoint MinIO
# hadoop_conf.set("fs.s3a.access.key", "minioadmin")
# hadoop_conf.set("fs.s3a.secret.key", "minioadmin")
# hadoop_conf.set("fs.s3a.path.style.access", "true")      # bắt buộc với MinIO
# hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

In [5]:
import pyspark
print(pyspark.__version__)  # version Spark
print(spark.version)        # version Spark session

# Kiểm tra Hadoop version
hadoop_version = spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()
print("Hadoop version:", hadoop_version)

3.5.7
3.5.7
Hadoop version: 3.3.4


In [4]:
df = spark.read.csv("s3a://linh-test/test.csv", header=True, inferSchema=True)
df.show()

+---+------------+---+-----------+------+
| id|        name|age|       city|salary|
+---+------------+---+-----------+------+
|  1|Nguyen Van A| 30|      Hanoi|   500|
|  2|  Tran Thi B| 25|Ho Chi Minh|   600|
|  3|    Le Van C| 28|    Da Nang|   550|
|  4|  Pham Thi D| 32|      Hanoi|   700|
|  5| Hoang Van E| 27|    Can Tho|   480|
+---+------------+---+-----------+------+



In [16]:
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])
df = spark.createDataFrame(data, schema)
df.show()

                                                                                

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [None]:
spark.sql('create namespace iceberg.test')

In [None]:
for k, v in spark.sparkContext.getConf().getAll():
  print(k, v)

In [8]:
spark.sql('show namespaces in iceberg').show()

+---------+
|namespace|
+---------+
|  default|
|     test|
+---------+



In [10]:
spark.sql('show tables in iceberg.test').show()

+---------+---------------+-----------+
|namespace|      tableName|isTemporary|
+---------+---------------+-----------+
|     test|test_iceberg_v1|      false|
+---------+---------------+-----------+



In [15]:
spark.sql("""
CREATE TABLE IF NOT EXISTS iceberg.test.test_iceberg (
    name STRING,
    age INT
)
""")

DataFrame[]

In [17]:
df.write \
    .format('iceberg') \
    .mode('overwrite') \
    .saveAsTable('iceberg.test.test_iceberg')

                                                                                

In [None]:
spark.sql('show tables from iceberg.test').show()

In [5]:
df = spark.read \
    .format('iceberg') \
    .load('iceberg.test.test_iceberg')
df.show()

                                                                                

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [6]:
df = spark.read \
    .format('iceberg') \
    .load('iceberg.test.test_iceberg_v1')
df.show()

+-------+---+
|   name|age|
+-------+---+
|    Bob| 25|
|Charlie| 40|
|  Diana| 22|
+-------+---+



In [16]:
spark.stop()