In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Connection settings for the S3-compatible object store. Each one can be
# overridden with an environment variable so that real credentials do not have
# to be committed with the notebook. The fallbacks are the values that were
# previously hardcoded here, so an unconfigured environment behaves as before.
S3_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
S3_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("sub_product_sale")
    # Resource limits for this job.
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    # Hadoop S3A filesystem: endpoint, static key pair, path-style addressing.
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Iceberg: SQL extensions plus a catalog named "iceberg" that is backed by
    # the Hive metastore and uses S3FileIO against the same endpoint as S3A.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", S3_ENDPOINT)
    .getOrCreate()
)

# Only log errors from here on, to keep the cell outputs readable.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/14 17:04:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the bronze product-details table through the Iceberg catalog.
df = spark.read.format('iceberg').load('iceberg.bronze.product_details')

In [4]:
# Keep only the four columns used below, exposing `id` as `product_id` and
# `quantity_sold_value` as `quantity_sold`.
df = df.select(
    F.col('id').alias('product_id'),
    F.col('quantity_sold_value').alias("quantity_sold"),
    F.col('day_ago_created'),
    F.col('ngay_cap_nhat'),
)
# Date-only copy of the update timestamp, used to pick the latest day.
df = df.withColumn("ngay_cap_nhat_date", F.to_date(F.col('ngay_cap_nhat')))
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- quantity_sold: string (nullable = true)
 |-- day_ago_created: string (nullable = true)
 |-- ngay_cap_nhat: timestamp (nullable = true)
 |-- ngay_cap_nhat_date: date (nullable = true)



In [5]:
# Find the most recent update date present in the data. This pulls a single
# value back to the driver, so it runs a Spark job.
max_date = (
    df.agg(F.max(F.col("ngay_cap_nhat_date")))
    .first()[0]
)

# Keep only the rows that belong to that most recent date.
df = df.filter(F.col('ngay_cap_nhat_date') == max_date)

                                                                                

In [6]:
df = (
    df
    # The day offset arrives as a string; make it an integer first.
    .withColumn("day_ago_created", F.col("day_ago_created").cast("int"))
    # created_date = update timestamp minus that many days.
    .withColumn("created_date", F.date_sub(F.col("ngay_cap_nhat"), F.col("day_ago_created")))
    # The source update timestamp becomes the snapshot timestamp.
    .withColumnRenamed('ngay_cap_nhat', 'snapshot_date')
    # Helper columns are no longer needed once created_date exists.
    .drop('ngay_cap_nhat_date', 'day_ago_created')
)

In [7]:
# Remove rows that are identical across every column.
df = df.distinct()

In [8]:
# quantity_sold is a string at this point; store it as a 64-bit integer.
df = df.withColumn(
    "quantity_sold",
    F.col("quantity_sold").cast("long"),
)

In [9]:
# Discard rows that have no product identifier.
has_product_id = F.col("product_id").isNotNull()
df = df.filter(has_product_id)

In [10]:
# Stamp every row with the current time under the name `ngay_cap_nhat`. The
# source column of that name was renamed to `snapshot_date` earlier, so this
# adds a new column rather than replacing one.
df = df.withColumn(
    "ngay_cap_nhat",
    F.current_timestamp(),
)

In [11]:
# Print the final column names and types as a check before writing.
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- quantity_sold: long (nullable = true)
 |-- snapshot_date: timestamp (nullable = true)
 |-- created_date: date (nullable = true)
 |-- ngay_cap_nhat: timestamp (nullable = false)



In [12]:
# Preview the result; show() prints the first 20 rows by default and truncates
# long values, which is why the timestamps below end in "...".
df.show()

                                                                                

+----------+-------------+--------------------+------------+--------------------+
|product_id|quantity_sold|       snapshot_date|created_date|       ngay_cap_nhat|
+----------+-------------+--------------------+------------+--------------------+
| 277544996|            4|2025-12-12 17:23:...|  2025-03-22|2025-12-14 17:04:...|
|  49291897|          216|2025-12-12 17:23:...|  2020-03-03|2025-12-14 17:04:...|
| 275959921|           63|2025-12-12 17:23:...|  2024-08-21|2025-12-14 17:04:...|
|  98162499|            3|2025-12-12 17:23:...|  2021-05-16|2025-12-14 17:04:...|
| 276388548|         3427|2025-12-12 17:23:...|  2024-10-16|2025-12-14 17:04:...|
| 272404951|           29|2025-12-12 17:23:...|  2023-10-03|2025-12-14 17:04:...|
| 277377913|            1|2025-12-12 17:23:...|  2025-02-20|2025-12-14 17:04:...|
| 124966188|          671|2025-12-12 17:23:...|  2021-09-21|2025-12-14 17:04:...|
| 275251193|           51|2025-12-12 17:23:...|  2024-06-12|2025-12-14 17:04:...|
| 204237269|    

In [13]:
# Persist the result as the silver-layer Iceberg table, in overwrite mode.
(
    df.write
    .format('iceberg')
    .mode('overwrite')
    .saveAsTable('iceberg.silver.sub_product_sale')
)

In [14]:
# Stop the Spark session created at the top of the notebook.
spark.stop()