In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Object-store connection settings. They are read from the environment so that
# credentials do not have to live in the notebook; each fallback is the value
# this notebook previously hard-coded, so behaviour is unchanged when the
# variables are unset.
s3_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
s3_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("dm_product")
    # Resource limits for this job.
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    # Hadoop S3A filesystem: endpoint, static credentials, path-style URLs.
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Iceberg SQL extensions plus a catalog named `iceberg` of type `hive`.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    # Same endpoint as the S3A settings above, defined once to keep them in sync.
    .config("spark.sql.catalog.iceberg.s3.endpoint", s3_endpoint)
    .getOrCreate()
)

# Set the driver log level to ERROR to keep cell output readable.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/23 18:22:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the bronze product-details table through the `iceberg` catalog and
# print its schema.
df = (
    spark.read
    .format('iceberg')
    .load('iceberg.bronze.product_details')
)
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- sku: string (nullable = true)
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- book_cover: string (nullable = true)
 |-- short_description: string (nullable = true)
 |-- price: string (nullable = true)
 |-- original_price: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- discount_rate: string (nullable = true)
 |-- rating_average: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- favourite_count: string (nullable = true)
 |-- has_ebook: string (nullable = true)
 |-- inventory_status: string (nullable = true)
 |-- inventory_type: string (nullable = true)
 |-- productset_group_name: string (nullable = true)
 |-- is_fresh: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- is_flower: string (nullable = true)
 |-- has_buynow: string (nullable = true)
 |-- is_gift_card: string (nullable = true)
 |-- salable_type: str

In [4]:
# Columns carried forward from bronze, listed in output order.
# A tuple is (bronze name, new name); a bare string keeps its bronze name.
column_plan = [
    ('id', 'product_id'),
    ('categories_id', 'category_id'),
    'brand_id',
    ('current_seller_id', 'seller_id'),
    'sku',
    'name',
    'type',
    'short_description',
    'favourite_count',
    'review_count',
    'has_ebook',
    'is_fresh',
    'is_flower',
    'is_gift_card',
    'is_baby_milk',
    'is_acoholic_drink',
    'has_buynow',
    'price',
    'original_price',
    'discount',
    'discount_rate',
    'inventory_status',
    'inventory_type',
    ('stock_item_qty', 'stock_qty'),
    ('stock_item_min_sale_qty', 'min_sales_qty'),
    ('stock_item_max_sale_qty', 'max_sales_qty'),
    ('quantity_sold_value', 'quantity_sold'),
    'data_version',
    'day_ago_created',
    'ngay_cap_nhat',
]

# Turn the plan into column expressions, aliasing only the renamed entries.
projection = [
    F.col(entry[0]).alias(entry[1]) if isinstance(entry, tuple) else F.col(entry)
    for entry in column_plan
]

df = df.select(*projection)
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- brand_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- sku: string (nullable = true)
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- short_description: string (nullable = true)
 |-- favourite_count: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- has_ebook: string (nullable = true)
 |-- is_fresh: string (nullable = true)
 |-- is_flower: string (nullable = true)
 |-- is_gift_card: string (nullable = true)
 |-- is_baby_milk: string (nullable = true)
 |-- is_acoholic_drink: string (nullable = true)
 |-- has_buynow: string (nullable = true)
 |-- price: string (nullable = true)
 |-- original_price: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- discount_rate: string (nullable = true)
 |-- inventory_status: string (nullable = true)
 |-- inventory_type: string (nullable = true)
 |-- stock_qty: str

In [5]:
# `day_ago_created` arrives as a string; cast it to an integer so it can be
# used as a day offset.
days_since_creation = F.col("day_ago_created").cast("int")

# created_date = `ngay_cap_nhat` minus `day_ago_created` days, evaluated per row.
df = (
    df
    .withColumn("day_ago_created", days_since_creation)
    .withColumn(
        "created_date",
        F.date_sub(F.col("ngay_cap_nhat"), F.col("day_ago_created")),
    )
)

In [6]:
# Number each product's rows from the largest `ngay_cap_nhat` downwards.
# NOTE(review): rows that tie on `ngay_cap_nhat` get an arbitrary relative
# order, so which of them is kept is not fixed by this window.
latest_first = Window.partitionBy("product_id").orderBy(F.col("ngay_cap_nhat").desc())
row_rank = F.row_number().over(latest_first)

# Keep only the first-ranked row per product, then drop the helper column and
# the source `ngay_cap_nhat` column.
df = (
    df.withColumn("rn", row_rank)
    .filter(F.col("rn") == 1)
    .drop("rn", "ngay_cap_nhat")
)

In [7]:
# Convert the string columns to their target types.

# Numeric columns and the Spark SQL type each one is cast to.
numeric_casts = [
    ("favourite_count", "int"),
    ("review_count", "int"),
    ("stock_qty", "long"),
    ("min_sales_qty", "long"),
    ("max_sales_qty", "long"),
    ("price", "long"),
    ("original_price", "long"),
    ("discount", "long"),
    ("discount_rate", "float"),
    ("quantity_sold", "long"),
]

# Flag columns stored as the text "True" / "False".
flag_columns = [
    "has_ebook",
    "is_fresh",
    "is_flower",
    "has_buynow",
    "is_gift_card",
    "is_baby_milk",
    "is_acoholic_drink",
]

for column_name, spark_type in numeric_casts:
    df = df.withColumn(column_name, F.col(column_name).cast(spark_type))

for column_name in flag_columns:
    # A flag is True only when the text is "true" (case-insensitive, ignoring
    # surrounding spaces). The earlier rule "anything other than 'False' is True"
    # turned NULL, empty and unexpected values into True. eqNullSafe maps NULL
    # to False and keeps the resulting boolean column non-nullable.
    df = df.withColumn(
        column_name,
        F.lower(F.trim(F.col(column_name))).eqNullSafe("true"),
    )
    

In [8]:
# Set `ngay_cap_nhat` to the timestamp at which this job runs.
load_timestamp = F.current_timestamp()
df = df.withColumn("ngay_cap_nhat", load_timestamp)

In [9]:
# Print the schema after the casts so the final column types can be checked.
df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- brand_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- sku: string (nullable = true)
 |-- name: string (nullable = true)
 |-- type: string (nullable = true)
 |-- short_description: string (nullable = true)
 |-- favourite_count: integer (nullable = true)
 |-- review_count: integer (nullable = true)
 |-- has_ebook: boolean (nullable = false)
 |-- is_fresh: boolean (nullable = false)
 |-- is_flower: boolean (nullable = false)
 |-- is_gift_card: boolean (nullable = false)
 |-- is_baby_milk: boolean (nullable = false)
 |-- is_acoholic_drink: boolean (nullable = false)
 |-- has_buynow: boolean (nullable = false)
 |-- price: long (nullable = true)
 |-- original_price: long (nullable = true)
 |-- discount: long (nullable = true)
 |-- discount_rate: float (nullable = true)
 |-- inventory_status: string (nullable = true)
 |-- inventory_type: string (nullable = true)
 |-- stock

In [11]:
# Write the result to the `iceberg.silver.dm_product` table in overwrite mode.
df.write.saveAsTable(
    'iceberg.silver.dm_product',
    format='iceberg',
    mode='overwrite',
)

                                                                                

In [12]:
# Stop the Spark session; this is the last step of the notebook.
spark.stop()