In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Object-store connection settings. They are read from the environment so that
# credentials do not have to be committed with the notebook. The fallbacks are
# the literals this cell previously hardcoded, so behaviour is unchanged when
# the variables are not set.
S3_ENDPOINT = os.environ.get("S3_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.environ.get("S3_ACCESS_KEY", "minioadmin")
S3_SECRET_KEY = os.environ.get("S3_SECRET_KEY", "minioadmin")

# Build (or reuse) the Spark session for this job.
#   * spark.hadoop.fs.s3a.*      -> Hadoop S3A filesystem settings (endpoint,
#                                   static credentials, path-style addressing).
#   * spark.sql.catalog.iceberg  -> an Iceberg SparkCatalog named "iceberg",
#                                   backed by a Hive metastore, using S3FileIO
#                                   against the same endpoint.
spark = (
    SparkSession.builder
    .appName("fact_category_product")
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", S3_ENDPOINT)
    .getOrCreate()
)

# Only show Spark log messages at ERROR level so cell output stays readable.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/15 07:37:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
def _read_silver(table_name, columns):
    """Load one table from the ``iceberg.silver`` namespace.

    Args:
        table_name: Table name inside ``iceberg.silver`` (e.g. ``'dm_product'``).
        columns: Names of the columns to keep, in output order.

    Returns:
        A DataFrame over ``iceberg.silver.<table_name>`` restricted to ``columns``.
    """
    return (
        spark.read
        .format('iceberg')
        .load(f'iceberg.silver.{table_name}')
        .select(*columns)
    )


# Each source is narrowed to the columns it contributes, so the joined frame
# built from them only carries those fields.
df_product = _read_silver(
    'dm_product',
    ['product_id', 'category_id', 'name', 'created_date'],
)

df_category = _read_silver(
    'dm_category',
    ['category_id', 'category_name'],
)

df_product_price = _read_silver(
    'sub_product_price',
    ['product_id', 'price', 'original_price', 'discount', 'discount_rate'],
)

df_product_sale = _read_silver(
    'sub_product_sale',
    ['product_id', 'quantity_sold'],
)

df_product_inventory = _read_silver(
    'sub_product_inventory',
    ['product_id', 'stock_qty'],
)
        

In [4]:
# Attach the category name to every product. The join is inner, so products
# whose category_id has no match in df_category are dropped. The right-hand
# copy of the key is removed so 'category_id' appears only once.
products_with_category = df_product.join(
    df_category,
    on=df_product['category_id'] == df_category['category_id'],
    how='inner',
).drop(df_category['category_id'])

# The price, sale and inventory frames are all joined the same way: inner join
# on product_id, then drop the duplicate key coming from the right-hand side.
# Because every join is inner, a product must be present in all three frames
# to survive.
df = products_with_category
for df_detail in (df_product_price, df_product_sale, df_product_inventory):
    df = df.join(
        df_detail,
        on=df_product['product_id'] == df_detail['product_id'],
        how='inner',
    ).drop(df_detail['product_id'])

df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- created_date: date (nullable = true)
 |-- category_name: string (nullable = true)
 |-- price: long (nullable = true)
 |-- original_price: long (nullable = true)
 |-- discount: long (nullable = true)
 |-- discount_rate: float (nullable = true)
 |-- quantity_sold: long (nullable = true)
 |-- stock_qty: long (nullable = true)



In [5]:
# One output row per category.
group_column = ['category_id', 'category_name']

# Aggregates that are used both on their own and inside the per-product ratio.
product_count = F.count(F.col('product_id'))
units_sold = F.sum(F.col('quantity_sold'))

# NOTE: this rebinds `df` to the aggregated frame, which no longer has
# product_id / price / quantity_sold. Re-running this cell therefore requires
# re-running the join cell first.
df = df.groupBy(*group_column).agg(
    product_count.alias('total_product'),
    units_sold.alias('total_quantity_sold'),
    # Revenue is the sum over rows of price * quantity_sold.
    F.sum(F.col('price') * F.col('quantity_sold')).alias('revenue'),
    F.round(F.avg(F.col('price')), 2).alias('avg_price'),
    F.round(units_sold / product_count, 2).alias('avg_units_sold_per_product'),
)
df.printSchema()

root
 |-- category_id: string (nullable = true)
 |-- category_name: string (nullable = true)
 |-- total_product: long (nullable = false)
 |-- total_quantity_sold: long (nullable = true)
 |-- revenue: long (nullable = true)
 |-- avg_price: double (nullable = true)
 |-- avg_units_sold_per_product: double (nullable = true)



In [6]:
# Preview of the aggregated frame, currently disabled. The table saved below
# this cell comes from an earlier run with the call enabled, so it will not be
# regenerated by Restart & Run All unless the line is uncommented.
# df.show()

                                                                                

+-----------+--------------------+-------------+-------------------+------------+----------+--------------------------+
|category_id|       category_name|total_product|total_quantity_sold|     revenue| avg_price|avg_units_sold_per_product|
+-----------+--------------------+-------------+-------------------+------------+----------+--------------------------+
|       1686|      Giày - Dép nam|           40|               4342|   317689500|  233645.0|                    108.55|
|       1801|Máy Ảnh - Máy Qua...|           38|               9093|  3741985740| 530177.37|                    239.29|
|       8322|       Nhà Sách Tiki|           40|             277324| 29276532050| 107510.75|                    6933.1|
|       1789|Điện Thoại - Máy ...|           39|              60105|344029371000|6169179.49|                   1541.15|
|       1815|Thiết Bị Số - Phụ...|           40|              53660| 14204155736| 243749.98|                    1341.5|
|       1520|  Làm Đẹp - Sức Khỏe|      

In [7]:
# Stamp every row with the load time. "ngay_cap_nhat" is the column name
# stored in the table and must not be changed here.
df = df.withColumn("ngay_cap_nhat", F.current_timestamp())

# Persist the result as the gold-layer Iceberg table. 'overwrite' mode means
# each run replaces what the previous run wrote.
(
    df.write
    .format('iceberg')
    .mode('overwrite')
    .saveAsTable('iceberg.gold.fact_category_product')
)

In [8]:
# Shut down the Spark session now that the table has been written. Any cell
# run after this one needs the session-creation cell to be re-run first.
spark.stop()