In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Object-store connection settings. They are read from the environment when set
# so that credentials do not have to be committed with the notebook; the
# fallbacks are the values this notebook previously hard-coded, so behaviour is
# unchanged when the variables are absent.
S3_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
S3_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Build (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("fact")
    # Resource limits for this application.
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    # Hadoop S3A filesystem settings (endpoint, static credentials, path-style URLs).
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Iceberg SQL extensions plus a catalog named "iceberg" backed by the Hive metastore.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    # Same endpoint as the S3A filesystem above; kept in one constant so the two cannot drift.
    .config("spark.sql.catalog.iceberg.s3.endpoint", S3_ENDPOINT)
    .getOrCreate()
)

# Only emit log messages at ERROR level from here on.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/14 20:44:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the Iceberg table iceberg.silver.dm_product into a DataFrame.
df_product = (
    spark.read
    .format('iceberg')
    .load('iceberg.silver.dm_product')
)

In [5]:
# Read the Iceberg table iceberg.silver.sub_product_sale into a DataFrame.
df_product_sale = (
    spark.read
    .format('iceberg')
    .load('iceberg.silver.sub_product_sale')
)

In [6]:
# Read the Iceberg table iceberg.silver.sub_product_price into a DataFrame.
df_product_price = (
    spark.read
    .format('iceberg')
    .load('iceberg.silver.sub_product_price')
)

In [7]:
# Read the Iceberg table iceberg.silver.sub_product_inventory into a DataFrame.
df_product_inventory = (
    spark.read
    .format('iceberg')
    .load('iceberg.silver.sub_product_inventory')
)

In [8]:
# Read the Iceberg table iceberg.silver.sub_review into a DataFrame.
df_review = (
    spark.read
    .format('iceberg')
    .load('iceberg.silver.sub_review')
)

In [9]:
# Inner-join the product table with each of the four sub-tables on product_id;
# a product missing from any one of them is dropped from the result.
# NOTE(review): product_id is the only join key. If the sub_* tables can hold
# several rows per product, each join multiplies the row count - confirm that
# this is intended.
df = (
    df_product
    .join(df_product_sale, on='product_id', how='inner')
    .join(df_product_price, on='product_id', how='inner')
    .join(df_product_inventory, on='product_id', how='inner')
    .join(df_review, on='product_id', how='inner')
)

In [10]:
# Pick the columns of interest from the joined frame, referencing each one
# through the DataFrame it came from, and print the resulting schema.
# The projection is only inspected here; `df` itself is left unchanged.
selected_columns = df.select(
    df_product_sale['snapshot_date'],
    df_product['product_id'],
    df_product['category_id'],
    df_product_price['price'],
    df_product_price['discount_rate'],
    df_product_sale['quantity_sold'],
    df_review['rating'],
    df_product_inventory['stock_qty'],
    df_product['day_ago_created'],
)
selected_columns.printSchema()

root
 |-- snapshot_date: timestamp (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- price: long (nullable = true)
 |-- discount_rate: float (nullable = true)
 |-- quantity_sold: long (nullable = true)
 |-- rating: integer (nullable = true)
 |-- stock_qty: long (nullable = true)
 |-- day_ago_created: integer (nullable = true)



In [None]:
# Grouping keys for the aggregation started below.
# TODO(review): this cell is unfinished - the `.agg(` call is never closed and
# lists no aggregate expressions, so the cell cannot run as written. Its result
# is also not assigned to any variable.
group_column = ['snapshot_date', 'product_id', 'category_id']
df.groupBy(*group_column) \
    .agg(

In [None]:
# Stamp every row with the current timestamp in the "ngay_cap_nhat" column
# (Vietnamese for "update date").
df = df.withColumn("ngay_cap_nhat", F.current_timestamp())

# Persist the frame as an Iceberg table, replacing whatever the table held before.
# NOTE(review): this writes the full joined `df` (not the column selection or the
# aggregation from the earlier cells) into iceberg.gold.dim_seller, although the
# application is named "fact" and all inputs are product tables - confirm the
# target table and the frame being written before running with mode('overwrite').
gold_writer = df.write.format('iceberg').mode('overwrite')
gold_writer.saveAsTable('iceberg.gold.dim_seller')

                                                                                

In [4]:
# Stop the SparkSession; cells that use `spark` cannot be run after this without recreating it.
spark.stop()