In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
# Object-store connection settings.
# Credentials are read from the environment when present so that real secrets
# never need to be committed with the notebook. The fallbacks are the values
# this notebook previously hard-coded, so behaviour is unchanged when the
# variables are not set.
S3_ENDPOINT = "http://minio:9000"
S3_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin")
S3_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin")

# Build (or reuse) the Spark session for the dim_product load:
#   * resource limits: one core, 2g executor memory
#   * s3a filesystem pointed at the endpoint above with path-style access
#   * an Iceberg catalog named "iceberg" backed by the Hive metastore
spark = (
    SparkSession.builder
    .appName("dim_product")
    .config("spark.cores.max", "1")
    .config("spark.executor.memory", "2g")
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hive")
    .config("spark.sql.catalog.iceberg.uri", "thrift://hive-metastore:9083")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3a://warehouse/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", S3_ENDPOINT)
    .getOrCreate()
)

# Only surface errors in the notebook output; Spark's default level is noisier.
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/29 03:00:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Staging view over the cleaned silver product table.
# It exposes product_id, the nine descriptive attributes, and src_hash: a
# SHA-256 digest of those nine attributes. Each attribute is cast to string,
# NULLs are replaced with the '__NULL__' sentinel, and the values are joined
# with '||'. product_id itself is not part of the hash.
product_stg_view_sql = """
CREATE OR REPLACE TEMP VIEW product_stg AS
SELECT
    product_id,
    product_category_name,
    product_name,
    product_name_lenght,
    product_description_lenght,
    product_photos_qty,
    product_weight_g,
    product_height_cm,
    product_length_cm,
    product_width_cm,
    sha2(
        concat_ws(
            '||',
            coalesce(cast(product_category_name as string), '__NULL__'),
            coalesce(cast(product_name as string), '__NULL__'),
            coalesce(cast(product_name_lenght as string), '__NULL__'),
            coalesce(cast(product_description_lenght as string), '__NULL__'),
            coalesce(cast(product_photos_qty as string), '__NULL__'),
            coalesce(cast(product_weight_g as string), '__NULL__'),
            coalesce(cast(product_height_cm as string), '__NULL__'),
            coalesce(cast(product_length_cm as string), '__NULL__'),
            coalesce(cast(product_width_cm as string), '__NULL__')
        ),
        256
    ) AS src_hash
FROM iceberg.silver.product_clean
"""

spark.sql(product_stg_view_sql)


DataFrame[]

In [4]:
# Close out dimension rows whose attributes have changed.
# A target row is matched to a staged row on product_id, restricted to rows
# flagged is_current. The same attribute hash that product_stg carries as
# src_hash is recomputed over the target's columns; when the two differ, the
# target row gets effective_to = current_timestamp() and is_current = false.
# This statement inserts nothing.
expire_changed_sql = """
MERGE INTO iceberg.gold.dim_product t
USING product_stg s
ON t.product_id = s.product_id
AND t.is_current = true

WHEN MATCHED AND
sha2(
    concat_ws(
        '||',
        coalesce(cast(t.product_category_name as string), '__NULL__'),
        coalesce(cast(t.product_name as string), '__NULL__'),
        coalesce(cast(t.product_name_lenght as string), '__NULL__'),
        coalesce(cast(t.product_description_lenght as string), '__NULL__'),
        coalesce(cast(t.product_photos_qty as string), '__NULL__'),
        coalesce(cast(t.product_weight_g as string), '__NULL__'),
        coalesce(cast(t.product_height_cm as string), '__NULL__'),
        coalesce(cast(t.product_length_cm as string), '__NULL__'),
        coalesce(cast(t.product_width_cm as string), '__NULL__')
    ),
    256
) <> s.src_hash
THEN UPDATE SET
    t.effective_to = current_timestamp(),
    t.is_current = false
"""

spark.sql(expire_changed_sql)


                                                                                

DataFrame[]

In [5]:
# Insert a fresh current row for every staged product that has no current row
# in the dimension (brand-new products, and products whose previous version
# was just closed by the preceding MERGE).
#
# Surrogate keys: monotonically_increasing_id() is only unique within a single
# query execution and restarts from the same values on every run, so keys
# generated by a later run can collide with keys already stored in the table.
# Instead, read the current maximum key once and number the staged rows on top
# of it. Every new key is then strictly greater than any existing key. Gaps are
# expected, because staged rows that end up matched consume a number too.
max_product_sk = int(
    spark.sql(
        "SELECT coalesce(max(product_sk), 0) AS max_sk FROM iceberg.gold.dim_product"
    ).first()["max_sk"]
)

# NOTE(review): the un-partitioned window moves all staged rows through a
# single partition; acceptable for a dimension-sized input, revisit if the
# staging set grows large.
# NOTE(review): current_timestamp() here is evaluated separately from the
# statement that closes old versions, so effective_from of a new version can be
# slightly later than effective_to of the version it replaces.
spark.sql(f"""
MERGE INTO iceberg.gold.dim_product t
USING (
    SELECT
        stg.*,
        CAST({max_product_sk} AS BIGINT)
            + row_number() OVER (ORDER BY stg.product_id) AS new_product_sk
    FROM product_stg stg
) s
ON t.product_id = s.product_id
AND t.is_current = true

WHEN NOT MATCHED THEN
  INSERT (
    product_sk,
    product_id,
    product_category_name,
    product_name,
    product_name_lenght,
    product_description_lenght,
    product_photos_qty,
    product_weight_g,
    product_height_cm,
    product_length_cm,
    product_width_cm,
    effective_from,
    effective_to,
    is_current
  )
  VALUES (
    s.new_product_sk,
    s.product_id,
    s.product_category_name,
    s.product_name,
    s.product_name_lenght,
    s.product_description_lenght,
    s.product_photos_qty,
    s.product_weight_g,
    s.product_height_cm,
    s.product_length_cm,
    s.product_width_cm,
    current_timestamp(),
    TIMESTAMP '9999-12-31',
    true
  )
""")


                                                                                

DataFrame[]

In [6]:
# Shut down the Spark session now that the dimension load has finished.
spark.stop()