In [0]:
# import section
from pyspark.sql import functions as f
from pyspark.sql import types as t
from delta.tables import DeltaTable
import logging
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise create it under
# the application name 'fmcg'.
spark = (
    SparkSession.builder
    .appName('fmcg')
    .getOrCreate()
)

In [0]:
%run /Workspace/apache-spark/databricks-project-fmcg-sports/utils/utilities

In [0]:
# Read the raw product CSV from S3 and stamp every row with the load time.
# NOTE(review): `s3_bucket`, `catalog` and `bronze_schema` are not defined in
# this notebook; presumably they come from the utilities notebook pulled in
# via %run — confirm.
product_bronze = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')  # column types are inferred from the file contents
    .load(f'{s3_bucket}/products/products.csv')
    .withColumn('readtime', f.current_timestamp())
)

# Build the fully qualified bronze table name once so that the DROP and the
# write always target the same table. The DROP previously hard-coded
# `fmcg.bronze`, which diverges from the write as soon as `catalog` or
# `bronze_schema` point anywhere else.
bronze_products_table = f'{catalog}.{bronze_schema}.sb_products'

# Remove any previous copy, then persist the raw data unchanged in the bronze layer.
spark.sql(f'DROP TABLE IF EXISTS {bronze_products_table}')
product_bronze.write.mode('overwrite').saveAsTable(bronze_products_table)

In [0]:
# Render the summary statistics of the raw bronze data as a table.
bronze_summary = product_bronze.summary()
bronze_summary.display()

In [0]:
# Find the product_id values that occur more than once in the bronze data.
duplicate_prod = (
    product_bronze.groupBy("product_id")
    .count()
    .filter(f.col('count') > 1)
    .select("product_id")
)

# Show the rows behind those ids. `Column.isin` expects literal values, not a
# DataFrame, so the lookup is expressed as a left-semi join: it keeps the
# bronze rows whose product_id appears in `duplicate_prod` without adding columns.
product_bronze.join(duplicate_prod, on="product_id", how="left_semi").display()

# Drop duplicate rows. Without a column subset, dropDuplicates() only removes
# rows that are identical in every column.
# NOTE(review): rows sharing a product_id but differing in another column are
# kept — confirm that this is the intended de-duplication rule.
product_silver = product_bronze.dropDuplicates()

# Print the row count; mid-cell, a bare expression would run the job and
# silently discard the result.
print(f"product rows after de-duplication: {product_silver.count()}")

# Normalise the descriptive text columns:
#   * category     -> title case (initcap), with the 'protien' typo rewritten
#   * product_name -> same typo rewrite, case-insensitive
#   * variant      -> text found inside the first pair of parentheses of product_name
#   * product_key  -> SHA-256 hash of product_name
# NOTE(review): the expressions are built against the incoming columns, so
# `variant` and `product_key` appear to derive from the uncorrected
# `product_name` — confirm that is intended.
protein_typo_pattern = '(?i)protien'
protein_typo_fix = 'Protine'

cleaned_text_columns = {
    'category': f.regexp_replace(
        f.initcap(f.col('category')), protein_typo_pattern, protein_typo_fix
    ),
    'product_name': f.regexp_replace(
        f.col('product_name'), protein_typo_pattern, protein_typo_fix
    ),
    'variant': f.regexp_extract(f.col("product_name"), r"\(([^)]+)\)", 1),
    'product_key': f.sha2(f.col("product_name"), 256),
}

product_silver = product_silver.withColumns(cleaned_text_columns)

# Keep product_id only when it consists purely of digits; any other value is
# replaced by the placeholder '9999999'. Afterwards `product_name` is exposed
# under the shorter name `product`.
has_numeric_id = f.col('product_id').cast(t.StringType()).rlike(r'^[0-9]+$')
sanitised_product_id = f.when(has_numeric_id, f.col('product_id')).otherwise('9999999')

product_silver = (
    product_silver
    .withColumn('product_id', sanitised_product_id)
    .withColumnRenamed('product_name', 'product')
)

# Derive the Division from the cleaned category. Categories that are not
# listed here get a null Division because the when-chain has no otherwise().
division_by_category = {
    "Energy Bars": "Nutrition Bars",
    "Protine Bars": "Nutrition Bars",
    "Granola & Cereals": "Breakfast Foods",
    "Recovery Dairy": "Dairy & Recovery",
    "Healthy Snacks": "Healthy Snacks",
    "Electrolyte Mix": "Hydration & Electrolytes",
}

# Fold the mapping into a single chained when(...).when(...) expression,
# in the same order as the mapping is written.
division_expr = None
for category_name, division_name in division_by_category.items():
    is_category = f.col("category") == category_name
    if division_expr is None:
        division_expr = f.when(is_category, division_name)
    else:
        division_expr = division_expr.when(is_category, division_name)

product_silver = product_silver.withColumn("Division", division_expr)

# Preview a handful of transformed rows.
product_silver.limit(4).display()


In [0]:
# Keep only the columns that belong in the silver product table.
silver_columns = [
    "product_id",
    "product_key",
    'Division',
    'category',
    'product',
    'variant',
    "readtime",
]
product_silver = product_silver.select(*silver_columns)

# Overwrite the silver Delta table with the curated product data.
silver_writer = (
    product_silver.write
    .format('delta')
    .option('mergeSchema', 'true')
    .option('delta.enableChangeDataFeed', 'true')
    .mode('overwrite')
)
silver_writer.saveAsTable(f'{catalog}.{silver_schema}.product')

In [0]:
# Upsert the silver product data into the gold `dim_products` dimension.

# Target: the existing gold Delta table.
delta_table = DeltaTable.forName(spark, f'{catalog}.{gold_schema}.dim_products')

# Source: the silver product table, addressed through the same
# catalog/schema variables that were used to write it. The table name was
# previously hard-coded as `fmcg.silver.product`, which reads from a different
# table whenever `catalog` or `silver_schema` point elsewhere.
silver_product = spark.table(f'{catalog}.{silver_schema}.product').select(
    "product_key", "Division", "category", "product", "variant"
)

# Target column -> source expression. The update and insert branches write
# exactly the same columns, so a single mapping serves both.
dim_product_columns = {
    "product_code": "s.product_key",
    "division": "s.Division",
    "category": "s.category",
    "product": "s.product",
    "variant": "s.variant",
}

# Rows whose product_code matches a silver product_key are updated in place;
# silver rows without a match are inserted.
(
    delta_table.alias('d')
    .merge(silver_product.alias('s'), "d.product_code = s.product_key")
    .whenMatchedUpdate(set=dim_product_columns)
    .whenNotMatchedInsert(values=dim_product_columns)
    .execute()
)

In [0]:
%sql
-- Inspect the contents of the gold product dimension table.
SELECT * FROM fmcg.gold.dim_products;