In [0]:
# import section
from pyspark.sql import functions as f
from pyspark.sql import types as t
from delta.tables import DeltaTable
import logging
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise create one named 'fmcg'.
spark = (
    SparkSession.builder
    .appName('fmcg')
    .getOrCreate()
)

In [0]:
%run /Workspace/apache-spark/databricks-project-fmcg-sports/utils/utilities

In [0]:
# Ingest the raw products CSV from S3 into the bronze layer.
# NOTE(review): `s3_bucket`, `catalog` and `bronze_schema` are not defined in this
# notebook; presumably they come from the utilities notebook loaded via %run — confirm.

# Single source of truth for the bronze table name, so the DROP and the write
# below always target the same table (the DROP used to hard-code
# 'fmcg.bronze.sb_products' regardless of the configured catalog/schema).
bronze_products_table = f'{catalog}.{bronze_schema}.sb_products'

product_bronze = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(f'{s3_bucket}/products/products.csv')
    # Stamp every row with the time the file was read.
    .withColumn('readtime', f.current_timestamp())
)

# Drop any previous copy so the bronze table is recreated from the file as-is.
spark.sql(f'DROP TABLE IF EXISTS {bronze_products_table}')
product_bronze.write.mode('overwrite').saveAsTable(bronze_products_table)

In [0]:
# Profile the bronze frame: render Spark's summary statistics as a table.
bronze_profile = product_bronze.summary()
bronze_profile.display()

In [0]:
# Find product_ids that occur more than once in the bronze data.
duplicate_prod = (
    product_bronze
    .groupBy('product_id')
    .count()
    .filter(f.col('count') > 1)
    .select('product_id')
)

# Show the full rows behind those ids. Column.isin() only accepts literal values
# or Columns, not a DataFrame, so a left-semi join is used to keep the bronze
# rows whose product_id appears in `duplicate_prod`.
duplicate_rows = product_bronze.join(duplicate_prod, on='product_id', how='left_semi')
duplicate_rows.display()

# Remove the duplicated records.
# NOTE(review): dropDuplicates() without a subset only removes rows that are
# identical in every column; rows sharing a product_id but differing elsewhere
# are kept. Confirm that exact-row de-duplication is what is wanted here.
product_silver = product_bronze.dropDuplicates()
print(f'rows after de-duplication: {product_silver.count()}')

# Standardise the text columns and derive the variant and surrogate key.
# Every expression in withColumns() is evaluated against the incoming frame, so
# `varient` and `product_key` are computed from the product_name as it was
# before the spelling fix in this same call.
# NOTE(review): confirm that hashing the uncorrected name is intended.
product_silver = product_silver.withColumns({
    # Capitalise each word of the category, then fix the 'protien' misspelling
    # (any casing) to the correct spelling 'Protein'.
    'category': f.regexp_replace(f.initcap(f.col('category')), '(?i)protien', 'Protein'),
    # Apply the same spelling fix to the product name.
    'product_name': f.regexp_replace(f.col('product_name'), '(?i)protien', 'Protein'),
    # Variant is the text inside the first pair of parentheses in the product
    # name; regexp_extract yields an empty string when there is no match.
    'varient': f.regexp_extract(f.col("product_name"), r"\(([^)]+)\)", 1),
    # Surrogate key: SHA-256 hex digest of the product name.
    'product_key': f.sha2(f.col("product_name"), 256),
})

# Keep product_id only when it consists purely of digits; any other value
# falls through to the placeholder '9999999'.
is_numeric_id = f.col('product_id').cast(t.StringType()).rlike(r'^[0-9]+$')
normalised_product_id = f.when(is_numeric_id, f.col('product_id')).otherwise('9999999')

product_silver = (
    product_silver
    .withColumn('product_id', normalised_product_id)
    .withColumnRenamed('product_name', 'product')
)

product_silver.display()

In [0]:
# Project the frame down to the columns written to the silver table, in this order.
silver_columns = ['product_key', 'category', 'product', 'varient', 'readtime']
product_silver = product_silver.select(*silver_columns)

In [0]:
# Persist the curated products to the silver schema as a Delta table,
# overwriting any existing contents. The writer is configured with schema
# merging and the change-data-feed option before the table is saved.
silver_writer = (
    product_silver.write
    .format('delta')
    .option('mergeSchema', 'true')
    .option('delta.enableChangeDataFeed', 'true')
    .mode('overwrite')
)
silver_writer.saveAsTable(f'{catalog}.{silver_schema}.product')