In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable


In [0]:
%run /Workspace/FMCG_Project/01_setup/02_schema_utilities

In [0]:
# Notebook parameters. Each text widget is registered as (name, default value, label),
# with the label equal to the widget name.
for widget_name, widget_default in (("catalog", "fmcg"), ("data_source", "products")):
    dbutils.widgets.text(widget_name, widget_default, widget_name)

# Resolve the values the rest of the notebook works with.
catalog = dbutils.widgets.get("catalog")
data_source = dbutils.widgets.get("data_source")

# Echo the resolved parameters so the run output records what was processed.
print(f"{catalog} - {data_source}")

fmcg - products


In [0]:
# Glob pattern selecting every CSV file in the full-load folder of the chosen data source.
# NOTE(review): the catalog segment ("fmcg") is hard-coded in this path, whereas the
# table names written later are built from the `catalog` widget — confirm that reading
# from a fixed volume is intended when the widget is set to another catalog.
base_path = f"/Volumes/fmcg/bronze/souce_data/chaild_company/full_load/{data_source}/*.csv"
# Bare expression: shows the resolved path as the cell output.
base_path

'/Volumes/fmcg/bronze/souce_data/chaild_company/full_load/products/*.csv'

#### Bronze layer


In [0]:
# Bronze ingest: load the source CSVs (first row is the header, column types are
# inferred), stamp every row with the time of the read, and append the source file
# name and size taken from the `_metadata` column.
df_bronze = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(base_path)
    .withColumn("read_timestamp", F.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

In [0]:
# Persist the raw ingest as the bronze Delta table, replacing any previous load.
#
# Change Data Feed is a Delta *table property*; when it is passed through the
# DataFrameWriter the key has to carry the "delta." prefix. The bare
# "enableChangeDataFeed" key is not a Delta writer option, so a table created with
# it ends up without CDF enabled.
(
    df_bronze.write.format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")
)

#### Silver layer

In [0]:
# Silver starts from what was just persisted in bronze, read back from the table
# rather than reusing the in-memory `df_bronze`.
df_silver = spark.read.table(
    f"{catalog}.{bronze_schema}.{data_source}"
)

In [0]:
# Data-quality probe: list every product_id that appears on more than one row.
display(
    df_silver
    .groupBy("product_id")
    .count()
    .where(F.col("count") > 1)
)

product_id,count
25891101,2
25891102,2


In [0]:
# Collapse the data to one row per product_id and report the row count on either side.
# NOTE(review): no ordering is applied first, so which of the duplicate rows is kept
# is presumably arbitrary — confirm that any one of them is acceptable.
print(f"Records count before drop duplicates: {df_silver.count()}")
df_silver = df_silver.dropDuplicates(["product_id"])
print(f"Records count after drop duplicates: {df_silver.count()}")

Records count before drop duplicates: 20
Records count after drop duplicates: 18


In [0]:
# Title-case the category labels. initcap yields NULL for a NULL input, so missing
# categories stay NULL without a separate null branch.
df_silver = df_silver.withColumn("category", F.initcap(F.col("category")))

In [0]:
# Correct the recurring "protien" misspelling (matched case-insensitively) in both
# text columns, category first and then product_name.
for misspelled_col in ("category", "product_name"):
    df_silver = df_silver.withColumn(
        misspelled_col,
        F.regexp_replace(F.col(misspelled_col), "(?i)protien", "Protein"),
    )

In [0]:
# Derive `variant` from the text inside the first pair of parentheses in product_name
# (non-greedy match, capture group 1).
# NOTE(review): per the Spark docs regexp_extract returns an empty string, not NULL,
# when nothing matches — confirm that is the desired value for names without "(...)".
df_silver = df_silver.withColumn(
    "variant",
    F.regexp_extract(F.col("product_name"), r"\((.*?)\)", 1),
)

## Standardize the product information to be compatible with the parent product table

In [0]:
# Division assigned to each category. Both bar categories map to "Nutrition Bars";
# any category without a match (a NULL category included) falls through to "Other".
category_col = F.col("category")
division_expr = (
    F.when(category_col.isin("Energy Bars", "Protein Bars"), "Nutrition Bars")
    .when(category_col == "Granola & Cereals", "Breakfast Foods")
    .when(category_col == "Recovery Dairy", "Dairy & Recovery")
    .when(category_col == "Healthy Snacks", "Healthy Snacks")
    .when(category_col == "Electrolyte Mix", "Hydration & Electrolytes")
    .otherwise("Other")
)

# product_id as text: kept when it consists solely of digits, otherwise replaced by
# the placeholder 999999.
product_id_text = F.col("product_id").cast("string")
clean_product_id_expr = (
    F.when(product_id_text.rlike("^[0-9]+$"), product_id_text)
    .otherwise(F.lit(999999).cast("string"))
)

df_silver = (
    df_silver
    .withColumn("division", division_expr)
    # Surrogate key: SHA-256 hex digest of the product name.
    .withColumn("product_code", F.sha2(F.col("product_name").cast("string"), 256))
    .withColumn("product_id", clean_product_id_expr)
    .withColumnRenamed("product_name", "product")
    # Final silver column set and order.
    .select(
        "product_code", "division", "category", "product", "variant",
        "product_id", "read_timestamp", "file_name", "file_size",
    )
)


In [0]:
# Persist the cleaned products as the silver Delta table, replacing the previous
# contents; mergeSchema lets the write add columns the existing table lacks.
#
# Change Data Feed is a Delta *table property*; when it is passed through the
# DataFrameWriter the key has to carry the "delta." prefix. The bare
# "enableChangeDataFeed" key is not a Delta writer option, so a table created with
# it ends up without CDF enabled.
(
    df_silver.write.format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", True)
    .saveAsTable(f"{catalog}.{silver_schema}.{data_source}")
)

#### Gold Layer

In [0]:
# Gold input: the silver table narrowed to the product key and its descriptive
# attributes (ingest metadata and the source product_id are left behind).
df_gold = (
    spark.read.table(f"{catalog}.{silver_schema}.{data_source}")
    .select("product_code", "division", "category", "product", "variant")
)

In [0]:
# Persist the gold projection as the `sb_dim_<data_source>` table, which the merge
# step reads as its source; previous contents are replaced.
#
# Change Data Feed is a Delta *table property*; when it is passed through the
# DataFrameWriter the key has to carry the "delta." prefix. The bare
# "enableChangeDataFeed" key is not a Delta writer option, so a table created with
# it ends up without CDF enabled.
(
    df_gold.write.format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", True)
    .saveAsTable(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
)

### Merge operation

In [0]:
# Upsert the staged products (sb_dim_<data_source>) into the dimension table
# (dim_<data_source>), matching on product_code: matched rows are overwritten with
# the source values and unmatched source rows are inserted.
# NOTE(review): DeltaTable.forName looks the target up by name, so this presumably
# requires dim_<data_source> to exist already — confirm it is created elsewhere.
delta_trg_products = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_{data_source}")
df_src_products = spark.read.table(f"{catalog}.{gold_schema}.sb_dim_{data_source}")

(
    delta_trg_products.alias("trg")
    .merge(df_src_products.alias("src"), "trg.product_code == src.product_code")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]