# Import necessary modules

In [0]:
from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql.functions import col, monotonically_increasing_id, row_number

# Read Clean Data and Fetch Relevant Columns

In [0]:
# Source side of the dimension load: the product attribute columns read from
# the cleaned parquet data in the silver container. DISTINCT applies to the
# whole four-column row, not to Product_ID alone.
product_src_query = '''
SELECT
  DISTINCT Product_ID,
  Product_Name,
  Category,
  Sub_Category
FROM parquet.`abfss://silver@globalsalestorage.dfs.core.windows.net/clean_e_commerce_data`
'''
df_src = spark.sql(product_src_query)

# Initial and Incremental Load

In [0]:
# Build the "sink" side of the comparison: the rows currently held by the
# gold dimension table, or an empty frame with the same column names when the
# table has not been created yet.
dim_table_exists = spark.catalog.tableExists('ecom_catalog.gold.dim_product')

if not dim_table_exists:
    # Initial load: WHERE 1=0 returns no rows, so only the column layout
    # (including a placeholder dim_product_key) is taken from the silver data.
    sink_query = '''
    SELECT DISTINCT Product_ID, Product_Name, Category, Sub_Category, 1 AS dim_product_key
    FROM parquet.`abfss://silver@globalsalestorage.dfs.core.windows.net/clean_e_commerce_data`
    WHERE 1=0
    '''
else:
    # Incremental load: read the rows already present in the dimension.
    # DISTINCT(Product_ID) is plain DISTINCT over the full row; the parentheses
    # do not restrict it to Product_ID.
    sink_query = '''
    SELECT DISTINCT(Product_ID) AS Product_ID, Product_Name, Category, Sub_Category, dim_product_key
    FROM ecom_catalog.gold.dim_product
    '''

df_sink = spark.sql(sink_query)

# Join Source and Sink

In [0]:
# Left-join source to sink on Product_ID. Every source row is kept; rows that
# already exist in the dimension pick up their dim_product_key, while unseen
# products get a NULL key. Attribute values are always taken from the source.
same_product = df_src["Product_ID"] == df_sink["Product_ID"]
src_attributes = [
    df_src[name]
    for name in ("Product_ID", "Product_Name", "Category", "Sub_Category")
]

df_filtering = df_src.join(df_sink, on=same_product, how='left').select(
    df_sink["dim_product_key"],
    *src_attributes
)

dim_product_key,Product_ID,Product_Name,Category,Sub_Category
1,e5ae7a04,Headphones,Clothing,Headphones
2,146bc562,Laptop,Clothing,Laptop
3,b67fae0b,Sofa,Electronics,Sofa
4,b76ca15c,Shirt,Electronics,Shirt
5,a52b609e,Mobile,Furniture,Mobile
6,5d434938,Headphones,Electronics,Headphones
7,3ace982b,Mobile,Electronics,Mobile
8,ed933f8d,Sofa,Clothing,Sofa
9,4565302c,Shirt,Furniture,Shirt
10,27e87c5f,Laptop,Furniture,Laptop


# Filtering Old and New Rows

### Filtering Old Rows
 

In [0]:
# Rows whose key was found in the sink, i.e. products already in the dimension.
has_existing_key = col("dim_product_key").isNotNull()
df_filter_old = df_filtering.where(has_existing_key)


dim_product_key,Product_ID,Product_Name,Category,Sub_Category
1,e5ae7a04,Headphones,Clothing,Headphones
2,146bc562,Laptop,Clothing,Laptop
3,b67fae0b,Sofa,Electronics,Sofa
4,b76ca15c,Shirt,Electronics,Shirt
5,a52b609e,Mobile,Furniture,Mobile
6,5d434938,Headphones,Electronics,Headphones
7,3ace982b,Mobile,Electronics,Mobile
8,ed933f8d,Sofa,Clothing,Sofa
9,4565302c,Shirt,Furniture,Shirt
10,27e87c5f,Laptop,Furniture,Laptop


### Filtering New Rows

In [0]:
# Rows with no key in the sink, i.e. products not yet in the dimension.
# The (all-NULL) key column is dropped here; a real key is assigned later.
lacks_key = col("dim_product_key").isNull()
df_filter_new = df_filtering.where(lacks_key).select(
    "Product_ID",
    "Product_Name",
    "Category",
    "Sub_Category",
)

Product_ID,Product_Name,Category,Sub_Category


## Create Surrogate Key

In [0]:
# Determine the highest surrogate key already issued so that new keys continue
# after it. Starts from 0 when the dimension table does not exist yet.
if not spark.catalog.tableExists('ecom_catalog.gold.dim_product'):
    max_value = 0
else:
    max_value_df = spark.sql('''
    SELECT MAX(dim_product_key) AS max_value FROM ecom_catalog.gold.dim_product
    ''')
    current_max = max_value_df.collect()[0][0]
    # MAX() over an existing but empty table returns NULL (None in Python);
    # fall back to 0 so the key arithmetic downstream still works.
    max_value = 0 if current_max is None else current_max

In [0]:
# Assign surrogate keys to the new rows: max_value + 1, max_value + 2, ...
#
# row_number() over a total ordering replaces monotonically_increasing_id():
# the latter is documented as unique but NOT consecutive (the partition id is
# packed into the upper bits, so keys jump by billions between partitions) and
# as non-deterministic, which is risky for a DataFrame that later feeds a
# Delta MERGE. Ordering by all four attribute columns makes the numbering
# reproducible for a given set of new rows.
#
# NOTE: a window without partitionBy moves all rows to a single partition.
# That is acceptable here because only the not-yet-loaded dimension rows pass
# through it.
new_key_order = Window.orderBy("Product_ID", "Product_Name", "Category", "Sub_Category")

df_filter_new = df_filter_new.withColumn(
    "dim_product_key",
    # Cast to long so the key column keeps the bigint type that
    # monotonically_increasing_id() used to produce.
    (row_number().over(new_key_order) + max_value).cast("long"),
)

# Put the key first so the column order matches df_filter_old.
df_filter_new = df_filter_new.select(
    col("dim_product_key"),
    col("Product_ID"),
    col("Product_Name"),
    col("Category"),
    col("Sub_Category")
)

In [0]:
# Combine existing rows (keys preserved) with new rows (keys just assigned).
# unionByName matches columns by name rather than by position, so a change in
# the column order of either input cannot silently put values in the wrong
# column.
df_final = df_filter_old.unionByName(df_filter_new)



# SCD Type-1 (UPSERT)

In [0]:
# SCD Type-1 load of df_final into the gold product dimension.
# DeltaTable is imported in the notebook's first cell; the duplicate import
# that used to sit here has been dropped.
target_table = 'ecom_catalog.gold.dim_product'
target_path = "abfss://gold@globalsalestorage.dfs.core.windows.net/dim_product"

if spark.catalog.tableExists(target_table):
    # Open the target through the catalog name, the same identifier the
    # existence check uses, so the check and the merge cannot drift apart if
    # the table's storage location differs from the hard-coded path.
    delta_tbl = DeltaTable.forName(spark, target_table)

    # Upsert on the surrogate key: matched rows are overwritten with the
    # source values (Type-1, no history kept); unmatched rows are inserted.
    (
        delta_tbl.alias("t")
        .merge(
            df_final.alias("s"),
            "t.dim_product_key = s.dim_product_key"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

else:
    # Initial load: write the data to the gold path and register it in the
    # catalog under target_table.
    (
        df_final.write
        .format("delta")
        .mode("overwrite")
        .option("path", target_path)
        .saveAsTable(target_table)
    )