# Import necessary modules

In [0]:
from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql.functions import col, monotonically_increasing_id, row_number

# Read Clean Data and Fetch Relevant Columns

In [0]:
# Source frame: the order-level columns of the cleaned silver data set.
# DISTINCT is a row-level modifier, so it de-duplicates on the full
# (Order_ID, Order_Date, Ship_Date) combination. The previous
# `DISTINCT(Order_ID) AS Order_ID` spelling returned the same rows but read
# as if it de-duplicated on Order_ID alone.
# NOTE(review): an Order_ID that occurs with more than one date pair will
# still appear more than once here - confirm the silver data is unique per
# Order_ID, because the later join and MERGE treat it as the business key.
df_src = spark.sql('''
SELECT DISTINCT Order_ID, Order_Date, Ship_Date
FROM parquet.`abfss://silver@globalsalestorage.dfs.core.windows.net/clean_e_commerce_data`
''')

# Initial and Incremental

In [0]:
# Sink frame: the rows currently stored in the gold order dimension.
# DISTINCT applies to the whole selected row; the earlier
# `DISTINCT(Order_ID) AS Order_ID` form produced the same result but suggested
# a per-column de-duplication that SQL does not perform.
if spark.catalog.tableExists('ecom_catalog.gold.dim_order'):
    # Incremental run: read the existing dimension including its surrogate keys.
    df_sink = spark.sql('''
    SELECT DISTINCT Order_ID,
           Order_Date,
           Ship_Date,
           dim_order_key
    FROM ecom_catalog.gold.dim_order
    ''')
else:
    # Initial run: build an empty frame with the dimension's column layout.
    # `WHERE 1=0` filters out every row, so only the schema survives.
    # The placeholder key is typed BIGINT so that it matches the 64-bit keys
    # generated later in the notebook instead of relying on implicit widening
    # when the frames are unioned.
    df_sink = spark.sql('''
    SELECT DISTINCT Order_ID,
           Order_Date,
           Ship_Date,
           CAST(1 AS BIGINT) AS dim_order_key
    FROM parquet.`abfss://silver@globalsalestorage.dfs.core.windows.net/clean_e_commerce_data`
    WHERE 1=0
    ''')

# Join Source and Sink

In [0]:
# Left-join every source order to the current dimension on Order_ID.
# Orders already present in the sink pick up their dim_order_key; orders that
# are not present get a NULL key, which the following cells use to split
# existing rows from new rows. Descriptive columns are taken from the source.
order_id_matches = df_src["Order_ID"] == df_sink["Order_ID"]
df_filtering = (
    df_src
    .join(df_sink, on=order_id_matches, how='left')
    .select(
        df_sink["dim_order_key"],
        df_src["Order_ID"],
        df_src["Order_Date"],
        df_src["Ship_Date"],
    )
)

dim_order_key,Order_ID,Order_Date,Ship_Date
1,17164fb9,2023-01-03,2023-01-13
2,3d878a9e,2024-04-13,2024-04-17
3,4656c3db,2023-04-05,2023-04-05
4,49b0e110,2023-07-19,2023-07-28
5,4d8ea3fb,2024-11-03,2024-11-07
6,544d77ad,2024-06-27,2024-07-05
7,661cabad,2025-07-03,2025-07-03
8,7b840add,2024-05-11,2024-05-23
9,b3d7e510,2024-09-23,2024-10-03
10,c9ec05fa,2025-03-13,2025-03-13


# Filtering Old and New Rows

### Filtering Old Rows
 

In [0]:
# Rows that matched the sink in the left join: they already carry a surrogate key.
has_existing_key = col("dim_order_key").isNotNull()
df_filter_old = df_filtering.where(has_existing_key)

dim_order_key,Order_ID,Order_Date,Ship_Date
1,17164fb9,2023-01-03,2023-01-13
2,3d878a9e,2024-04-13,2024-04-17
3,4656c3db,2023-04-05,2023-04-05
4,49b0e110,2023-07-19,2023-07-28
5,4d8ea3fb,2024-11-03,2024-11-07
6,544d77ad,2024-06-27,2024-07-05
7,661cabad,2025-07-03,2025-07-03
8,7b840add,2024-05-11,2024-05-23
9,b3d7e510,2024-09-23,2024-10-03
10,c9ec05fa,2025-03-13,2025-03-13


### Filtering New Rows

In [0]:
# Rows with no match in the sink: the join left their key NULL.
# The empty key column is dropped from the projection here; a fresh key is
# assigned in the surrogate-key cell.
lacks_key = col("dim_order_key").isNull()
df_filter_new = (
    df_filtering
    .where(lacks_key)
    .select(
        df_filtering["Order_ID"],
        df_filtering["Order_Date"],
        df_filtering["Ship_Date"],
    )
)

Order_ID,Order_Date,Ship_Date


## Create Surrogate Key

In [0]:
# Highest surrogate key currently in use; new keys are numbered after it.
# No table yet -> start from 0.
# Table exists -> read MAX(dim_order_key). MAX over an empty table yields NULL,
# which would arrive here as None and break the key arithmetic in the next
# cell, so COALESCE maps that case to 0 as well.
if not spark.catalog.tableExists('ecom_catalog.gold.dim_order'):
    max_value = 0
else:
    max_value_df = spark.sql('''
    SELECT COALESCE(MAX(dim_order_key), 0) AS max_value FROM ecom_catalog.gold.dim_order
    ''')
    # The aggregate always returns exactly one row with one column.
    max_value = max_value_df.collect()[0][0]

In [0]:
# Assign surrogate keys to the new orders, continuing after max_value.
# row_number() over an ordered window yields consecutive values 1, 2, 3, ...
# in a repeatable order, so the keys stay dense and the frame returns the same
# keys every time it is evaluated (the MERGE source may be scanned more than
# once). monotonically_increasing_id() guarantees neither: its values are
# unique but not consecutive and depend on how the data is partitioned.
# The un-partitioned window processes all new rows in a single partition.
# NOTE(review): acceptable as long as each run adds a modest number of new
# orders - confirm for very large initial loads.
new_key_window = Window.orderBy("Order_ID")
df_filter_new = df_filter_new.withColumn(
    "dim_order_key",
    # Cast to long so the column keeps the 64-bit type the previous
    # implementation produced.
    (row_number().over(new_key_window) + max_value).cast("long"),
)
# Put the key first so the column order matches df_filter_old for the union.
df_filter_new = df_filter_new.select(col("dim_order_key"), col("Order_ID"), col("Order_Date"), col("Ship_Date"))

In [0]:
# Stack the existing rows (keys kept) on top of the newly keyed rows.
# union() pairs columns by position, not by name, so both frames must share
# the order dim_order_key, Order_ID, Order_Date, Ship_Date.
df_final = df_filter_old.union(df_filter_new)



# SCD Type-1 (UPSERT)

In [0]:
# Persist df_final into the gold order dimension.
#   * Table not registered yet -> initial load: write df_final as a Delta
#     table at the gold path and register it in the catalog.
#   * Table already registered -> MERGE on the surrogate key: rows whose key
#     is already in the target are overwritten with the incoming values, rows
#     with an unseen key are inserted.
if not spark.catalog.tableExists('ecom_catalog.gold.dim_order'):
    initial_writer = (
        df_final.write
        .format("delta")
        .mode("overwrite")
        .option("path", "abfss://gold@globalsalestorage.dfs.core.windows.net/dim_order")
    )
    initial_writer.saveAsTable("ecom_catalog.gold.dim_order")
else:
    # Open the existing Delta table by its storage location.
    delta_tbl = DeltaTable.forPath(
        spark,
        "abfss://gold@globalsalestorage.dfs.core.windows.net/dim_order",
    )

    # "t" is the target table, "s" the incoming frame.
    upsert = delta_tbl.alias("t").merge(
        df_final.alias("s"),
        "t.dim_order_key = s.dim_order_key",
    )
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
