In [0]:
from pyspark.sql.window import *
from pyspark.sql.functions import *
from delta.tables import DeltaTable 
from pyspark.sql.utils import AnalysisException

In [0]:
# Option dictionary using the sf* key names. The three connection secrets are
# fetched from the "my_scope" secret scope through dbutils so that no
# credential is hard-coded; the remaining entries are plain literals.
# NOTE(review): "my_schemma" looks like a misspelling of "my_schema" -- confirm
# against the real schema name before relying on this value.
snowflake_options = {
    **{
        secret_name: dbutils.secrets.get(scope="my_scope", key=secret_name)
        for secret_name in ("sfURL", "sfUser", "sfPassword")
    },
    "sfDatabase": "sales_report_db",
    "sfSchema": "my_schemma",
    "sfWarehouse": "etl_wh",
    "sfRole": "data_engineer_role",
}


In [0]:
# Read the six source tables from the workspace.silver_crm_erp schema.
# The names on the left are unpacked in the same order as the table names in
# the tuple below, so each DataFrame variable maps to the table of the same
# suffix (df_cust_info <- cust_info, and so on).
(
    df_cust_info,
    df_cust_az12,
    df_loc_a101,
    df_px_cat_g1v2,
    df_prd_info,
    df_sales_details,
) = [
    spark.read.table(f"workspace.silver_crm_erp.{silver_table}")
    for silver_table in (
        "cust_info",
        "cust_az12",
        "loc_a101",
        "px_cat_g1v2",
        "prd_info",
        "sales_details",
    )
]

##### **CREATE DIM_CUSTOMERS**

In [0]:
table_name = "workspace.gold_crm_erp.dim_customers"

# Join cust_info (ci) to cust_az12 (ca) and loc_a101 (la) on cst_key = cid.
# Left joins keep every cust_info row even when the other tables have no match.
# Parentheses replace the backslash continuations (the original ended the
# chain with a dangling "\" followed by a whitespace-only line).
joined_df = (
    df_cust_info.alias("ci")
    .join(df_cust_az12.alias("ca"), col("ci.cst_key") == col("ca.cid"), "left")
    .join(df_loc_a101.alias("la"), col("ci.cst_key") == col("la.cid"), "left")
)

# Keep exactly one row per cst_id. Ordering a window by its own partition key
# leaves the surviving row arbitrary, so order by creation date (newest first)
# to make "latest" mean what the variable name says.
# NOTE(review): assumes cst_create_date is the right recency column -- confirm.
window_distinct = Window.partitionBy("ci.cst_id").orderBy(
    col("ci.cst_create_date").desc_nulls_last()
)
df_with_rownum = joined_df.withColumn("row_num", row_number().over(window_distinct))
latest_raw_df = df_with_rownum.filter(col("row_num") == 1)

# Surrogate key: a running number over all customers ordered by cst_id.
# NOTE(review): a window without partitionBy pulls every row into a single
# partition, and the numbering is recomputed on each run, so customer_sk values
# can shift when new customers appear -- confirm this is acceptable for the
# fact table that references them.
window_spec = Window.orderBy("ci.cst_id")

# Project the gold-layer column names.
dim_customers_df = latest_raw_df.select(
    row_number().over(window_spec).alias("customer_sk"),
    col("ci.cst_id").alias("customer_id"),
    col("ci.cst_key").alias("customer_number"),
    col("ci.cst_firstname").alias("first_name"),
    col("ci.cst_lastname").alias("last_name"),
    col("la.cntry").alias("country"),
    col("ci.cst_marital_status").alias("marital_status"),
    # Prefer the cust_info gender unless it is 'Unknown' (or NULL, which makes
    # the condition NULL); then fall back to cust_az12.gen, else 'Unknown'.
    when(col("ci.cst_gndr") != 'Unknown', col("ci.cst_gndr"))
        .otherwise(coalesce(col("ca.gen"), lit('Unknown'))).alias("gender"),
    col("ca.bdate").alias("birth_date"),
    col("ci.cst_create_date").alias("created_date")
)

# Upsert into the gold table. The existence check is explicit: wrapping the
# whole merge in `except AnalysisException` would treat ANY analysis error
# (schema drift, bad merge condition, ...) as "table missing" and silently
# overwrite the table. With this check a failing merge surfaces as an error.
if spark.catalog.tableExists(table_name):
    print("Table exists, performing merge...")
    delta_table = DeltaTable.forName(spark, table_name)
    # Merge (upsert) keyed on the business key customer_id.
    (
        delta_table.alias("target")
        .merge(
            dim_customers_df.alias("source"),
            "target.customer_id = source.customer_id",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    print("Merge completed.")
else:
    print("Table does not exist, creating new Delta table...")
    (
        dim_customers_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )
    print("Table created.")



##### **DIM_PRODUCTS**

In [0]:
# Define window spec for row_number ordering
window_spec = Window.orderBy(col("prd_start_dt"), col("prd_key"))

table_name="workspace.gold_crm_erp.dim_products"
# Join, filter, select, and add row_number as surrogate product_key
df_product_dim = (
    df_prd_info.alias("pn")
    .join(df_px_cat_g1v2.alias("pc"), col("pn.cat_id") == col("pc.id"), how="left")
    
    .select(
        row_number().over(window_spec).alias("product_sk"),
        col("pn.prd_id").alias("product_id"),
        col("pn.prd_key").alias("product_number"),
        col("pn.prd_nm").alias("product_name"),
        col("pn.cat_id").alias("category_id"),
        col("pc.cat").alias("category"),
        col("pc.subcat").alias("subcategory"),
        col("pc.maintenance"),
        col("pn.prd_cost").alias("cost"),
        col("pn.prd_line").alias("product_line"),
        col("pn.prd_start_dt").alias("start_date")
    )
)

try:
    delta_table=DeltaTable.forName(spark,table_name)
    print("Table Exists, Perfoming Merge....")
    delta_table.alias("target").merge(
        df_product_dim.alias("source"),
        "target.prd_id = source.product_id"
    ).whenMatchedUpdateAll() \
     .whenNotMatchedInsertAll() \
     .execute()
    print("Merge Completed.")
except AnalysisException as e:
    print("Table does not exist, creating new Delta table...")
    df_product_dim.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(table_name)


##### **FACT_SALES_DETAILS**

In [0]:

table_name = "workspace.gold_crm_erp.fact_sales"

# Build the fact rows: sales_details (sd) joined to the product dimension on
# product key and to the customer dimension on customer id. Left joins keep
# sales rows whose product or customer has no dimension match (their
# surrogate keys are then NULL).
df_fact_sales = (
    df_sales_details.alias("sd")
    .join(df_product_dim.alias("pr"), col("sd.sls_prd_key") == col("pr.product_number"), "left")
    .join(dim_customers_df.alias("cr"), col("sd.sls_cust_id") == col("cr.customer_id"), "left")
    .select(
        col("sd.sls_ord_num").alias("order_number"),
        col("pr.product_sk"),
        col("cr.customer_sk"),
        col("sd.sls_order_dt").alias("order_date"),
        col("sd.sls_ship_dt").alias("ship_date"),
        col("sd.sls_due_dt").alias("due_date"),
        col("sd.sls_sales").alias("sales"),
        col("sd.sls_quantity").alias("quantity"),
        col("sd.sls_price").alias("price")
    )
)

# Upsert into the gold table. Both the merge source and the initial write use
# df_fact_sales; the previous version passed df_product_dim in both places
# (copy-paste from the products cell), which wrote the product dimension into
# the fact table. The existence check is explicit so that a failing merge is
# raised instead of being mistaken for "table missing" and overwriting data.
if spark.catalog.tableExists(table_name):
    delta_table = DeltaTable.forName(spark, table_name)
    print("Table Exists, Perfoming Merge....")
    # NOTE(review): the merge key is order_number only. If one order can carry
    # several product lines, multiple source rows will match one target row and
    # the merge will fail -- confirm uniqueness or extend the key (for example
    # with product_sk).
    (
        delta_table.alias("target")
        .merge(
            df_fact_sales.alias("source"),
            "target.order_number = source.order_number",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
    print("Merge Completed.")
else:
    print("Table does not exist, creating new Delta table...")
    (
        df_fact_sales.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )
