In [2]:
Silver_Path = "Files/Silver/Sales_Data/Sales_Data.parquet"

StatementMeta(, 4d35c2b2-62f0-40de-8d87-16bc7ee76792, 4, Finished, Available, Finished)

In [3]:
Gold_df = spark.read.parquet(Silver_Path)
display(Gold_df)
Gold_df.count()

StatementMeta(, 4d35c2b2-62f0-40de-8d87-16bc7ee76792, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 212b7387-4e68-40cb-89ac-46ca219b3f74)

100000

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Step 1: dim_date
dim_date = Gold_df.select("SALES_DATE").distinct() \
    .withColumn("DATE_ID", row_number().over(Window.orderBy("SALES_DATE"))) \
    .withColumn("DAY", dayofmonth("SALES_DATE")) \
    .withColumn("MONTH", month("SALES_DATE")) \
    .withColumn("MONTH_NAME", date_format("SALES_DATE", "MMMM")) \
    .withColumn("QUARTER", quarter("SALES_DATE")) \
    .withColumn("YEAR", year("SALES_DATE")) \
    .withColumn("WEEK_OF_YEAR", weekofyear("SALES_DATE"))

# Step 2: dim_product
dim_product = Gold_df.select("PRODUCT_NAME", "CATEGORY", "SUB_CATEGORY").distinct() \
    .withColumn("PRODUCT_ID", monotonically_increasing_id())

# Step 3: dim_customer
dim_customer = Gold_df.select("CUSTOMER_NAME", "CUSTOMER_TYPE").distinct() \
    .withColumn("CUSTOMER_ID", monotonically_increasing_id())

# Step 4: dim_region
dim_region = Gold_df.select("REGION").distinct() \
    .withColumn("REGION_ID", monotonically_increasing_id())

# Step 5: Lookup tables
product_lookup = dim_product.select("PRODUCT_ID", "PRODUCT_NAME", "CATEGORY", "SUB_CATEGORY")
customer_lookup = dim_customer.select("CUSTOMER_ID", "CUSTOMER_NAME", "CUSTOMER_TYPE")
date_lookup = dim_date.select("DATE_ID", "SALES_DATE")
region_lookup = dim_region.select("REGION_ID", "REGION")

# Step 6: fact_sales
fact_sales = Gold_df \
    .join(product_lookup, ["PRODUCT_NAME", "CATEGORY", "SUB_CATEGORY"], "left") \
    .join(customer_lookup, ["CUSTOMER_NAME", "CUSTOMER_TYPE"], "left") \
    .join(date_lookup, ["SALES_DATE"], "left") \
    .join(region_lookup, ["REGION"], "left") \
    .select(
        col("SALES_ID"),
        col("DATE_ID"),
        col("PRODUCT_ID"),
        col("CUSTOMER_ID"),
        col("REGION_ID"),
        col("QUANTITY_SOLD"),
        col("UNIT_PRICE"),
        col("DISCOUNT"),
        col("TOTAL_AMOUNT"),
        col("LAST_UPDATED_DATE"),
        col("NET_AMOUNT"),
        col("UNITS_AFTER_DISCOUNT")
    )


StatementMeta(, 4d35c2b2-62f0-40de-8d87-16bc7ee76792, 6, Finished, Available, Finished)

In [5]:
dim_product.write.mode("overwrite").format("delta").save("Files/Gold/dim_product")
dim_customer.write.mode("overwrite").format("delta").save("Files/Gold/dim_customer")
dim_region.write.mode("overwrite").format("delta").save("Files/Gold/dim_region")
dim_date.write.mode("overwrite").format("delta").save("Files/Gold/dim_date")
fact_sales.write.mode("overwrite").format("delta").save("Files/Gold/fact_sales")

StatementMeta(, 4d35c2b2-62f0-40de-8d87-16bc7ee76792, 7, Finished, Available, Finished)

In [6]:
# Base path
base_path = "Files/Gold"

# List all folders inside Silver
folders = [f for f in mssparkutils.fs.ls(base_path) if f.isDir]

for folder in folders:
    folder_path = folder.path.rstrip("/")                 # e.g., Files/Silver/Sales_Data
    folder_name = folder_path.split("/")[-1]              # e.g., Sales_Data
    target_file = f"{folder_path}/{folder_name}.parquet"  # e.g., Files/Silver/Sales_Data/Sales_Data.parquet

    # List all files in the folder
    files = mssparkutils.fs.ls(folder_path)

    # Correct way: check the filename only, not the full path
    part_files = [f.path for f in files if f.path.endswith(".parquet") and f.path.split("/")[-1].startswith("part-")]

    if part_files:
        part_file = part_files[0]  # Take first (should be only one if coalesced)

        # Remove existing renamed file if it exists
        if mssparkutils.fs.exists(target_file):
            mssparkutils.fs.rm(target_file, recurse=True)

        # Move and rename the part file
        mssparkutils.fs.mv(part_file, target_file, overwrite=True)

        print(f"✔ Renamed: {part_file} → {target_file}")
    else:
        print(f"⚠ No 'part-' parquet file found in folder: {folder_path}")


StatementMeta(, 4d35c2b2-62f0-40de-8d87-16bc7ee76792, 8, Finished, Available, Finished)

✔ Renamed: abfss://64128d75-b094-4d75-9e0f-4c1465377deb@onelake.dfs.fabric.microsoft.com/f326ebb9-1430-40eb-9309-5e15245fc72b/Files/Gold/dim_customer/part-00000-e9d1ace3-52c0-4382-8038-e2a811bb8ecb-c000.snappy.parquet → abfss://64128d75-b094-4d75-9e0f-4c1465377deb@onelake.dfs.fabric.microsoft.com/f326ebb9-1430-40eb-9309-5e15245fc72b/Files/Gold/dim_customer/dim_customer.parquet
✔ Renamed: abfss://64128d75-b094-4d75-9e0f-4c1465377deb@onelake.dfs.fabric.microsoft.com/f326ebb9-1430-40eb-9309-5e15245fc72b/Files/Gold/dim_date/part-00000-11dcfd90-bf65-4684-9514-749a454d8be0-c000.snappy.parquet → abfss://64128d75-b094-4d75-9e0f-4c1465377deb@onelake.dfs.fabric.microsoft.com/f326ebb9-1430-40eb-9309-5e15245fc72b/Files/Gold/dim_date/dim_date.parquet
✔ Renamed: abfss://64128d75-b094-4d75-9e0f-4c1465377deb@onelake.dfs.fabric.microsoft.com/f326ebb9-1430-40eb-9309-5e15245fc72b/Files/Gold/dim_product/part-00000-f1f2d9ac-1743-4eab-966c-41c50f60aa0a-c000.snappy.parquet → abfss://64128d75-b094-4d75-9e0f-4