In [0]:
# Databricks notebook source
dbutils.fs.mount(
  source = "wasbs://silver@sanilayanalytics.blob.core.windows.net",
  mount_point = "/mnt/superstore_silver",
  extra_configs = {"fs.azure.account.key.sanilayanalytics.blob.core.windows.net":"STORAGEKEY"})


Out[3]: True

In [0]:
# List files/folders inside the Silver mount point
dbutils.fs.ls("/mnt/superstore_silver")


Out[5]: [FileInfo(path='dbfs:/mnt/superstore_silver/global_superstore_silver/', name='global_superstore_silver/', size=0, modificationTime=0)]

In [0]:
# Define the path to the folder or file inside your Silver mount
silver_path = "/mnt/superstore_silver/global_superstore_silver/"

# Read the Parquet files into a Spark DataFrame
df_silver = spark.read.parquet(silver_path)

# Show a sample of the data to verify
display(df_silver.limit(10))


product_name,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,city,state,country,postal_code,market,region,category,sub-category,sales,quantity,discount,profit,shipping_cost,order_priority,product_key
Plantronics CS510 - Over-the-Head monaural Wireless Headset System,32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,United States,10024.0,US,East,Technology,Accessories,2309.65,7,0.0,762.18,933.57,Critical,2751
Novimex Executive Leather Armchair- Black,26341,IN-2013-77878,2013-02-05,2013-02-07,Second Class,JR-16210,Justin Ritter,Corporate,Wollongong,New South Wales,Australia,,APAC,Oceania,Furniture,Chairs,3709.4,9,0.1,-288.77,923.63,Critical,2526
Nokia Smart Phone- with Caller ID,25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,Australia,,APAC,Oceania,Technology,Phones,5175.17,9,0.1,919.97,915.49,Medium,2503
Motorola Smart Phone- Cordless,13524,ES-2013-1579342,2013-01-28,2013-01-30,First Class,KM-16375,Katherine Murray,Home Office,Berlin,Berlin,Germany,,EU,Central,Technology,Phones,2892.51,5,0.1,-96.54,910.16,Medium,2415
Sharp Wireless Fax- High-Speed,47221,SG-2013-4320,2013-11-05,2013-11-06,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,Senegal,,Africa,Africa,Technology,Copiers,2832.96,8,0.0,311.52,903.04,Critical,3159
Samsung Smart Phone- with Caller ID,22732,IN-2013-42360,2013-06-28,2013-07-01,Second Class,JM-15655,Jim Mitchum,Corporate,Sydney,New South Wales,Australia,,APAC,Oceania,Technology,Phones,2862.68,5,0.1,763.28,897.35,Critical,2992
Novimex Executive Leather Armchair- Adjustable,30570,IN-2011-81826,2011-11-07,2011-11-09,First Class,TS-21340,Toby Swindell,Consumer,Porirua,Wellington,New Zealand,,APAC,Oceania,Furniture,Chairs,1822.08,4,0.0,564.84,894.77,Critical,2525
Chromcraft Conference Table- Fully Assembled,31192,IN-2012-86369,2012-04-14,2012-04-18,Standard Class,MB-18085,Mick Brown,Consumer,Hamilton,Waikato,New Zealand,,APAC,Oceania,Furniture,Tables,5244.84,6,0.0,996.48,878.38,High,922
Fellowes PB500 Electric Punch Plastic Comb Binding Machine with Manual Bind,40155,CA-2014-135909,2014-10-14,2014-10-21,Standard Class,JW-15220,Jane Waco,Corporate,Sacramento,California,United States,95823.0,US,West,Office Supplies,Binders,5083.96,5,0.2,1906.49,867.69,Low,1430
Chromcraft Bull-Nose Wood Oval Conference Tables & Bases,40936,CA-2012-116638,2012-01-28,2012-01-31,Second Class,JH-15985,Joseph Holt,Consumer,Concord,North Carolina,United States,28027.0,US,South,Furniture,Tables,4297.64,13,0.4,-1862.31,865.74,Critical,911


In [0]:
%sql
-- Step 1: Create dim_customer with Recency, Frequency, Monetary values and R, F, M scores
CREATE OR REPLACE TABLE dim_customer AS
WITH CTE AS (
    SELECT customer_id,
           customer_name,
           MAX(order_date) AS max_date,
           DATEDIFF(
               DAY,
               MAX(order_date),
               (SELECT MAX(order_date) FROM parquet.`/mnt/superstore_silver/global_superstore_silver/`)
           ) AS recency,
           COUNT(order_id) AS frequency,
           SUM(profit) AS monetary
    FROM parquet.`/mnt/superstore_silver/global_superstore_silver/`
    GROUP BY customer_id, customer_name
),
CTE_2 AS (
    SELECT *,
           NTILE(10) OVER (ORDER BY recency DESC) AS r_score,
           NTILE(10) OVER (ORDER BY frequency ASC) AS f_score,
           NTILE(10) OVER (ORDER BY monetary ASC) AS m_score
    FROM CTE
),
CTE_3 AS (
    SELECT *,
           (r_score + f_score + m_score) AS rfm
    FROM CTE_2
)
SELECT *,
       NTILE(10) OVER (ORDER BY rfm ASC) AS rfm_score
FROM CTE_3;

-- Step 2: Display 10 rows from the dim_customer table to verify
SELECT * FROM dim_customer LIMIT 10;


customer_id,customer_name,max_date,recency,frequency,monetary,r_score,f_score,m_score,rfm,rfm_score
KD-6270,Karen Daniels,2012-10-11,811,2,-1023.54,1,1,1,3,1
CG-2040,Catherine Glotzbach,2013-12-18,378,7,-395.91,1,1,1,3,1
DB-3120,David Bremer,2014-01-07,358,6,-242.12,1,1,1,3,1
CS-2130,Chad Sievert,2014-03-01,305,5,-645.52,1,1,1,3,1
SS-10875,Sung Shariari,2014-04-25,250,5,-707.82,1,1,1,3,1
TM-11010,Tamara Manning,2014-05-03,242,4,-270.5,1,1,1,3,1
DK-2985,Darren Koutras,2011-09-12,1206,1,-78.89,1,1,2,4,1
CT-1995,Carol Triggs,2012-11-08,783,3,-71.59,1,1,2,4,1
RC-9825,Roy Collins,2012-12-20,741,1,0.0,1,1,2,4,1
BG-1035,Barry Gonzalez,2013-05-10,600,1,-219.8,1,1,2,4,1


In [0]:
# Checking the column names for craeting Dimension and Fact tables
for col_name in df_silver.columns:
    print(col_name)


product_name
row_id
order_id
order_date
ship_date
ship_mode
customer_id
customer_name
segment
city
state
country
postal_code
market
region
category
sub-category
sales
quantity
discount
profit
shipping_cost
order_priority
product_key


In [0]:
# Select relevant columns from the already loaded Silver DataFrame (df_silver)
df_dim_product = df_silver.select(
    "product_key",
    "product_name",  # assuming it exists, else remove this line
    "category",
    "sub-category"
).dropDuplicates(["product_key"])  # Get unique products only

# Display the resulting dim_product table
display(df_dim_product.limit(10))


product_key,product_name,category,sub-category
1,"""While you Were Out"""" Message Book- One Form per Page""",Office Supplies,Paper
2,#10 Gummed Flap White Envelopes- 100/Box,Office Supplies,Envelopes
3,#10 Self-Seal White Envelopes,Office Supplies,Envelopes
4,#10 White Business Envelopes-4 1/8 x 9 1/2,Office Supplies,Envelopes
5,"#10- 4 1/8"" x 9 1/2"""" Recycled Envelopes""",Office Supplies,Envelopes
6,"#10- 4 1/8"" x 9 1/2"""" Security-Tint Envelopes""",Office Supplies,Envelopes
7,"#10-4 1/8"" x 9 1/2"""" Premium Diagonal Seam Envelopes""",Office Supplies,Envelopes
8,#6 3/4 Gummed Flap White Envelopes,Office Supplies,Envelopes
9,"1.7 Cubic Foot Compact ""Cube"""" Office Refrigerators""",Office Supplies,Appliances
10,"1/4 Fold Party Design Invitations & White Envelopes- 24 8-1/2"" X 11"""" Cards- 25 Env./Pack""",Office Supplies,Paper


In [0]:
# Count distinct product_name and product_key to verify uniqueness and completeness

# Count distinct product names
product_name_count = df_silver.select("product_name").distinct().count()

# Count distinct product keys (product_id)
product_key_count = df_silver.select("product_key").distinct().count()

# Display the counts side by side
print(f"Distinct Product Names Count: {product_name_count}")
print(f"Distinct Product Keys Count: {product_key_count}")


Distinct Product Names Count: 3788
Distinct Product Keys Count: 3788


In [0]:
# Select required columns for the fact_sales fact table from df_silver
df_fact_sales = df_silver.select(
    "row_id",
    "order_id",
    "order_date",
    "ship_date",
    "ship_mode",
    "customer_id",
    "segment",
    "city",
    "state",
    "country",
    "postal_code",
    "market",
    "region",
    "sales",
    "quantity",
    "discount",
    "profit",
    "shipping_cost",
    "order_priority",
    "product_key"
)

# Display the first 10 rows of the fact_sales DataFrame
display(df_fact_sales.limit(10))


row_id,order_id,order_date,ship_date,ship_mode,customer_id,segment,city,state,country,postal_code,market,region,sales,quantity,discount,profit,shipping_cost,order_priority,product_key
32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Consumer,New York City,New York,United States,10024.0,US,East,2309.65,7,0.0,762.18,933.57,Critical,2751
26341,IN-2013-77878,2013-02-05,2013-02-07,Second Class,JR-16210,Corporate,Wollongong,New South Wales,Australia,,APAC,Oceania,3709.4,9,0.1,-288.77,923.63,Critical,2526
25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Consumer,Brisbane,Queensland,Australia,,APAC,Oceania,5175.17,9,0.1,919.97,915.49,Medium,2503
13524,ES-2013-1579342,2013-01-28,2013-01-30,First Class,KM-16375,Home Office,Berlin,Berlin,Germany,,EU,Central,2892.51,5,0.1,-96.54,910.16,Medium,2415
47221,SG-2013-4320,2013-11-05,2013-11-06,Same Day,RH-9495,Consumer,Dakar,Dakar,Senegal,,Africa,Africa,2832.96,8,0.0,311.52,903.04,Critical,3159
22732,IN-2013-42360,2013-06-28,2013-07-01,Second Class,JM-15655,Corporate,Sydney,New South Wales,Australia,,APAC,Oceania,2862.68,5,0.1,763.28,897.35,Critical,2992
30570,IN-2011-81826,2011-11-07,2011-11-09,First Class,TS-21340,Consumer,Porirua,Wellington,New Zealand,,APAC,Oceania,1822.08,4,0.0,564.84,894.77,Critical,2525
31192,IN-2012-86369,2012-04-14,2012-04-18,Standard Class,MB-18085,Consumer,Hamilton,Waikato,New Zealand,,APAC,Oceania,5244.84,6,0.0,996.48,878.38,High,922
40155,CA-2014-135909,2014-10-14,2014-10-21,Standard Class,JW-15220,Corporate,Sacramento,California,United States,95823.0,US,West,5083.96,5,0.2,1906.49,867.69,Low,1430
40936,CA-2012-116638,2012-01-28,2012-01-31,Second Class,JH-15985,Consumer,Concord,North Carolina,United States,28027.0,US,South,4297.64,13,0.4,-1862.31,865.74,Critical,911


In [0]:
# Databricks notebook source

# ================================
# Mount Gold container in Databricks
# ================================

# Problem:
# The Gold container is not mounted in DBFS, so writing files to the Gold layer using
# mount paths (e.g., /mnt/superstore_gold) fails because the mount point doesn't exist.
# Direct ABFSS paths require Spark config keys which might not be set, causing access errors.
#
# Solution:
# Mount the Gold container to a DBFS mount point. This provides a stable, consistent
# path (/mnt/superstore_gold) to read/write files without configuring keys every session.
# Mount points persist for the workspace and are accessible from all clusters in it.
#
# Run this block once in your workspace or notebook session. If already mounted, it will error,
# so you may want to handle that in production (see note below).

try:
    dbutils.fs.mount(
        source = "wasbs://gold@sanilayanalytics.blob.core.windows.net",
        mount_point = "/mnt/superstore_gold",
        extra_configs = {
            "fs.azure.account.key.sanilayanalytics.blob.core.windows.net": "STORAGEKEY"
        }
    )
    print("✅ Successfully mounted /mnt/superstore_gold")
except Exception as e:
    if "Directory already mounted" in str(e):
        print("ℹ️ Mount point /mnt/superstore_gold already exists.")
    else:
        print("❌ Mounting failed with error:", e)



ℹ️ Mount point /mnt/superstore_gold already exists.


In [0]:
# -----------------------------------
# Define Gold layer mount path and subfolders for each dimension/fact table
gold_mount_path = "/mnt/superstore_gold/"

dim_customer_path = gold_mount_path + "dim_customer"
dim_product_path = gold_mount_path + "dim_product"
fact_sales_path = gold_mount_path + "fact_sales"

# -----------------------------------
# Save dim_customer table as Parquet in Gold layer (read from Spark SQL table)
df_dim_customer = spark.table("dim_customer")
df_dim_customer.write.mode("overwrite").parquet(dim_customer_path)
print("✅ dim_customer successfully saved to Gold layer.")

# Save df_dim_product DataFrame as Parquet in Gold layer
df_dim_product.write.mode("overwrite").parquet(dim_product_path)
print("✅ dim_product successfully saved to Gold layer.")

# Save df_fact_sales DataFrame as Parquet in Gold layer
df_fact_sales.write.mode("overwrite").parquet(fact_sales_path)
print("✅ fact_sales successfully saved to Gold layer.")


✅ dim_customer successfully saved to Gold layer.
✅ dim_product successfully saved to Gold layer.
✅ fact_sales successfully saved to Gold layer.


In [0]:
# Verify by reading back a few rows from Gold layer Parquet files

print("Preview dim_customer:")
display(spark.read.parquet(dim_customer_path).limit(3))

print("Preview dim_product:")
display(spark.read.parquet(dim_product_path).limit(3))

print("Preview fact_sales:")
display(spark.read.parquet(fact_sales_path).limit(3))

Preview dim_customer:


customer_id,customer_name,max_date,recency,frequency,monetary,r_score,f_score,m_score,rfm,rfm_score
KD-6270,Karen Daniels,2012-10-11,811,2,-1023.54,1,1,1,3,1
CG-2040,Catherine Glotzbach,2013-12-18,378,7,-395.91,1,1,1,3,1
DB-3120,David Bremer,2014-01-07,358,6,-242.12,1,1,1,3,1


Preview dim_product:


product_key,product_name,category,sub-category
1,"""While you Were Out"""" Message Book- One Form per Page""",Office Supplies,Paper
2,#10 Gummed Flap White Envelopes- 100/Box,Office Supplies,Envelopes
3,#10 Self-Seal White Envelopes,Office Supplies,Envelopes


Preview fact_sales:


row_id,order_id,order_date,ship_date,ship_mode,customer_id,segment,city,state,country,postal_code,market,region,sales,quantity,discount,profit,shipping_cost,order_priority,product_key
32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Consumer,New York City,New York,United States,10024.0,US,East,2309.65,7,0.0,762.18,933.57,Critical,2751
26341,IN-2013-77878,2013-02-05,2013-02-07,Second Class,JR-16210,Corporate,Wollongong,New South Wales,Australia,,APAC,Oceania,3709.4,9,0.1,-288.77,923.63,Critical,2526
25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Consumer,Brisbane,Queensland,Australia,,APAC,Oceania,5175.17,9,0.1,919.97,915.49,Medium,2503
