### Business Requirement:
We have SuperStore Sample retail data and our stakeholder wants to track the below business metrics.

In [0]:
dbutils.widgets.dropdown("Time_Period","Weekly",["Weekly", "Monthly"])

In [0]:
from datetime import date, datetime, timedelta
from pyspark.sql.functions import *

TimePeriod = dbutils.widgets.get("Time_Period")
print(TimePeriod)
today = date.today()

# Last week start date and end date
if TimePeriod == "Weekly":
    start_date = today - timedelta(days=today.weekday(), weeks=1) - timedelta(days=1)
    end_date = start_date + timedelta(days=6)

# Last month start date and end date
else:
    end_date = today.replace(day=1) - timedelta(days=1)
    start_date = today.replace(day=1) - timedelta(days=end_date.day)

print(start_date, end_date)

Weekly
2025-07-13 2025-07-19


In [0]:
df=spark.read.option("header",True).option("inferSchema", True).csv("/FileStore/tables/superstore.csv")
display(df)


ID,Order_ID,Order_Date,Ship_Date,Ship_Mode,Customer_ID,Customer_Name,Segment,Country,City,State,Postal_Code,Region,Product_ID,Category,Sub_Category,Product,Sales,Quantity,Discount,Profit,User_ID,State_ID,Order_Status
1,ORD-2D0139AA,2025-07-31,2025-08-05,Second Class,CUST-0FECF7,Brian Yang,Home Office,India,North Judithbury,South Dakota,29757,East,PROD-E0B9C6C7,Furniture,Tables,Tables Begin,739.11,9,0.0,146.07,USER-1520,STATE-130,Completed
2,ORD-CAC6E9CC,2025-10-06,2025-10-16,First Class,CUST-4D611C,Lance Hoffman,Home Office,Germany,Johnsonland,Pennsylvania,29158,West,PROD-AACA948E,Technology,Accessories,Accessories Rate,718.86,9,0.3,52.89,USER-5557,STATE-928,Completed
3,ORD-B4961EF7,2025-09-29,2025-10-04,Second Class,CUST-022189,Abigail Shaffer,Corporate,United Kingdom,Robinsonshire,Vermont,36964,South,PROD-9C79DEF9,Furniture,Bookcases,Bookcases Knowledge,111.19,7,0.0,15.54,USER-6635,STATE-718,Cancelled
4,ORD-8351C7AD,2025-08-23,2025-08-25,Same Day,CUST-8248CE,Brent Abbott,Home Office,United States,Ericmouth,Michigan,13897,South,PROD-D1EB51DD,Furniture,Bookcases,Bookcases Everything,831.11,10,0.2,129.21,USER-2139,STATE-146,Pending
5,ORD-6AA69AE0,2025-12-01,2025-12-03,Same Day,CUST-14BB99,Sandra Montgomery,Consumer,Canada,Herrerafurt,Arizona,10829,East,PROD-9BAB7F31,Office Supplies,Labels,Labels High,639.33,6,0.1,82.02,USER-4432,STATE-786,Cancelled
6,ORD-FDD3EAB9,2025-04-01,2025-04-05,Second Class,CUST-724300,Dylan Miller,Home Office,India,Lake Chad,Oregon,87174,West,PROD-A9048D61,Office Supplies,Labels,Labels Trial,277.25,9,0.1,55.18,USER-1916,STATE-334,Completed
7,ORD-C707ACFA,2025-09-23,2025-09-25,Second Class,CUST-A30667,Daniel Adams,Corporate,Australia,Ramirezstad,Connecticut,21820,South,PROD-D12BF617,Technology,Copiers,Copiers Raise,220.5,8,0.3,41.86,USER-8517,STATE-246,Cancelled
8,ORD-4EF71C4B,2025-09-25,2025-10-04,Standard Class,CUST-1FDA41,Devin Schaefer,Home Office,United Kingdom,New Jessica,California,21918,West,PROD-9EF2C53D,Technology,Machines,Machines Onto,898.84,7,0.2,75.38,USER-3266,STATE-621,Processing
9,ORD-98648101,2025-01-26,2025-01-29,Second Class,CUST-33F932,Renee Morales,Consumer,India,Meganton,Iowa,30522,East,PROD-0F28C9B9,Technology,Machines,Machines Development,600.44,7,0.3,83.63,USER-8668,STATE-641,Cancelled
10,ORD-B574D47B,2025-02-24,2025-02-26,Standard Class,CUST-987D13,Maria Thomas,Home Office,South Africa,Jasonfort,Oregon,52357,East,PROD-37D7A9A2,Technology,Copiers,Copiers Employee,120.44,7,0.1,17.72,USER-5315,STATE-612,Pending


In [0]:
df.createOrReplaceTempView("SuperStore")

### 1. How many total number of customers we have?

In [0]:
%sql
SELECT COUNT(DISTINCT Customer_ID) FROM SuperStore

count(DISTINCT Customer_ID)
1000


In [0]:
display(spark.sql(f"""
            SELECT COUNT(DISTINCT Customer_ID)
            FROM SuperStore
            WHERE ORDER_DATE BETWEEN '{start_date}' AND '{end_date}'"""))

count(DISTINCT Customer_ID)
21


### 2.What is the total number of orders we have?

In [0]:
%sql
SELECT COUNT(DISTINCT Order_ID) FROM SuperStore

count(DISTINCT Order_ID)
1000


In [0]:
display(spark.sql(f"""
            SELECT COUNT(DISTINCT Order_ID)
            FROM SuperStore
            WHERE ORDER_DATE BETWEEN '{start_date}' AND '{end_date}'"""))

count(DISTINCT Order_ID)
21


### Total Number of Sales and Profit

In [0]:
%sql
SELECT SUM(Sales), SUM(Profit)
FROM SuperStore

sum(Sales),sum(Profit)
492157.0299999993,73407.59000000014


In [0]:
%sql
SELECT Country, ROUND(SUM(Sales),2) AS Total_Sales
FROM SuperStore
GROUP By Country
ORDER By Total_Sales

Country,Total_Sales
Australia,40228.9
Germany,41397.86
United States,41690.81
South Africa,43267.52
United Kingdom,47623.48
India,50709.71
Japan,54276.78
France,54843.2
Canada,55950.87
Brazil,62167.9


In [0]:
%sql
SELECT Country, Region, ROUND(SUM(Sales),2) AS Max_Profit
FROM SuperStore
GROUP BY 1,2 
ORDER BY 3 DESC

Country,Region,Max_Profit
France,West,19092.67
Brazil,East,18845.37
Brazil,South,17800.55
Canada,East,17532.17
India,East,17293.79
Japan,Central,16124.68
Canada,South,14882.75
France,East,14078.96
United Kingdom,West,13910.18
Japan,South,13858.56


Databricks visualization. Run in Databricks to view.

### TOP Sales Category Product

In [0]:
%sql
SELECT Category, ROUND(SUM(Sales),2) AS Total_Sales
FROM SuperStore
GROUP BY 1
ORDER BY 2 DESC

Category,Total_Sales
Office Supplies,175267.08
Technology,158774.57
Furniture,158115.38


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT Category, Sub_Category, ROUND(SUM(Sales),2) AS Total_Sales
FROM SuperStore
GROUP BY 1,2
ORDER BY 3 DESC
LIMIT 10

Category,Sub_Category,Total_Sales
Office Supplies,Labels,46200.48
Office Supplies,Art,45668.0
Technology,Copiers,44883.63
Technology,Accessories,44025.8
Office Supplies,Paper,42033.61
Furniture,Bookcases,41625.98
Office Supplies,Binders,41364.99
Furniture,Tables,40255.82
Furniture,Chairs,38521.31
Furniture,Furnishings,37712.27


Databricks visualization. Run in Databricks to view.

### Most Ordered Products

In [0]:
%sql
-- SELECT Product, SUM(Quantity) AS Total_Quantity
-- FROM SuperStore
-- GROUP By 1
-- ORDER By 2 DESC

SELECT
  Total_Quantity,
  ARRAY_AGG(Product) AS Products  --- can also use collect_set()
FROM (
  SELECT Product, SUM(Quantity) AS Total_Quantity
  FROM SuperStore
  GROUP BY Product
) sub
GROUP BY Total_Quantity
ORDER BY Total_Quantity DESC;

Total_Quantity,Products
21,List(Copiers Raise)
20,List(Machines I)
18,List(Paper Computer)
17,"List(Furnishings Eye, Furnishings Available, Copiers Dinner, Phones School)"
16,List(Machines Us)
15,"List(Labels High, Furnishings Hand, Accessories Live)"
14,"List(Bookcases Build, Chairs Shoulder)"
13,"List(Phones Agree, Bookcases Open)"
12,"List(Binders Consumer, Bookcases Impact, Binders Fast, Paper Realize, Art Cell)"
11,"List(Art Might, Phones Land)"


Databricks visualization. Run in Databricks to view.

### TOP Customers based on Sales and City

In [0]:
%sql
SELECT Customer_ID, CUstomer_Name, City, SUM(Sales) AS Total_Sales
FROM SUperStore
GROUP BY Customer_ID, CUstomer_Name, City
ORDER BY Total_Sales DESC

Customer_ID,CUstomer_Name,City,Total_Sales
CUST-032848,Gregg Weaver,Port Heather,997.98
CUST-D6A006,Mary Miller,New Amanda,996.98
CUST-0F1EC6,Jessica Fowler,West Kevinberg,995.61
CUST-235DF4,Melissa Rodriguez,Michelleton,995.41
CUST-990AD3,Veronica Silva,Cabrerastad,992.56
CUST-36EEF7,Joseph Johnson,Kimberlybury,992.47
CUST-E95393,Matthew Fox,Lake Devinborough,991.92
CUST-D96984,Joseph Blankenship,East Nicholasberg,991.36
CUST-DE3DF3,Laura Mckinney,South Andrewland,990.71
CUST-40C4E4,Elizabeth Morrison,Manuelland,990.33
