In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Retail Analytics").getOrCreate()


def _load_csv(path):
    """Read one CSV file, treating the first row as a header and inferring column types."""
    reader = spark.read.option("header", True).option("inferSchema", True)
    return reader.csv(path)


# Load the three source files from the shared workspace folder.
orders_df = _load_csv("file:/Workspace/Shared/orders1.csv")
customers_df = _load_csv("file:/Workspace/Shared/customers1.csv")
products_df = _load_csv("file:/Workspace/Shared/products1.csv")

# Print the inferred schema of each DataFrame (orders, customers, products).
for loaded_frame in (orders_df, customers_df, products_df):
    loaded_frame.printSchema()

# Save as Delta
(
    orders_df.write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save("/delta/orders")
)
(
    customers_df.write.format("delta")
    .mode("overwrite")
    .save("/delta/customers")
)
(
    products_df.write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .save("/delta/products")
)


root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- ProductID: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Status: string (nullable = true)

root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- SignupDate: date (nullable = true)

root
 |-- ProductID: string (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Stock: integer (nullable = true)
 |-- ReorderLevel: integer (nullable = true)



In [0]:

# 2. Write SQL to get the total revenue per Product.

# Register the Delta folders as tables so they can be queried by name.
for create_table_stmt in (
    "CREATE TABLE IF NOT EXISTS orders USING DELTA LOCATION '/delta/orders'",
    "CREATE TABLE IF NOT EXISTS products USING DELTA LOCATION '/delta/products'",
):
    spark.sql(create_table_stmt)

# Revenue is Quantity * Price, summed over delivered orders only.
revenue_per_product_query = """
SELECT 
    ProductID, 
    SUM(Quantity * Price) AS TotalRevenue
FROM orders
WHERE Status = 'Delivered'
GROUP BY ProductID
ORDER BY TotalRevenue DESC
"""
revenue_per_product = spark.sql(revenue_per_product_query)
revenue_per_product.show()

# 3. Join Orders + Customers to find revenue by Region.
spark.sql("CREATE TABLE IF NOT EXISTS customers USING DELTA LOCATION '/delta/customers'")

# Same revenue definition, grouped by the Region of the ordering customer.
revenue_by_region_query = """
SELECT 
    c.Region,
    SUM(o.Quantity * o.Price) AS RevenueByRegion
FROM orders o
JOIN customers c ON o.CustomerID = c.CustomerID
WHERE o.Status = 'Delivered'
GROUP BY c.Region
ORDER BY RevenueByRegion DESC
"""
revenue_by_region = spark.sql(revenue_by_region_query)
revenue_by_region.show()



+---------+------------+
|ProductID|TotalRevenue|
+---------+------------+
|    P1001|       75000|
|    P1002|       50000|
|    P1003|       30000|
+---------+------------+

+------+---------------+
|Region|RevenueByRegion|
+------+---------------+
| North|         125000|
|  West|          30000|
+------+---------------+



In [0]:
# 4. Update the Status of Pending orders to 'Cancelled'.
from delta.tables import DeltaTable

orders_delta = DeltaTable.forPath(spark, "/delta/orders")

# Values in `set` are SQL expressions, so the new status is a quoted SQL string literal.
orders_delta.update(
    condition="Status = 'Pending'",
    set={"Status": "'Cancelled'"}
)

# 5. Merge a new return record into Orders.
from pyspark.sql import Row
from pyspark.sql.functions import col

new_return_inferred_df = spark.createDataFrame([
    Row(OrderID=3006, CustomerID="C003", ProductID="P1002", Quantity=1, Price=50000, OrderDate="2024-05-06", Status="Returned")
])

# createDataFrame infers bigint for Python ints and string for OrderDate, while the
# orders table stores these columns as integer and date. Cast every source column to
# the type of the matching target column so the merge never relies on implicit
# conversion; columns with no counterpart in the target are passed through untouched.
target_types = {field.name: field.dataType for field in orders_delta.toDF().schema.fields}
new_return_df = new_return_inferred_df.select(*[
    col(name).cast(target_types[name]).alias(name) if name in target_types else col(name)
    for name in new_return_inferred_df.columns
])

# Insert-only merge: a row is added only when no existing order has the same OrderID,
# so re-running this cell does not create duplicates.
orders_delta.alias("target").merge(
    source=new_return_df.alias("source"),
    condition="target.OrderID = source.OrderID",
).whenNotMatchedInsertAll().execute()


In [0]:
# DLT Pipeline
# 6. Create raw → cleaned → aggregated tables:
# Clean: Remove rows with NULLs
# Aggregated: Total revenue per Category
import dlt
from pyspark.sql.functions import col, expr

# Raw Ingest Layer
@dlt.table(name="orders_raw")
def load_orders_raw():
    """Raw layer: read the orders CSV with a header row and inferred column types."""
    csv_reader = (
        spark.read.format("csv")
        .option("header", True)
        .option("inferSchema", True)
    )
    return csv_reader.load("file:/Workspace/Shared/orders1.csv")

@dlt.table(name="products_raw")
def load_products_raw():
    """Raw layer: read the products CSV with a header row and inferred column types."""
    csv_reader = (
        spark.read.format("csv")
        .option("header", True)
        .option("inferSchema", True)
    )
    return csv_reader.load("file:/Workspace/Shared/products1.csv")

# Cleaned Layer: Remove NULLs
@dlt.table(name="orders_cleaned")
def clean_orders():
    """Cleaned layer: the raw orders with every row that contains a NULL removed.

    NOTE(review): a later cell in this notebook also declares a table named
    "orders_cleaned" -- confirm that only one of the two definitions is
    included when the pipeline runs.
    """
    raw_orders = dlt.read("orders_raw")
    return raw_orders.dropna()

# Aggregated Layer: Revenue per Category
@dlt.table(name="revenue_by_category")
def aggregate_revenue():
    """Aggregated layer: total revenue (Quantity * Price) per product Category.

    Cleaned orders are inner-joined to the raw products on ProductID, so orders
    whose ProductID has no match in products are left out of the totals.
    """
    total_revenue = expr("SUM(Quantity * Price) AS TotalRevenue")
    orders_with_category = dlt.read("orders_cleaned").join(
        dlt.read("products_raw"), on="ProductID", how="inner"
    )
    return orders_with_category.groupBy("Category").agg(total_revenue)


Name,Type
Category,string
TotalRevenue,bigint


In [0]:
# Time Travel
# 7. View data before the Status update.

# List every commit on the orders table (version, timestamp, operation, ...).
spark.sql("DESCRIBE HISTORY delta.`/delta/orders`").show(truncate=False)

# 8. Restore to an older version of the orders table.

# NOTE(review): the version number is hard-coded. Which version holds the data from
# before the Status update depends on how many commits the table already has --
# confirm it against the history listing above before running the restore.
RESTORE_VERSION = 2

# Preview the snapshot that is about to be restored.
old_df = spark.read.format("delta").option("versionAsOf", RESTORE_VERSION).load("/delta/orders")
old_df.show()

# Use Delta's RESTORE command instead of reading the old snapshot and overwriting the
# table with it: the restore is recorded as a RESTORE operation in the table history
# and does not rewrite the data through a DataFrame write.
spark.sql(
    "RESTORE TABLE delta.`/delta/orders` TO VERSION AS OF {}".format(RESTORE_VERSION)
)


+-------+-------------------+----------------+----------------------------------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:

# 9. Run VACUUM after changing default retention.

# Delta refuses a retention shorter than the default unless this safety check is
# switched off. Remember the current value so it can be put back afterwards instead
# of leaving the check disabled for the rest of the session.
RETENTION_CHECK_KEY = "spark.databricks.delta.retentionDurationCheck.enabled"
previous_retention_check = spark.conf.get(RETENTION_CHECK_KEY, "true")

spark.conf.set(RETENTION_CHECK_KEY, "false")
try:
    # WARNING: RETAIN 0 HOURS removes every data file the current table version no
    # longer references, so time travel to earlier versions stops working afterwards.
    vacuum_result = spark.sql("VACUUM delta.`/delta/orders` RETAIN 0 HOURS")
finally:
    spark.conf.set(RETENTION_CHECK_KEY, previous_retention_check)

vacuum_result



DataFrame[path: string]

In [0]:
# 10. Add data-quality expectations: Quantity > 0, Price > 0, OrderDate is not null.

# 11. Use when-otherwise to create a new column: OrderType = "Return" if Status =='Returned'
import dlt
from pyspark.sql.functions import when

@dlt.table(name="orders_cleaned")
@dlt.expect("valid_quantity", "Quantity > 0")
@dlt.expect("valid_price", "Price > 0")
@dlt.expect("order_date_not_null", "OrderDate IS NOT NULL")
def orders_cleaned():
    """Cleaned orders with a derived OrderType column.

    Rows containing any NULL are dropped from the raw orders, then OrderType is
    set to "Return" when Status is "Returned" and to "Normal" otherwise. The
    decorators attach three expectations: Quantity > 0, Price > 0 and
    OrderDate IS NOT NULL.

    NOTE(review): an earlier cell in this notebook also declares a table named
    "orders_cleaned" -- confirm that only one of the two definitions is
    included when the pipeline runs.
    """
    complete_rows = dlt.read("orders_raw").dropna()

    # Derived column: flag returned orders.
    is_returned = complete_rows["Status"] == "Returned"
    order_type = when(is_returned, "Return").otherwise("Normal")

    return complete_rows.withColumn("OrderType", order_type)


Name,Type
OrderID,int
CustomerID,string
ProductID,string
Quantity,int
Price,int
OrderDate,date
Status,string
OrderType,string
