Scenario 1: Inventory Alerting System

In [0]:
# Retail Inventory & Supply Chain Intelligence Project
# Scenario 1: Inventory Alerting System

# Step 1: Load the inventory dataset
from pyspark.sql.functions import col

inventory_df = spark.read.option("header", True).option("inferSchema", True).csv("/Volumes/workspace/default/nithyashree/inventory_supply.csv")
inventory_df = inventory_df.withColumn("LastRestocked", col("LastRestocked").cast("date"))
inventory_df.show()

# Step 2: Create a new column NeedsReorder = StockQty < ReorderLevel
from pyspark.sql.functions import expr

inventory_df = inventory_df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel"))
inventory_df.show()

# Step 3: Create a view of all items that need restocking
inventory_df.createOrReplaceTempView("inventory")
needs_restocking = spark.sql("SELECT * FROM inventory WHERE NeedsReorder = true")
needs_restocking.show()

# Step 4: Highlight warehouses with more than 2 such items
warehouse_alerts = needs_restocking.groupBy("Warehouse").count().filter("count > 2")
warehouse_alerts.show()


+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+

+------+------------+-----------+----------+--------+------------+-------------+---------+

Scenario 2: Supplier Price Optimization

In [0]:
df = spark.read.option("header", True).option("inferSchema", True).csv("/Volumes/workspace/default/nithyashree/inventory_supply.csv")
df.show()
df.printSchema()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+

root
 |-- ItemID: string (nullable = true)
 |-- ItemName: string (nullable = true)
 |-- Ca

In [0]:
from pyspark.sql.functions import avg

df.groupBy("Supplier").agg(avg("UnitPrice").alias("AvgPrice")).show()

+---------+--------+
| Supplier|AvgPrice|
+---------+--------+
|   AVTech| 30000.0|
| FreezeIt| 25000.0|
|TechWorld| 70000.0|
|  ChairCo|  6000.0|
|PrintFast|  8000.0|
+---------+--------+



In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, when

# Calculate average price per category
category_avg = df.groupBy("Category").agg(avg("UnitPrice").alias("CategoryAvg"))

# Join to get category average
df_with_avg = df.join(category_avg, on="Category")

# Filter items below category average
below_avg_items = df_with_avg.filter(col("UnitPrice") < col("CategoryAvg"))
below_avg_items.select("ItemName", "Category", "UnitPrice", "CategoryAvg", "Supplier").show()

+--------+-----------+---------+-----------+---------+
|ItemName|   Category|UnitPrice|CategoryAvg| Supplier|
+--------+-----------+---------+-----------+---------+
|  LED TV|Electronics|    30000|    36000.0|   AVTech|
| Printer|Electronics|     8000|    36000.0|PrintFast|
+--------+-----------+---------+-----------+---------+



In [0]:
from pyspark.sql.functions import count, sum as _sum

# Count total and below-average items per supplier
supplier_stats = df_with_avg.withColumn(
    "BelowMarket", when(col("UnitPrice") < col("CategoryAvg"), 1).otherwise(0)
).groupBy("Supplier").agg(
    count("*").alias("TotalItems"),
    _sum("BelowMarket").alias("BelowMarketItems")
)

# Tag Good Deal
supplier_tagged = supplier_stats.withColumn(
    "SupplierTag",
    when((col("BelowMarketItems") / col("TotalItems")) > 0.5, "Good Deal").otherwise("Average")
)

supplier_tagged.show()

+---------+----------+----------------+-----------+
| Supplier|TotalItems|BelowMarketItems|SupplierTag|
+---------+----------+----------------+-----------+
|   AVTech|         1|               1|  Good Deal|
| FreezeIt|         1|               0|    Average|
|TechWorld|         1|               0|    Average|
|  ChairCo|         1|               0|    Average|
|PrintFast|         1|               1|  Good Deal|
+---------+----------+----------------+-----------+



Scenario 3: Cost Forecasting 

In [0]:
from pyspark.sql.functions import col

df_cost = df.withColumn("TotalStockValue", col("StockQty") * col("UnitPrice"))
df_cost.select("ItemName", "StockQty", "UnitPrice", "TotalStockValue").show()

+------------+--------+---------+---------------+
|    ItemName|StockQty|UnitPrice|TotalStockValue|
+------------+--------+---------+---------------+
|      LED TV|      50|    30000|        1500000|
|      Laptop|      10|    70000|         700000|
|Office Chair|      40|     6000|         240000|
|Refrigerator|       5|    25000|         125000|
|     Printer|       3|     8000|          24000|
+------------+--------+---------+---------------+



In [0]:
df_top3 = df_cost.orderBy(col("TotalStockValue").desc()).limit(3)
df_top3.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|         700000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|         240000|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+---------------+



In [0]:
df_cost.write.mode("overwrite").partitionBy("Warehouse").parquet("/Volumes/workspace/default/nithyashree/exported_parquet/inventory_cost_forecast")

Scenario 4: Warehouse Utilization 

In [0]:
df.groupBy("Warehouse").count().withColumnRenamed("count", "ItemCount").show()

+----------+---------+
| Warehouse|ItemCount|
+----------+---------+
|WarehouseB|        2|
|WarehouseA|        2|
|WarehouseC|        1|
+----------+---------+



In [0]:
df.groupBy("Warehouse", "Category") \
  .avg("StockQty") \
  .withColumnRenamed("avg(StockQty)", "AvgStock") \
  .show()

+----------+-----------+--------+
| Warehouse|   Category|AvgStock|
+----------+-----------+--------+
|WarehouseB|Electronics|     6.5|
|WarehouseA|Electronics|    50.0|
|WarehouseA|  Furniture|    40.0|
|WarehouseC| Appliances|     5.0|
+----------+-----------+--------+



In [0]:
from pyspark.sql.functions import sum

df_stock_per_warehouse = df.groupBy("Warehouse").agg(sum("StockQty").alias("TotalStock"))
df_stock_per_warehouse.filter(col("TotalStock") < 100).show()

+----------+----------+
| Warehouse|TotalStock|
+----------+----------+
|WarehouseB|        13|
|WarehouseA|        90|
|WarehouseC|         5|
+----------+----------+



Scenario 5: Delta Audit Trail

In [0]:
df.write.format("delta") \
  .mode("overwrite") \
  .save("/Volumes/workspace/default/nithyashree/retail_inventory")

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col

delta_table = DeltaTable.forPath(spark, "/Volumes/workspace/default/nithyashree/retail_inventory")

delta_table.update(
    condition=col("ItemName") == "Laptop",
    set={"StockQty": "20"}
)

DataFrame[num_affected_rows: bigint]

In [0]:
delta_table.delete(condition=col("StockQty") == 0)

DataFrame[num_affected_rows: bigint]

In [0]:
# Describe the history
spark.sql("DESCRIBE HISTORY delta.`/Volumes/workspace/default/nithyashree/retail_inventory`").show(truncate=False)

# To query a previous version, e.g., version 0:
spark.read.format("delta") \
  .option("versionAsOf", 0) \
  .load("/Volumes/workspace/default/nithyashree/retail_inventory") \
  .show()

+-------+-------------------+----------------+--------------------------+---------+------------------------------------------------------------------------------+----+--------+------------------------+-----------+-----------------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+--------------------------------------------------+
|version|timestamp          |userId          |userName                  |operation|operationParameters                                                           |job |notebook|clusterId               |readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                                                

 Scenario 6: Alerts from Restock Logs (Join Task)

In [0]:
restock_df = spark.read.option("header", True).option("inferSchema", True) \
    .csv("/Volumes/workspace/default/nithyashree/restock_logs.csv")
display(restock_df)

ItemID,RestockDate,QuantityAdded
I002,2024-04-20,10
I005,2024-04-22,5
I001,2024-04-25,20


In [0]:
from pyspark.sql.functions import col, expr

# Load current inventory from Delta
inventory_df = spark.read.format("delta").load("/Volumes/workspace/default/nithyashree/retail_inventory")

# Join with restock logs and calculate new stock
updated_df = inventory_df.alias("inv").join(
    restock_df.alias("res"),
    col("inv.ItemID") == col("res.ItemID"),
    how="left"
).withColumn(
    "NewStockQty",
    col("inv.StockQty") + expr("coalesce(res.QuantityAdded, 0)")
).withColumn(
    "RestockedRecently",
    col("res.QuantityAdded").isNotNull()
)

# Show only relevant columns
display(
    updated_df.select(
        col("inv.ItemID").alias("ItemID"),
        "ItemName",
        "inv.StockQty",
        "res.QuantityAdded",
        "NewStockQty",
        "RestockedRecently"
    )
)


ItemID,ItemName,StockQty,QuantityAdded,NewStockQty,RestockedRecently
I002,Laptop,20,10.0,30,True
I001,LED TV,50,20.0,70,True
I003,Office Chair,40,,40,False
I004,Refrigerator,5,,5,False
I005,Printer,3,5.0,8,True


In [0]:
df_restock = spark.read.option("header", True).option("inferSchema", True).csv("/Volumes/workspace/default/nithyashree/restock_logs.csv")
df_restock.show()


+------+-----------+-------------+
|ItemID|RestockDate|QuantityAdded|
+------+-----------+-------------+
|  I002| 2024-04-20|           10|
|  I005| 2024-04-22|            5|
|  I001| 2024-04-25|           20|
+------+-----------+-------------+



In [0]:
df_restock.write.format("delta").mode("overwrite").save("/Volumes/workspace/default/nithyashree/restock_logs_delta")


In [0]:
%sql

MERGE INTO delta.`/Volumes/workspace/default/nithyashree/retail_inventory` AS target
USING (
  SELECT 
    inv.ItemID AS ItemID,
    inv.ItemName,
    inv.Category,
    inv.Warehouse,
    inv.ReorderLevel,
    inv.LastRestocked,
    inv.UnitPrice,
    inv.Supplier,
    inv.StockQty + COALESCE(res.QuantityAdded, 0) AS StockQty
  FROM delta.`/Volumes/workspace/default/nithyashree/retail_inventory` AS inv
  LEFT JOIN delta.`/Volumes/workspace/default/nithyashree/restock_logs_delta` AS res
  ON inv.ItemID = res.ItemID
) AS source
ON target.ItemID = source.ItemID

WHEN MATCHED THEN UPDATE SET
  target.StockQty = source.StockQty;


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
5,5,0,0


Scenario 7: SQL Views.

In [0]:
%sql

CREATE OR REPLACE TEMP VIEW inventory_summary AS
SELECT 
  ItemName,
  Category,
  StockQty,
  CASE WHEN StockQty < ReorderLevel THEN true ELSE false END AS NeedsReorder,
  (StockQty * UnitPrice) AS TotalStockValue
FROM delta.`/Volumes/workspace/default/nithyashree/retail_inventory`;


In [0]:
%sql

CREATE OR REPLACE TEMP VIEW supplier_leaderboard AS
SELECT 
  Supplier,
  ROUND(AVG(UnitPrice), 2) AS AvgPrice
FROM delta.`/Volumes/workspace/default/nithyashree/retail_inventory`
GROUP BY Supplier
ORDER BY AvgPrice DESC;


In [0]:
%sql
-- Inventory Summary
SELECT * FROM inventory_summary;

-- Supplier Leaderboard
SELECT * FROM supplier_leaderboard;


Supplier,AvgPrice
TechWorld,70000.0
AVTech,30000.0
FreezeIt,25000.0
PrintFast,8000.0
ChairCo,6000.0


Scenario 8: Advanced Filtering

In [0]:
from pyspark.sql.functions import when

df_filtered = df.withColumn(
    "StockStatus",
    when(df.StockQty > 2 * df.ReorderLevel, "Overstocked")
    .when(df.StockQty < df.ReorderLevel, "LowStock")
    .otherwise("Normal")
)

display(df_filtered.select("ItemName", "StockQty", "ReorderLevel", "StockStatus"))


ItemName,StockQty,ReorderLevel,StockStatus
LED TV,50,20,Overstocked
Laptop,10,15,LowStock
Office Chair,40,10,Overstocked
Refrigerator,5,10,LowStock
Printer,3,5,LowStock


In [0]:
# Using .filter()
low_stock_filter = df_filtered.filter(df_filtered.StockStatus == "LowStock")
display(low_stock_filter.select("ItemName", "StockQty", "ReorderLevel", "StockStatus"))


ItemName,StockQty,ReorderLevel,StockStatus
Laptop,10,15,LowStock
Refrigerator,5,10,LowStock
Printer,3,5,LowStock


Scenario 9: Feature Engineering

In [0]:
from pyspark.sql.functions import month

df = df.withColumn("RestockMonth", month("LastRestocked"))
display(df.select("ItemName", "LastRestocked", "RestockMonth"))


ItemName,LastRestocked,RestockMonth
LED TV,2024-03-15,3
Laptop,2024-04-01,4
Office Chair,2024-03-25,3
Refrigerator,2024-02-20,2
Printer,2024-03-30,3


In [0]:
from pyspark.sql.functions import datediff, current_date

df = df.withColumn("StockAge", datediff(current_date(), "LastRestocked"))
display(df.select("ItemName", "LastRestocked", "StockAge"))


ItemName,LastRestocked,StockAge
LED TV,2024-03-15,461
Laptop,2024-04-01,444
Office Chair,2024-03-25,451
Refrigerator,2024-02-20,485
Printer,2024-03-30,446


In [0]:
from pyspark.sql.functions import when

df = df.withColumn(
    "StockCategory",
    when(df.StockAge <= 30, "New")
    .when(df.StockAge <= 90, "Moderate")
    .otherwise("Stale")
)

display(df.select("ItemName", "StockAge", "StockCategory"))


ItemName,StockAge,StockCategory
LED TV,461,Stale
Laptop,444,Stale
Office Chair,451,Stale
Refrigerator,485,Stale
Printer,446,Stale


Scenario 10: Export Options

In [0]:
df.write.mode("overwrite") \
  .option("header", True) \
  .csv("/Volumes/workspace/default/nithyashree/export/inventory/for_analysts")


In [0]:
df.write.mode("overwrite") \
  .json("/Volumes/workspace/default/nithyashree/export/inventory/integration_json")


In [0]:
df.write.format("delta") \
  .mode("overwrite") \
  .partitionBy("Warehouse") \
  .save("/Volumes/workspace/default/nithyashree/export/inventory/pipeline_delta")
