In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
spark = SparkSession.builder.appName("InventoryAlertingSystem").getOrCreate()
schema = StructType([
    StructField("ItemID", StringType(), True),
    StructField("ItemName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Warehouse", StringType(), True),
    StructField("StockQty", IntegerType(), True),
    StructField("ReorderLevel", IntegerType(), True),
    StructField("LastRestocked", DateType(), True),
    StructField("UnitPrice", IntegerType(), True),
    StructField("Supplier", StringType(), True)
])
df = spark.read.csv("file:/Workspace/Shared/inventory_supply.csv", header=True, schema=schema)


In [0]:
# 2. Create a new column NeedsReorder = StockQty < ReorderLevel .
from pyspark.sql.functions import col
df=df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel"))
df.show()

# 3. Create a view of all items that need restocking.
df.filter(col("NeedsReorder") == True).show()

# 4. Highlight warehouses with more than 2 such items.
from pyspark.sql.functions import count
df.groupBy("Warehouse").agg(count("*").alias("ItemsNeedingRestock")).filter(col("ItemsNeedingRestock") > 2)
df.show()


+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|        true|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|        true|
+------+------------+-----------+----------+--------+------------+-------------+--------

In [0]:
# Scenario 2: Supplier Price Optimization
# 1. Group items by Supplier and compute average price.
from pyspark.sql.functions import avg

df.groupBy("Supplier").agg(avg("UnitPrice").alias("AvgPrice")).show()

# Step 2: Find suppliers who offer items below average price in their category
category_avg = df.groupBy("Category").agg(avg("UnitPrice").alias("CategoryAvg"))

df_avg = df.join(category_avg, on="Category")

below_avg_items = df_avg.filter(col("UnitPrice") < col("CategoryAvg"))

print("Items below category average price:")
below_avg_items.select("ItemID", "ItemName", "Category", "UnitPrice", "CategoryAvg", "Supplier").show()

# Step 3: Tag suppliers with Good Deal if >50% of their items are below market average

total = df.groupBy("Supplier").agg(count("*").alias("TotalItems"))

# Count below-average items per supplier
below = below_avg_items.groupBy("Supplier").agg(count("*").alias("BelowAvgItems"))
result = total.join(below, on="Supplier", how="left").fillna(0)
result = result.withColumn("GoodDeal", (col("BelowAvgItems") / col("TotalItems")) > 0.5)
result.select("Supplier", "TotalItems", "BelowAvgItems", "GoodDeal").show()


+---------+--------+
| Supplier|AvgPrice|
+---------+--------+
|   AVTech| 30000.0|
|TechWorld| 70000.0|
|PrintFast|  8000.0|
| FreezeIt| 25000.0|
|  ChairCo|  6000.0|
+---------+--------+

Items below category average price:
+------+--------+-----------+---------+-----------+---------+
|ItemID|ItemName|   Category|UnitPrice|CategoryAvg| Supplier|
+------+--------+-----------+---------+-----------+---------+
|  I001|  LED TV|Electronics|    30000|    36000.0|   AVTech|
|  I005| Printer|Electronics|     8000|    36000.0|PrintFast|
+------+--------+-----------+---------+-----------+---------+

+---------+----------+-------------+--------+
| Supplier|TotalItems|BelowAvgItems|GoodDeal|
+---------+----------+-------------+--------+
|   AVTech|         1|            1|    true|
|PrintFast|         1|            1|    true|
|TechWorld|         1|            0|   false|
| FreezeIt|         1|            0|   false|
|  ChairCo|         1|            0|   false|
+---------+----------+-----------

In [0]:
# Scenario 3: Cost Forecasting
# 1. Calculate TotalStockValue = StockQty * UnitPrice .
df=df.withColumn("TotalStockValue", col("StockQty") * col("UnitPrice"))

# 2. Identify top 3 highest-value items.
df.orderBy(col("TotalStockValue").desc()).limit(3).show()

# 3. Export the result as a Parquet file partitioned by Warehouse
df.write.mode("overwrite").partitionBy("Warehouse").parquet("path/inventory.parquet")


In [0]:
# Scenario 4: Warehouse Utilization
# Tasks:
# 1. Count items stored per warehouse.
df.groupBy("Warehouse").agg(count("*").alias("ItemCount")).show()

# 2. Average stock per category in each warehouse.
df.groupBy("Warehouse", "Category").agg(avg("StockQty").alias("AvgStock")) \
  .orderBy("Warehouse", "Category").show()

# 3. Determine underutilized warehouses ( total stock < 100)
from pyspark.sql.functions import sum,col
warehouse_stock = df.groupBy("Warehouse").agg(sum("StockQty").alias("TotalStock"))
warehouse_stock.filter(col("TotalStock") < 100).show()

+----------+---------+
| Warehouse|ItemCount|
+----------+---------+
|WarehouseA|        2|
|WarehouseC|        1|
|WarehouseB|        2|
+----------+---------+

+----------+-----------+--------+
| Warehouse|   Category|AvgStock|
+----------+-----------+--------+
|WarehouseA|Electronics|    50.0|
|WarehouseA|  Furniture|    40.0|
|WarehouseB|Electronics|     6.5|
|WarehouseC| Appliances|     5.0|
+----------+-----------+--------+

+----------+----------+
| Warehouse|TotalStock|
+----------+----------+
|WarehouseA|        90|
|WarehouseC|         5|
|WarehouseB|        13|
+----------+----------+



In [0]:
# Scenario 5: Delta Audit Trail
from delta.tables import DeltaTable
from pyspark.sql.functions import col
# 1. Save as Delta table retail_inventory .
df.write.format("delta").mode("overwrite").saveAsTable("retail_inventory")

# 2. Update stock of 'Laptop' to 20
delta_table = DeltaTable.forName(spark, "retail_inventory")

delta_table.update(
    condition="ItemName = 'Laptop'",
    set={"StockQty": "20"}
)

# 3. Delete any item with StockQty = 0
delta_table.delete(condition="StockQty = 0")

# 4. Describe history and query version 0 (previous state)
spark.sql("DESCRIBE HISTORY retail_inventory").show(truncate=False)

version = spark.read.format("delta").option("versionAsOf", 0).table("retail_inventory")
version.show()


+-------+-------------------+----------------+----------------------------------+---------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp          |userId          |userName                          |operation                        |operationParameters                                                                                                                                     |job 

In [0]:
#Scenario 6: Alerts from Restock Logs (Join Task)

from pyspark.sql.functions import col, lit
from delta.tables import DeltaTable

df = spark.read.table("retail_inventory")

df = df.withColumn("RestockedRecently", lit(False))

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("retail_inventory")

restock_df = spark.read.option("header", True).csv("file:/Workspace/Shared/restocks_logs.csv") \
    .withColumn("QuantityAdded", col("QuantityAdded").cast("int"))

# 1. Join with inventory table to update StockQty and Calculate new stock and flag RestockedRecently = true for updated items.
inventory_df = spark.read.table("retail_inventory")

restock_updates = inventory_df.join(restock_df, on="ItemID", how="inner") \
    .withColumn("UpdatedStock", col("StockQty") + col("QuantityAdded")) \
    .withColumn("RestockedRecently", lit(True)) \
    .select("ItemID", "UpdatedStock", "RestockedRecently")

# Use MERGE INTO to update in Delta.

delta_table = DeltaTable.forName(spark, "retail_inventory")

delta_table.alias("target").merge(
    source=restock_updates.alias("source"),
    condition="target.ItemID = source.ItemID"
).whenMatchedUpdate(set={
    "StockQty": "source.UpdatedStock",
    "RestockedRecently": "source.RestockedRecently"
}).execute()
spark.read.table("retail_inventory").select("ItemID", "StockQty", "RestockedRecently").show()


+------+--------+-----------------+
|ItemID|StockQty|RestockedRecently|
+------+--------+-----------------+
|  I003|      40|            false|
|  I004|       5|            false|
|  I002|      50|             true|
|  I001|     110|             true|
|  I005|      18|             true|
+------+--------+-----------------+



In [0]:
#  Scenario 7: Report Generation with SQL Views
# Tasks:
# 1. Create SQL view inventory_summary with:ItemName, Category, StockQty, NeedsReorder, TotalStockValue
from pyspark.sql.functions import col, expr

df_summary = df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel")) \
               .withColumn("TotalStockValue", col("StockQty") * col("UnitPrice")) \
               .select("ItemName", "Category", "StockQty", "NeedsReorder", "TotalStockValue")

df_summary.createOrReplaceTempView("inventory_summary")

spark.sql("SELECT * FROM inventory_summary").show()

# 2. Create view supplier_leaderboard sorted by average price

spark.sql("""
    CREATE OR REPLACE TEMP VIEW supplier_leaderboard AS
    SELECT Supplier, ROUND(AVG(UnitPrice), 2) AS AvgUnitPrice
    FROM retail_inventory
    GROUP BY Supplier
    ORDER BY AvgUnitPrice ASC
""")

spark.sql("SELECT * FROM supplier_leaderboard").show()



+------------+-----------+--------+------------+---------------+
|    ItemName|   Category|StockQty|NeedsReorder|TotalStockValue|
+------------+-----------+--------+------------+---------------+
|      LED TV|Electronics|      50|       false|        1500000|
|      Laptop|Electronics|      10|        true|         700000|
|Office Chair|  Furniture|      40|       false|         240000|
|Refrigerator| Appliances|       5|        true|         125000|
|     Printer|Electronics|       3|        true|          24000|
+------------+-----------+--------+------------+---------------+

+---------+------------+
| Supplier|AvgUnitPrice|
+---------+------------+
|  ChairCo|      6000.0|
|PrintFast|      8000.0|
| FreezeIt|     25000.0|
|   AVTech|     30000.0|
|TechWorld|     70000.0|
+---------+------------+



In [0]:
#  Scenario 8: Advanced Filtering
# Tasks:
# 1. Use when / otherwise to categorize items:
# "Overstocked" (>2x ReorderLevel)
# "LowStock"
from pyspark.sql.functions import when, col

df_categorized = df.withColumn(
    "StockCategory",
    when(col("StockQty") > 2 * col("ReorderLevel"), "Overstocked")
    .when(col("StockQty") < col("ReorderLevel"), "LowStock")
    .otherwise("Normal")
)

df_categorized.select("ItemID", "ItemName", "StockQty", "ReorderLevel", "StockCategory").show()

# 2. Use .filter() and .where() for the same and compare.
df_categorized.filter(col("StockCategory") == "Overstocked").show()
df_categorized.where(col("StockCategory") == "LowStock").show()


+------+------------+--------+------------+-------------+
|ItemID|    ItemName|StockQty|ReorderLevel|StockCategory|
+------+------------+--------+------------+-------------+
|  I001|      LED TV|      50|          20|  Overstocked|
|  I002|      Laptop|      10|          15|     LowStock|
|  I003|Office Chair|      40|          10|  Overstocked|
|  I004|Refrigerator|       5|          10|     LowStock|
|  I005|     Printer|       3|           5|     LowStock|
+------+------------+--------+------------+-------------+

+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+-------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice|Supplier|NeedsReorder|StockCategory|
+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+-------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|  AVTe

In [0]:
#  Scenario 9: Feature Engineering
# Tasks:
# 1. Extract RestockMonth from LastRestocked .
from pyspark.sql.functions import month, current_date, datediff, when, col
df = df.withColumn("RestockMonth", month(col("LastRestocked")))

# 2. Create feature: StockAge = CURRENT_DATE - LastRestocked
df = df.withColumn("StockAge", datediff(current_date(), col("LastRestocked")))

# Step 3: Bucket StockAge into: New, Moderate, Stale
df = df.withColumn(
    "StockStatus",
    when(col("StockAge") <= 30, "New")
    .when((col("StockAge") > 30) & (col("StockAge") <= 90), "Moderate")
    .otherwise("Stale")
)

df.select("ItemID", "ItemName", "LastRestocked", "RestockMonth", "StockAge", "StockStatus").show()


+------+------------+-------------+------------+--------+-----------+
|ItemID|    ItemName|LastRestocked|RestockMonth|StockAge|StockStatus|
+------+------------+-------------+------------+--------+-----------+
|  I003|Office Chair|   2024-03-25|           3|     451|      Stale|
|  I004|Refrigerator|   2024-02-20|           2|     485|      Stale|
|  I002|      Laptop|   2024-04-01|           4|     444|      Stale|
|  I001|      LED TV|   2024-03-15|           3|     461|      Stale|
|  I005|     Printer|   2024-03-30|           3|     446|      Stale|
+------+------------+-------------+------------+--------+-----------+



In [0]:

# Scenario 10: Export Options
# Tasks:
# 1. Write full DataFra
# CSV for analysts
df.write.mode("overwrite").option("header", True).csv("file:/Workspace/Shared/export/inventory/full_data/csv/")
# JSON for integration
df.write.mode("overwrite") .json("file:/Workspace/Shared/export/inventory/full_data/json/")

# Delta for pipelines
df.write .format("delta").mode("overwrite").save("file:/Workspace/Shared/export/inventory/full_data/delta/")
