#**Assessment-3**

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

In [0]:
spark = SparkSession.builder.appName("dbshell-01").getOrCreate()

# **Inventory Alerting System**

In [0]:
# 1.1 Load the data using PySpark.
df = spark.read.csv("/FileStore/tables/inventory_supply.csv", header=True, inferSchema=True)

In [0]:
# 1.2 Create a new column NeedsReorder = StockQty < ReorderLevel .
df = df.withColumn("NeedsReorder", F.when(df.StockQty < df.ReorderLevel, 1).otherwise(0))
df.select(["ItemName", "StockQty", "ReorderLevel", "NeedsReorder"]).show()

+------------+--------+------------+------------+
|    ItemName|StockQty|ReorderLevel|NeedsReorder|
+------------+--------+------------+------------+
|      LED TV|      50|          20|           0|
|      Laptop|      10|          15|           1|
|Office Chair|      40|          10|           0|
|Refrigerator|       5|          10|           1|
|     Printer|       3|           5|           1|
+------------+--------+------------+------------+



In [0]:
# 1.3 Create a view of all items that need restocking.
needRestock = df.filter(df.NeedsReorder == 1)
needRestock.createOrReplaceTempView("needs_restock")

# **Supplier Price Optimization**

In [0]:
# 2.1 Group items by Supplier and compute average price.
supplierAverage = df.groupBy("Supplier", "ItemName").agg(
  F.round(F.mean("UnitPrice")).alias("AverageUnitPrice")
)
supplierAverage.show()

+---------+------------+----------------+
| Supplier|    ItemName|AverageUnitPrice|
+---------+------------+----------------+
|TechWorld|      Laptop|         70000.0|
|PrintFast|     Printer|          8000.0|
| FreezeIt|Refrigerator|         25000.0|
|   AVTech|      LED TV|         30000.0|
|  ChairCo|Office Chair|          6000.0|
+---------+------------+----------------+



In [0]:
# 2.2 Find which suppliers offer items below average price in their category.
priceAverage = supplierAverage.agg(F.mean("AverageUnitPrice")).collect()[0][0]
supplierAverage.filter(supplierAverage.AverageUnitPrice <= priceAverage).show()

+---------+------------+----------------+
| Supplier|    ItemName|AverageUnitPrice|
+---------+------------+----------------+
|PrintFast|     Printer|          8000.0|
| FreezeIt|Refrigerator|         25000.0|
|  ChairCo|Office Chair|          6000.0|
+---------+------------+----------------+



In [0]:
# 2.3 Tag suppliers with Good Deal if >50% of their items are below market average.
df = df.withColumn("BelowAverage", F.when(F.col("UnitPrice") <= priceAverage, 1).otherwise(0))

average = df.groupBy("ItemName").agg(
    F.sum("BelowAverage").alias("BelowAverageCount"),
    F.count("*").alias("TotalCount"),
    F.round(F.col("BelowAverageCount") / F.col("TotalCount"), 2).alias("percentBelow")
)

supplierAverage.join(average, on="ItemName", how="inner") \
    .withColumn("Tag", F.when(F.col("percentBelow") >= 0.50, "GoodDeal").otherwise("NotGoodDeal")) \
    .select(["ItemName", "Supplier", "Tag"]) \
    .show()

+------------+---------+-----------+
|    ItemName| Supplier|        Tag|
+------------+---------+-----------+
|      Laptop|TechWorld|NotGoodDeal|
|     Printer|PrintFast|   GoodDeal|
|Refrigerator| FreezeIt|   GoodDeal|
|      LED TV|   AVTech|NotGoodDeal|
|Office Chair|  ChairCo|   GoodDeal|
+------------+---------+-----------+



# **Cost Forecasting**

In [0]:
# 3.1 Calculate TotalStockValue = StockQty * UnitPrice .
df = df.withColumn("TotalStockValue", df.StockQty * df.UnitPrice)
df.select(["ItemName", "TotalStockValue"]).show()

+------------+---------------+
|    ItemName|TotalStockValue|
+------------+---------------+
|      LED TV|        1500000|
|      Laptop|         700000|
|Office Chair|         240000|
|Refrigerator|         125000|
|     Printer|          24000|
+------------+---------------+



In [0]:
# 3.2 Identify top 3 highest-value items.
df.groupBy("ItemName") \
    .agg(
    F.sum("TotalStockValue").alias("TotalValue")
    ) \
    .sort("TotalValue", ascending=False) \
    .limit(3) \
    .show()

+------------+----------+
|    ItemName|TotalValue|
+------------+----------+
|      LED TV|   1500000|
|      Laptop|    700000|
|Office Chair|    240000|
+------------+----------+



In [0]:
# 3.3 Export the result as a Parquet file partitioned by Warehouse .
df.write.mode("overwrite").parquet("file:/Users/tharunaadhi6@gmail.com/delta_tables/export/warehouse_dataPARQUET", partitionBy="Warehouse")

# **Warehouse Utilization**

In [0]:
# 4.1 Count items stored per warehouse.
df.groupBy("Warehouse").agg(
  F.countDistinct("ItemID").alias("ItemCount")
).show()

+----------+---------+
| Warehouse|ItemCount|
+----------+---------+
|WarehouseA|        2|
|WarehouseC|        1|
|WarehouseB|        2|
+----------+---------+



In [0]:
# 4.2 Average stock per category in each warehouse.
df.groupBy("Category").agg(
    F.round(F.mean("StockQty"), 2).alias("AverageStock")
).show()

+-----------+------------+
|   Category|AverageStock|
+-----------+------------+
|Electronics|        21.0|
|  Furniture|        40.0|
| Appliances|         5.0|
+-----------+------------+



In [0]:
# 4.3 Determine underutilized warehouses ( total stock < 100 ).
df.groupBy("Warehouse") \
    .agg(
    F.sum("StockQty").alias("TotalStock")
    ) \
    .withColumn("UnderUtilized", F.when(F.col("TotalStock") < 100, "Yes").otherwise("No")) \
    .show()

+----------+----------+-------------+
| Warehouse|TotalStock|UnderUtilized|
+----------+----------+-------------+
|WarehouseA|        90|          Yes|
|WarehouseC|         5|          Yes|
|WarehouseB|        13|          Yes|
+----------+----------+-------------+



# **Delta Audit Trail**

In [0]:
# 5.1 Save as Delta table retail_inventory .
spark.sql("CREATE DATABASE IF NOT EXISTS inventory")
spark.sql("USE inventory")

df.write.format("delta").mode("overwrite").saveAsTable("inventory.retail_inventory")

In [0]:
# 5.2 Update stock of 'Laptop' to 20.
spark.sql("""
          UPDATE retail_inventory
          SET StockQty = 20
          WHERE ItemName = 'Laptop'
          """)

Out[174]: DataFrame[num_affected_rows: bigint]

In [0]:
# 5.3 Delete any item with StockQty = 0 .
spark.sql("""
          DELETE FROM retail_inventory
          WHERE StockQty = 0
          """)

Out[175]: DataFrame[num_affected_rows: bigint]

In [0]:
# 5.4 Run DESCRIBE HISTORY and query VERSION AS OF previous state.
spark.sql("""
          DESCRIBE HISTORY retail_inventory
          """).select(["version", "operation"]).show(truncate=False)

spark.sql("""
          SELECT * FROM retail_inventory VERSION AS OF 1
          """).show()

+-------+---------------------------------+
|version|operation                        |
+-------+---------------------------------+
|3      |DELETE                           |
|2      |UPDATE                           |
|1      |CREATE OR REPLACE TABLE AS SELECT|
|0      |CREATE OR REPLACE TABLE AS SELECT|
+-------+---------------------------------+

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+------------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|BelowAverage|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+------------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|           0|           0|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-0

# **Alerts from Restock Logs (Join Task)**

In [0]:
data = [
  ("I002","2024-04-20",10),
  ("I005","2024-04-22",5),
  ("I001","2024-04-25",20)
]
columns = ["ItemID","RestockDate","QuantityAdded"]

restock_logs = spark.createDataFrame(data, columns)

In [0]:
# 6.1 Join with inventory table to update StockQty.
dfJoined = df.join(restock_logs, on="ItemID", how="left")
newStockdf = dfJoined.withColumn("StockQty", 
    F.coalesce(F.col("StockQty") + F.col("QuantityAdded"), F.col("StockQty"))) \
    .withColumn("RestockDate", F.col("RestockDate")) \
    .select(df.columns)
dfJoined.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+------------+---------------+-----------+-------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|BelowAverage|TotalStockValue|RestockDate|QuantityAdded|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+------------+---------------+-----------+-------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|           0|           0|        1500000| 2024-04-25|           20|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|           1|           0|         700000| 2024-04-20|           10|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|           0|           1|         240000|       null| 

In [0]:
# 6.2 Calculate new stock and flag RestockedRecently = true for updated items.
newStockdf = newStockdf.withColumn("RestockedRecently", 
    F.when(F.datediff(F.current_date(), F.col("LastRestocked")) < 180, True)
    .otherwise(False))

In [0]:
# 6.3 Use MERGE INTO to update in Delta.
newStockdf.createOrReplaceTempView("new_stock_update")

spark.sql("""
          MERGE INTO retail_inventory AS target
          USING new_stock_update AS source
          ON target.ItemID = source.ItemID
          WHEN MATCHED THEN UPDATE SET *
          WHEN NOT MATCHED THEN INSERT *
          """)

Out[191]: DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

#**Report Generation with SQL Views Tasks**

In [0]:
# 7.1 Create SQL view inventory_summary with:
# ItemName, Category, StockQty, NeedsReorder, TotalStockValue
df.groupBy(["ItemName", "Category"]).agg(
  F.sum("StockQty").alias("StockQty"),
  F.max("NeedsReorder").alias("NeedsReorder"),
  F.sum("TotalStockValue").alias("TotalStockValue")
).show()

+------------+-----------+--------+------------+---------------+
|    ItemName|   Category|StockQty|NeedsReorder|TotalStockValue|
+------------+-----------+--------+------------+---------------+
|      Laptop|Electronics|      10|           1|         700000|
|Office Chair|  Furniture|      40|           0|         240000|
|     Printer|Electronics|       3|           1|          24000|
|      LED TV|Electronics|      50|           0|        1500000|
|Refrigerator| Appliances|       5|           1|         125000|
+------------+-----------+--------+------------+---------------+



In [0]:
# 7.2 Create view supplier_leaderboard sorted by average price
supplierLeaderboard = supplierAverage \
    .sort("AverageUnitPrice", ascending=False) \
    .select(["Supplier", "AverageUnitPrice"]) 

supplierLeaderboard.createOrReplaceTempView("supplierLeaderboard")
supplierLeaderboard.show()

+---------+----------------+
| Supplier|AverageUnitPrice|
+---------+----------------+
|TechWorld|         70000.0|
|   AVTech|         30000.0|
| FreezeIt|         25000.0|
|PrintFast|          8000.0|
|  ChairCo|          6000.0|
+---------+----------------+



# **Advanced Filtering**

In [0]:
# 8.1 Use when / otherwise to categorize items:
# "Overstocked" (>2x ReorderLevel)
# "LowStock"
df.withColumn(
  "StockStatus", 
  F.when(F.col("StockQty") > F.col("Reorderlevel") * 2, "overStocked").otherwise("Lowstock")
  ) \
  .select(["ItemName", "StockQty", "ReorderLevel", "StockStatus"]) \
  .show()

+------------+--------+------------+-----------+
|    ItemName|StockQty|ReorderLevel|StockStatus|
+------------+--------+------------+-----------+
|      LED TV|      50|          20|overStocked|
|      Laptop|      10|          15|   Lowstock|
|Office Chair|      40|          10|overStocked|
|Refrigerator|       5|          10|   Lowstock|
|     Printer|       3|           5|   Lowstock|
+------------+--------+------------+-----------+



In [0]:
# 8.2 Use .filter() and .where() for the same and compare.
df.filter(df.StockQty > 35) \
    .select(["ItemName", "StockQty"]) \
    .show()

df.where(df.StockQty > 35) \
    .select(["ItemName", "StockQty"]) \
    .show()

+------------+--------+
|    ItemName|StockQty|
+------------+--------+
|      LED TV|      50|
|Office Chair|      40|
+------------+--------+

+------------+--------+
|    ItemName|StockQty|
+------------+--------+
|      LED TV|      50|
|Office Chair|      40|
+------------+--------+



# **Feature Engineering**

In [0]:
# 9.1 Extract RestockMonth from LastRestocked .
df.withColumn("RestockMonth", F.month("LastRestocked")) \
  .select(["ItemName", "LastRestocked", "RestockMonth"]) \
  .show()

+------------+-------------+------------+
|    ItemName|LastRestocked|RestockMonth|
+------------+-------------+------------+
|      LED TV|   2024-03-15|           3|
|      Laptop|   2024-04-01|           4|
|Office Chair|   2024-03-25|           3|
|Refrigerator|   2024-02-20|           2|
|     Printer|   2024-03-30|           3|
+------------+-------------+------------+



In [0]:
# 9.2 Create feature: StockAge = CURRENT_DATE - LastRestocked
df = df.withColumn("StockAge", F.datediff(F.current_date(), F.col("LastRestocked")))

df.select(["ItemName", "LastRestocked", "StockAge"]) .show()

+------------+-------------+--------+
|    ItemName|LastRestocked|StockAge|
+------------+-------------+--------+
|      LED TV|   2024-03-15|     461|
|      Laptop|   2024-04-01|     444|
|Office Chair|   2024-03-25|     451|
|Refrigerator|   2024-02-20|     485|
|     Printer|   2024-03-30|     446|
+------------+-------------+--------+



In [0]:
# 9.3 Bucket StockAge into: New, Moderate, Stale
df.withColumn(
    "StockAge", 
    F.when(F.col("StockAge") < 180, "New") \
    .when(F.col("StockAge") < 360, "Moderate") \
    .otherwise("Stale")
    ) \
    .select(["ItemName", "LastRestocked", "StockAge"]) \
    .show()

+------------+-------------+--------+
|    ItemName|LastRestocked|StockAge|
+------------+-------------+--------+
|      LED TV|   2024-03-15|   Stale|
|      Laptop|   2024-04-01|   Stale|
|Office Chair|   2024-03-25|   Stale|
|Refrigerator|   2024-02-20|   Stale|
|     Printer|   2024-03-30|   Stale|
+------------+-------------+--------+



# **Export Options**

In [0]:
# 10.1 1. Write full DataFrame to:
# CSV for analysts
df.write.mode("overwrite").csv("file:/Users/tharunaadhi6@gmail.com/delta_tables/export/csv/inventory_details")
# JSON for integration
df.write.mode("overwrite").json("file:/Users/tharunaadhi6@gmail.com/delta_tables/export/json/inventory_intergation_data")
# Delta for pipelines
df.write.mode("overwrite").format("delta").save("/Users/tharunaadhi6@gmail.com/delta_tables/export/delta_table/inventory_dlt")



In [0]:
# 10.2 Save with meaningful file and partition names like /export/inventory/stale_items/
filterddf = df.filter(df.StockAge == "Stale")
filterddf.write.mode("overwrite").option("partitionBy", "Status").json("file:/Users/tharunaadhi6@gmail.com/delta_tables/export/inventory/stale_items/staleJSON")