In [1]:
from google.colab import files

# Open Colab's file upload dialog and keep whatever it returns in `uploaded`.
# The load cell further down reads "stock_movements.csv" by relative path, so
# the uploaded file must be saved under exactly that name.
# NOTE(review): `uploaded` is presumably a mapping of filename -> file
# contents - confirm against the Colab docs before relying on it.
uploaded = files.upload()

Saving stock_movements.csv to stock_movements.csv


In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session if there is one instead of building a second.
spark = (
    SparkSession.builder
    .appName("Inventory Stock Analysis")
    .getOrCreate()
)


In [3]:
# Load the uploaded stock movements file: the first row supplies the column
# names and Spark infers each column's type from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("stock_movements.csv")
df.show()


+----------+------------+--------+-------------+-------------+
|product_id|warehouse_id|quantity|movement_type|movement_date|
+----------+------------+--------+-------------+-------------+
|      P010|         W02|      77|           IN|   2024-09-11|
|      P010|         W03|     318|           IN|   2025-04-01|
|      P009|         W04|      83|           IN|   2024-11-10|
|      P003|         W02|      87|           IN|   2025-05-15|
|      P005|         W04|      58|           IN|   2025-04-06|
|      P002|         W02|    -439|          OUT|   2025-02-05|
|      P003|         W01|      37|           IN|   2024-12-31|
|      P006|         W02|    -348|          OUT|   2025-05-02|
|      P003|         W01|     107|           IN|   2024-07-16|
|      P008|         W02|     172|           IN|   2024-12-29|
|      P006|         W01|     355|           IN|   2025-02-28|
|      P001|         W05|      98|           IN|   2025-03-24|
|      P001|         W05|     405|           IN|   2025

In [5]:
# Clean / transform the raw movements
from pyspark.sql.functions import col

# Force `quantity` to an integer column, then discard every row that still
# contains a null in any column.
df = (
    df.withColumn("quantity", col("quantity").cast("int"))
    .na.drop()
)


#Calculate Net Stock per Product per Warehouse

In [6]:
# Sum the signed movement quantities for each (warehouse, product) pair.
# In the sample shown above OUT rows carry negative quantities, so the sum
# is the net stock for that pair.
df_agg = (
    df.groupBy("warehouse_id", "product_id")
    .agg({"quantity": "sum"})  # produces a column named "sum(quantity)"
    .withColumnRenamed("sum(quantity)", "total_quantity")
)
df_agg.show()


+------------+----------+--------------+
|warehouse_id|product_id|total_quantity|
+------------+----------+--------------+
|         W03|      P004|           102|
|         W01|      P008|           124|
|         W02|      P008|           172|
|         W04|      P003|            29|
|         W05|      P001|           246|
|         W01|      P001|           -20|
|         W03|      P010|           832|
|         W01|      P004|           394|
|         W02|      P001|          -371|
|         W02|      P003|          -396|
|         W02|      P002|          -295|
|         W05|      P010|            68|
|         W03|      P009|            85|
|         W04|      P009|           366|
|         W01|      P005|           135|
|         W02|      P010|            77|
|         W04|      P010|           613|
|         W04|      P005|           389|
|         W05|      P009|           449|
|         W02|      P006|          -611|
+------------+----------+--------------+
only showing top

#Flag Overstocked / Understocked Items


In [7]:
# `col` is imported here as well so this cell does not depend on the import
# made in the earlier cleaning cell.
from pyspark.sql.functions import col, when

# Cut-offs for the stock_status label, in units of total_quantity.
UNDERSTOCK_THRESHOLD = 20    # strictly below this -> "Understocked"
OVERSTOCK_THRESHOLD = 1000   # strictly above this -> "Overstocked"

# Label every (warehouse, product) row by its net quantity. The branches are
# evaluated in order and anything in between the two cut-offs is "Normal".
# Note that negative totals (more stock moved out than in) also fall into
# "Understocked" because they are below the lower cut-off.
df_flagged = df_agg.withColumn(
    "stock_status",
    when(col("total_quantity") < UNDERSTOCK_THRESHOLD, "Understocked")
    .when(col("total_quantity") > OVERSTOCK_THRESHOLD, "Overstocked")
    .otherwise("Normal"),
)
df_flagged.show()


+------------+----------+--------------+------------+
|warehouse_id|product_id|total_quantity|stock_status|
+------------+----------+--------------+------------+
|         W03|      P004|           102|      Normal|
|         W01|      P008|           124|      Normal|
|         W02|      P008|           172|      Normal|
|         W04|      P003|            29|      Normal|
|         W05|      P001|           246|      Normal|
|         W01|      P001|           -20|Understocked|
|         W03|      P010|           832|      Normal|
|         W01|      P004|           394|      Normal|
|         W02|      P001|          -371|Understocked|
|         W02|      P003|          -396|Understocked|
|         W02|      P002|          -295|Understocked|
|         W05|      P010|            68|      Normal|
|         W03|      P009|            85|      Normal|
|         W04|      P009|           366|      Normal|
|         W01|      P005|           135|      Normal|
|         W02|      P010|   

In [8]:
# Export the flagged stock table as CSV with a header row, replacing any
# output left at this path by a previous run.
status_writer = df_flagged.write.mode("overwrite").option("header", True)
status_writer.csv("warehouse_stock_status")
