# Week 3 – Intro to PySpark: Processing Big Data

# Step 1: Install and Initialize PySpark

In [1]:
!pip install pyspark



In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SupplyChainWeek3")
    .getOrCreate()
)


# Step 2: Upload CSVs

In [5]:
# Colab-only helper module; `files` is used again in Step 8 for the download.
from google.colab import files
# Prompt for the CSV files to upload; the result of the call is kept in `uploaded`.
uploaded = files.upload()


Saving inventory_table.csv to inventory_table.csv
Saving orders_table.csv to orders_table.csv
Saving suppliers_table.csv to suppliers_table.csv


# Step 3: Load Files into Spark DataFrames

In [8]:
# Every CSV has a header row; let Spark infer the column types from the data.
CSV_READ_OPTIONS = {"header": True, "inferSchema": True}

orders_df = spark.read.csv("orders_table.csv", **CSV_READ_OPTIONS)
suppliers_df = spark.read.csv("suppliers_table.csv", **CSV_READ_OPTIONS)
inventory_df = spark.read.csv("inventory_table.csv", **CSV_READ_OPTIONS)

# Step 4: Check Data Schemas

In [9]:
# Print a heading followed by the inferred schema for each loaded table.
schema_report = (
    ("Orders Table Schema:", orders_df),
    ("Suppliers Table Schema:", suppliers_df),
    ("Inventory Table Schema:", inventory_df),
)
for heading, frame in schema_report:
    print(heading)
    frame.printSchema()


Orders Table Schema:
root
 |-- order_id: integer (nullable = true)
 |-- supplier_id: integer (nullable = true)
 |-- inventory_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- delivery_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- quantity: integer (nullable = true)

Suppliers Table Schema:
root
 |-- supplier_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: long (nullable = true)
 |-- location: string (nullable = true)

Inventory Table Schema:
root
 |-- inventory_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- reorder_level: integer (nullable = true)
 |-- supplier_id: integer (nullable = true)



# Step 5: Compute Shipment Delays

In [10]:
# delivery_date was inferred as a string, so cast it to a date first, then
# derive delay_days as the number of days from delivery_date up to today.
# NOTE(review): delay_days depends on the run date via current_date(), so the
# values change from one day to the next — confirm this is the intended
# definition of a shipment delay.
orders_df = (
    orders_df
    .withColumn("delivery_date", col("delivery_date").cast("date"))
    .withColumn("delay_days", datediff(current_date(), col("delivery_date")))
)


# Step 6: Filter Delayed Orders

In [11]:
# Keep only the orders with a positive delay_days and preview the result.
is_delayed = col("delay_days") > 0
delayed_orders_df = orders_df.where(is_delayed)
delayed_orders_df.show()


+--------+-----------+------------+----------+-------------+---------+--------+----------+
|order_id|supplier_id|inventory_id|order_date|delivery_date|   status|quantity|delay_days|
+--------+-----------+------------+----------+-------------+---------+--------+----------+
|       1|          1|           1|2025-07-01|   2025-07-03|Delivered|      30|        29|
|       2|          2|           2|2025-07-05|   2025-07-23|Delivered|      10|         9|
|       3|          2|           3|2025-07-06|   2025-07-09|Delivered|      20|        23|
+--------+-----------+------------+----------+-------------+---------+--------+----------+



# Step 7: Count Delayed Orders Per Supplier

In [12]:
# Count delayed orders per supplier, then give the count column a clearer name.
delayed_counts = delayed_orders_df.groupBy("supplier_id").count()
delay_summary = delayed_counts.withColumnRenamed("count", "delayed_count")

delay_summary.show()


+-----------+-------------+
|supplier_id|delayed_count|
+-----------+-------------+
|          1|            1|
|          2|            2|
+-----------+-------------+



# Step 8: Save Results as CSV and Download

In [14]:
# Save result to output folder
delay_summary.write.mode("overwrite").csv("output/delayed_summary_csv", header=True)

# Combine CSV to single file
!cat output/delayed_summary_csv/part-*.csv > delayed_summary.csv

# Download the final CSV
files.download("delayed_summary.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>