# Week 3- Order Analysis with PySpark

# Step 1: Install PySpark

In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, datediff, current_date, sum as spark_sum

#Initialize spark
spark = SparkSession.builder.appName("Week3_CustomerOrderAnalysis").getOrCreate()

# Step 2: Upload CSV

In [3]:
from google.colab import files
uploaded = files.upload()

Saving customer_table.csv to customer_table.csv
Saving delivery_status_table.csv to delivery_status_table.csv
Saving orders_table.csv to orders_table.csv


# Step 3: Load CSV Files into Spark DataFrames

In [11]:
orders = spark.read.csv("orders_table.csv", header=True, inferSchema=True)
delivery_status = spark.read.csv("delivery_status_table.csv", header=True, inferSchema=True)
customers = spark.read.csv("customer_table.csv", header=True, inferSchema=True)

# Preview the data
orders.show(3)
delivery_status.show(3)
customers.show(3)



+--------+-----------+----------+-------------+---------+
|order_id|customer_id|order_date|delivery_date|   status|
+--------+-----------+----------+-------------+---------+
|       1|          1|2024-07-01|   2024-07-03|delivered|
|       2|          2|2024-07-05|   2024-07-08|delivered|
|       3|          3|2024-07-10|   2024-07-12|  delayed|
+--------+-----------+----------+-------------+---------+
only showing top 3 rows

+-----------+--------+--------------+-------------------+
|delivery_id|order_id|current_status|       last_updated|
+-----------+--------+--------------+-------------------+
|          1|       1|     delivered|2024-07-03 10:00:00|
|          2|       2|     delivered|2024-07-08 15:30:00|
|          3|       3|    in transit|2024-07-20 11:45:00|
+-----------+--------+--------------+-------------------+
only showing top 3 rows

+-----------+------------+-----------------+----------+------+
|customer_id|        name|            email|     phone|region|
+-----------

# Step 4: Join Orders with Customers on customer_id

In [17]:
orders_customers = orders.join(customers, on="customer_id", how="left")
orders_customers.show(5)

+-----------+--------+----------+-------------+---------+----------+-------+------------+------------------+----------+------+
|customer_id|order_id|order_date|delivery_date|   status|delay_days|delayed|        name|             email|     phone|region|
+-----------+--------+----------+-------------+---------+----------+-------+------------+------------------+----------+------+
|          1|       1|2024-07-01|   2024-07-03|delivered|       394|      1| rahul kumar| rahul@example.com|9876543210| north|
|          2|       2|2024-07-05|   2024-07-08|delivered|       389|      1|anita sharma| anita@example.com|9123456780| south|
|          3|       3|2024-07-10|   2024-07-12|  delayed|       385|      1|   vijay rao| vijay@example.com|9988776655|  east|
|          4|       4|2024-07-12|   2024-07-15|  delayed|       382|      1|swathi menon|swathi@example.com|9090909090|  west|
|          1|       5|2024-07-18|   2024-07-20|  delayed|       377|      1| rahul kumar| rahul@example.com|987

# Step 5: Group by region and Count Delayed Orders

In [20]:
from pyspark.sql.functions import datediff, current_date, when, sum as spark_sum

# Recalculate delay_days and delayed in orders
orders = orders.withColumn("delay_days", datediff(current_date(), col("delivery_date")))
orders = orders.withColumn("delayed", when(col("delay_days") > 0, 1).otherwise(0))

orders_customers = orders.join(customers, on="customer_id", how="left")

# Group by region and count delayed
region_summary = orders_customers.groupBy("region") \
    .agg(spark_sum("delayed").alias("total_delayed_orders")) \
    .orderBy(col("total_delayed_orders").desc())

region_summary.show()


+------+--------------------+
|region|total_delayed_orders|
+------+--------------------+
| north|                   2|
|  west|                   1|
|  east|                   1|
| south|                   1|
+------+--------------------+



# Step 6: Save the Output as CSV

In [22]:
#  Save to CSV
region_summary.write.mode("overwrite").csv("output/delayed_orders_by_region", header=True)

# Merge parts into one file
!cat output/delayed_orders_by_region/part-*.csv > delayed_orders_by_region.csv

# 2. Download the Output file showing delayed orders by region
from google.colab import files
files.download("delayed_orders_by_region.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>