In [0]:
# Display the active SparkSession handle.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably the
# notebook runtime pre-defines it; confirm before running in another environment.
spark

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, sum


In [0]:
# 1. Ingest the CSV files into two PySpark DataFrames.
# For both files the first line is treated as the header and column types
# are inferred from the data.

# Customers
df_customers = spark.read.csv(
    "file:/Workspace/Shared/customers.csv",
    header=True,
    inferSchema=True,
)

# Orders
df_orders = spark.read.csv(
    "file:/Workspace/Shared/orders.csv",
    header=True,
    inferSchema=True,
)


In [0]:
# 2. Print the inferred schema of both DataFrames (customers first, then orders).
for frame in (df_customers, df_orders):
    frame.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



In [0]:
# 3. Add a column TotalAmount = Quantity * Price to orders.
line_total = col("Quantity") * col("Price")
df_orders = df_orders.withColumn("TotalAmount", line_total)

# Display the orders with the new column.
df_orders.show()


+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# 4. Join both DataFrames on CustomerID.
# Inner join: only orders whose CustomerID also appears in customers are kept.
df_joined = df_orders.join(df_customers, "CustomerID", "inner")

# Display the joined rows.
df_joined.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [0]:
# 5. Filter orders where TotalAmount > 20000.
is_high_value = col("TotalAmount") > 20000
df_high_value = df_joined.where(is_high_value)

# Display the matching orders.
df_high_value.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|  City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan| Delhi| 35|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+



In [0]:
# 6. Show customers who placed more than 1 order.

# Number of orders per customer, counted on OrderID.
df_order_count = (
    df_orders
    .groupBy("CustomerID")
    .agg(count(col("OrderID")).alias("OrderCount"))
)

# Keep customers with more than one order and attach their customer details.
df_multiple_orders = (
    df_order_count
    .where(col("OrderCount") > 1)
    .join(df_customers, on="CustomerID")
)

# Display id, name and order count only.
df_multiple_orders.select("CustomerID", "Name", "OrderCount").show()


+----------+-----+----------+
|CustomerID| Name|OrderCount|
+----------+-----+----------+
|       101|Aditi|         2|
+----------+-----+----------+



In [0]:
# 7. Group orders by City and get the average order value.
mean_total = avg(col("TotalAmount")).alias("AverageOrderValue")
df_avg_order = df_joined.groupBy("City").agg(mean_total)

# Display one row per city.
df_avg_order.show()


+---------+-----------------+
|     City|AverageOrderValue|
+---------+-----------------+
|Bangalore|          10000.0|
|   Mumbai|          36500.0|
|    Delhi|          50000.0|
|Hyderabad|          12000.0|
+---------+-----------------+



In [0]:
# 8. Sort orders by OrderDate in descending order (latest first).
df_sorted_orders = df_joined.sort(df_joined["OrderDate"].desc())

# Display the sorted orders.
df_sorted_orders.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [0]:
# 9. Write the final result as a Parquet file partitioned by City.
# Any existing output at the target path is overwritten.
df_joined.write.parquet(
    "/Shared/output/orders_parquet",
    mode="overwrite",
    partitionBy="City",
)


In [0]:
# 10. Create a temporary view and run Spark SQL.
# Register the joined orders/customers data under the name the SQL queries use.
df_joined.createOrReplaceTempView(name="orders_view")



In [0]:
# Spark SQL reports over the `orders_view` temporary view.
# A %sql cell renders only the result of its last statement, so each query is
# executed through spark.sql() and shown explicitly.

# a) Total sales by customer.
# Grouped on CustomerID as well as Name so that two different customers who
# happen to share a name are not merged into a single row.
spark.sql("""
    SELECT CustomerID, Name, SUM(TotalAmount) AS TotalSales
    FROM orders_view
    GROUP BY CustomerID, Name
""").show()

# b) Count of products per city (COUNT(Product) skips rows with a NULL Product).
spark.sql("""
    SELECT City, COUNT(Product) AS ProductCount
    FROM orders_view
    GROUP BY City
""").show()

# c) Top 2 cities by revenue.
# City is a secondary sort key so that ties on Revenue give a stable result.
spark.sql("""
    SELECT City, SUM(TotalAmount) AS Revenue
    FROM orders_view
    GROUP BY City
    ORDER BY Revenue DESC, City
    LIMIT 2
""").show()


City,Revenue
Mumbai,73000
Delhi,50000
