In [0]:
from pyspark.sql import SparkSession
# Name the application, then obtain the session; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
session_builder = SparkSession.builder.appName("Customer_Orders_Analysis")
spark = session_builder.getOrCreate()

In [0]:

# Both CSV files are read with the same settings: the first row is the header
# and column types are inferred rather than left as strings.
csv_reader = spark.read.option("header", True).option("inferSchema", True)

customers_df = csv_reader.csv("file:/Workspace/Shared/customers.csv")
orders_df = csv_reader.csv("file:/Workspace/Shared/orders.csv")


In [0]:
# Print a heading followed by the inferred schema for each input DataFrame.
for heading, frame in (
    ("Customers Schema:", customers_df),
    ("Orders Schema:", orders_df),
):
    print(heading)
    frame.printSchema()



Customers Schema:
root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

Orders Schema:
root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



In [0]:
# 3. Add a column TotalAmount = Quantity * Price to orders
from pyspark.sql.functions import col

# withColumn returns a NEW DataFrame; it does not modify orders_df in place.
# The result is bound back to orders_df because later cells (the join, the
# TotalAmount > 20000 filter, the per-city average and the SQL queries) all
# read the TotalAmount column from it. Only calling .show() on the result
# would discard the column and make those cells fail on a fresh kernel.
# Re-running this cell is safe: withColumn replaces a same-named column.
orders_df = orders_df.withColumn("TotalAmount", col("Quantity") * col("Price"))
orders_df.show()


+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# 4. Inner-join orders to customers on the shared CustomerID key.
# Passing the key by name keeps a single CustomerID column in the result.
joined_df = orders_df.join(
    customers_df,
    on="CustomerID",
    how="inner",
)
joined_df.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [0]:
# 5. Keep only the orders whose TotalAmount is above 20000
exceeds_threshold = col("TotalAmount") > 20000
orders_df.filter(exceeds_threshold).show()

# 6. Customers who placed more than one order
from pyspark.sql.functions import count

# Count orders per customer first, then keep the customers with a count above 1.
orders_per_customer = orders_df.groupBy("CustomerID").agg(
    count("OrderID").alias("OrderCount")
)
orders_per_customer.filter(col("OrderCount") > 1).show()


+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
+-------+----------+-------+--------+-----+----------+-----------+

+----------+----------+
|CustomerID|OrderCount|
+----------+----------+
|       101|         2|
+----------+----------+



In [0]:
# 7. Average order value (mean TotalAmount) for each City
from pyspark.sql.functions import avg

avg_order_value = avg("TotalAmount").alias("AvgOrderValue")
joined_df.groupBy("City").agg(avg_order_value).show()

# 8. List orders from the most recent OrderDate to the oldest
newest_first = col("OrderDate").desc()
orders_df.orderBy(newest_first).show()


+---------+-------------+
|     City|AvgOrderValue|
+---------+-------------+
|Bangalore|      10000.0|
|   Mumbai|      36500.0|
|    Delhi|      50000.0|
|Hyderabad|      12000.0|
+---------+-------------+

+-------+----------+-------+--------+-----+----------+-----------+
|OrderID|CustomerID|Product|Quantity|Price| OrderDate|TotalAmount|
+-------+----------+-------+--------+-----+----------+-----------+
|   1005|       104|Monitor|       1|12000|2024-04-25|      12000|
|   1004|       101|  Mouse|       3| 1000|2024-04-01|       3000|
|   1003|       103|   Desk|       1|10000|2024-03-15|      10000|
|   1002|       102| Mobile|       2|25000|2024-02-10|      50000|
|   1001|       101| Laptop|       1|70000|2024-01-05|      70000|
+-------+----------+-------+--------+-----+----------+-----------+



In [0]:
# 9. Persist the joined result as Parquet, one partition directory per City.
# "overwrite" replaces whatever already exists at the target path.
(
    joined_df.write
    .partitionBy("City")
    .mode("overwrite")
    .parquet("dbfs:/FileStore/final_orders_by_city")
)


In [0]:
# 10. Register the joined data as the temporary view "orders_view" so it can
# be queried with Spark SQL.
joined_df.createOrReplaceTempView("orders_view")

# Total sales by customer
total_sales_sql = """
    SELECT CustomerID, Name, SUM(TotalAmount) AS TotalSales FROM orders_view
    GROUP BY CustomerID, Name
"""
spark.sql(total_sales_sql).show()
# Count of products per city (counts order rows with a non-null Product)
product_count_sql = """
    SELECT City, COUNT(Product) AS ProductCount
    FROM orders_view
    GROUP BY City
"""
spark.sql(product_count_sql).show()
# Top 2 cities by revenue (sum of TotalAmount, highest first)
top_cities_sql = """
    SELECT City, SUM(TotalAmount) AS Revenue
    FROM orders_view
    GROUP BY City
    ORDER BY Revenue DESC
    LIMIT 2
"""
spark.sql(top_cities_sql).show()

+----------+-----+----------+
|CustomerID| Name|TotalSales|
+----------+-----+----------+
|       101|Aditi|     73000|
|       102|Rohan|     50000|
|       103|Meena|     10000|
|       104|Kabir|     12000|
+----------+-----+----------+

+---------+------------+
|     City|ProductCount|
+---------+------------+
|Bangalore|           1|
|   Mumbai|           2|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+

+------+-------+
|  City|Revenue|
+------+-------+
|Mumbai|  73000|
| Delhi|  50000|
+------+-------+

