# Customer Orders Analysis using PySpark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, count, avg, desc, sum
from pyspark.sql.types import DateType

# 1. Spark session
# Build the session in two steps: name the application, then obtain the session.
session_builder = SparkSession.builder.appName("Customer Orders Analysis")
spark = session_builder.getOrCreate()

# 2. Load CSV files
# Both files carry a header row; column types are inferred from the data.
df_customers = spark.read.csv("customers.csv", header=True, inferSchema=True)
df_orders = spark.read.csv("orders.csv", header=True, inferSchema=True)


In [2]:
# 3. Print schema
# Show the inferred column names and types of each input, customers first.
for frame in (df_customers, df_orders):
    frame.printSchema()


root
 |-- CustomerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Age: integer (nullable = true)

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: integer (nullable = true)
 |-- OrderDate: date (nullable = true)



In [3]:
# 4. Add TotalAmount = Quantity * Price
# Quantity and Price are both inferred as 32-bit integers, and an int * int product
# stays a 32-bit int, so a large order can overflow (wrapping silently when ANSI mode
# is off). Widening one operand to long makes the product a 64-bit value.
df_orders = df_orders.withColumn(
    "TotalAmount", col("Quantity").cast("long") * col("Price")
)

# 5. Join on CustomerID
# Inner join: orders without a matching customer row are dropped. Passing the key by
# name keeps a single CustomerID column in the result.
df_joined = df_orders.join(df_customers, on="CustomerID", how="inner")


In [4]:
# 6. Filter orders where TotalAmount > 20000
# Keep only the joined rows whose order total is strictly above 20000.
is_high_value = col("TotalAmount") > 20000
high_value_orders = df_joined.where(is_high_value)
high_value_orders.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|  City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|Mumbai| 28|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan| Delhi| 35|
+----------+-------+-------+--------+-----+----------+-----------+-----+------+---+



In [5]:
# 7. Customers with more than 1 order
# Count joined rows per customer (keyed on id and name), then keep the repeat buyers.
orders_per_customer = df_joined.groupBy("CustomerID", "Name").count()
multiple_orders = orders_per_customer.where(orders_per_customer["count"] > 1)
multiple_orders.show()


+----------+-----+-----+
|CustomerID| Name|count|
+----------+-----+-----+
|       101|Aditi|    2|
+----------+-----+-----+



In [6]:
# 8. Average order value by City
# Mean of TotalAmount over the joined rows of each city.
orders_by_city = df_joined.groupBy("City")
avg_order_city = orders_by_city.agg(
    avg(col("TotalAmount")).alias("AverageOrderValue")
)
avg_order_city.show()


+---------+-----------------+
|     City|AverageOrderValue|
+---------+-----------------+
|Bangalore|          10000.0|
|   Mumbai|          36500.0|
|    Delhi|          50000.0|
|Hyderabad|          12000.0|
+---------+-----------------+



In [7]:
# 9. Sort by OrderDate descending
# NOTE(review): the printed schema already lists OrderDate as date, so this cast is
# presumably a guard for environments where the column is inferred differently.
df_joined = df_joined.withColumn("OrderDate", col("OrderDate").cast("date"))
# Most recent orders first.
sorted_orders = df_joined.sort(desc("OrderDate"))
sorted_orders.show()


+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|CustomerID|OrderID|Product|Quantity|Price| OrderDate|TotalAmount| Name|     City|Age|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+
|       104|   1005|Monitor|       1|12000|2024-04-25|      12000|Kabir|Hyderabad| 30|
|       101|   1004|  Mouse|       3| 1000|2024-04-01|       3000|Aditi|   Mumbai| 28|
|       103|   1003|   Desk|       1|10000|2024-03-15|      10000|Meena|Bangalore| 41|
|       102|   1002| Mobile|       2|25000|2024-02-10|      50000|Rohan|    Delhi| 35|
|       101|   1001| Laptop|       1|70000|2024-01-05|      70000|Aditi|   Mumbai| 28|
+----------+-------+-------+--------+-----+----------+-----------+-----+---------+---+



In [8]:
# 10. Write as Parquet partitioned by City
# Replaces any existing output at the target path; rows are split into one
# partition directory per City value.
(
    sorted_orders.write
    .partitionBy("City")
    .mode("overwrite")
    .parquet("customer_orders_parquet")
)


In [9]:
# 11. Create temp view
# Register the joined frame so the queries below can address it from Spark SQL.
df_joined.createOrReplaceTempView("orders_view")

# i. Total sales by customer
# Group on CustomerID as well as Name: names are not guaranteed to be unique, and
# grouping on Name alone would merge the totals of two different customers who share
# one. CustomerID is left out of the SELECT list so the displayed columns are unchanged.
spark.sql("""
SELECT Name, SUM(TotalAmount) AS TotalSales
FROM orders_view
GROUP BY CustomerID, Name
ORDER BY TotalSales DESC
""").show()

# ii. Count of products per city
# NOTE(review): COUNT(*) counts order rows per city, not distinct products —
# confirm this is the intended meaning of ProductCount.
product_count_sql = """
SELECT City, COUNT(*) AS ProductCount
FROM orders_view
GROUP BY City
"""
spark.sql(product_count_sql).show()

# iii. Top 2 cities by revenue
# Sum TotalAmount per city, rank from highest to lowest, and keep the first two rows.
top_cities_sql = """
SELECT City, SUM(TotalAmount) AS Revenue
FROM orders_view
GROUP BY City
ORDER BY Revenue DESC
LIMIT 2
"""
spark.sql(top_cities_sql).show()


+-----+----------+
| Name|TotalSales|
+-----+----------+
|Aditi|     73000|
|Rohan|     50000|
|Kabir|     12000|
|Meena|     10000|
+-----+----------+

+---------+------------+
|     City|ProductCount|
+---------+------------+
|Bangalore|           1|
|   Mumbai|           2|
|    Delhi|           1|
|Hyderabad|           1|
+---------+------------+

+------+-------+
|  City|Revenue|
+------+-------+
|Mumbai|  73000|
| Delhi|  50000|
+------+-------+

