In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, sum, desc

spark = SparkSession.builder.appName("ECommerce-Capstone").getOrCreate()

# Customers
customers_data = [
    (1, "Rahul Sharma", "Bangalore", 28),
    (2, "Priya Singh", "Delhi", 32),
    (3, "Aman Kumar", "Hyderabad", 25),
    (4, "Sneha Reddy", "Chennai", 35),
    (5, "Arjun Mehta", "Mumbai", 30),
    (6, "Divya Nair", "Delhi", 29)
]
customers_cols = ["customer_id", "name", "city", "age"]
customers_df = spark.createDataFrame(customers_data, customers_cols)

# Products
products_data = [
    (101, "Laptop", "Electronics", 55000),
    (102, "Mobile", "Electronics", 25000),
    (103, "Headphones", "Electronics", 3000),
    (104, "Chair", "Furniture", 5000),
    (105, "Book", "Stationery", 700),
    (106, "Shoes", "Fashion", 2500)
]
products_cols = ["product_id", "product_name", "category", "price"]
products_df = spark.createDataFrame(products_data, products_cols)

# Orders
orders_data = [
    (1001, 1, 101, 1),
    (1002, 2, 102, 2),
    (1003, 1, 103, 3),
    (1004, 3, 104, 1),
    (1005, 5, 105, 5),
    (1006, 6, 106, 2),
    (1007, 7, 101, 1)  # invalid customer
]
orders_cols = ["order_id", "customer_id", "product_id", "quantity"]
orders_df = spark.createDataFrame(orders_data, orders_cols)


In [2]:
# 1. Select all customer names and their cities
customers_df.select("name", "city").show()

# 2. List all distinct product categories
products_df.select("category").distinct().show()

# 3. Filter customers older than 30
customers_df.filter(col("age") > 30).show()


+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|    Delhi|
+------------+---------+

+-----------+
|   category|
+-----------+
|Electronics|
| Stationery|
|    Fashion|
|  Furniture|
+-----------+

+-----------+-----------+-------+---+
|customer_id|       name|   city|age|
+-----------+-----------+-------+---+
|          2|Priya Singh|  Delhi| 32|
|          4|Sneha Reddy|Chennai| 35|
+-----------+-----------+-------+---+



In [3]:
# 4. Total number of orders placed per customer
orders_df.groupBy("customer_id").agg(count("*").alias("total_orders")).show()

# 5. Average age of customers per city
customers_df.groupBy("city").agg(avg("age").alias("avg_age")).show()

# 6. Total revenue generated from each product
orders_df.join(products_df, "product_id") \
    .withColumn("revenue", col("quantity") * col("price")) \
    .groupBy("product_name").agg(sum("revenue").alias("total_revenue")).show()


+-----------+------------+
|customer_id|total_orders|
+-----------+------------+
|          1|           2|
|          2|           1|
|          7|           1|
|          6|           1|
|          5|           1|
|          3|           1|
+-----------+------------+

+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   28.0|
|    Delhi|   30.5|
|Hyderabad|   25.0|
|  Chennai|   35.0|
|   Mumbai|   30.0|
+---------+-------+

+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|       Chair|         5000|
|        Book|         3500|
|      Laptop|       110000|
|       Shoes|         5000|
|      Mobile|        50000|
|  Headphones|         9000|
+------------+-------------+



In [4]:
# 7. Customers with their orders
customers_df.join(orders_df, "customer_id", "inner").show()

# 8. Orders with product name and price
orders_df.join(products_df, "product_id", "inner").show()

# 9. Customers who never placed an order
customers_df.join(orders_df, "customer_id", "left_anti").show()

# 10. Products never ordered
products_df.join(orders_df, "product_id", "left_anti").show()


+-----------+------------+---------+---+--------+----------+--------+
|customer_id|        name|     city|age|order_id|product_id|quantity|
+-----------+------------+---------+---+--------+----------+--------+
|          1|Rahul Sharma|Bangalore| 28|    1001|       101|       1|
|          1|Rahul Sharma|Bangalore| 28|    1003|       103|       3|
|          2| Priya Singh|    Delhi| 32|    1002|       102|       2|
|          3|  Aman Kumar|Hyderabad| 25|    1004|       104|       1|
|          5| Arjun Mehta|   Mumbai| 30|    1005|       105|       5|
|          6|  Divya Nair|    Delhi| 29|    1006|       106|       2|
+-----------+------------+---------+---+--------+----------+--------+

+----------+--------+-----------+--------+------------+-----------+-----+
|product_id|order_id|customer_id|quantity|product_name|   category|price|
+----------+--------+-----------+--------+------------+-----------+-----+
|       101|    1001|          1|       1|      Laptop|Electronics|55000|
|  

In [5]:
# 11. Top 3 most expensive products purchased
orders_df.join(products_df, "product_id") \
    .orderBy(desc("price")).limit(3).show()

# 12. Total revenue per category
orders_df.join(products_df, "product_id") \
    .withColumn("revenue", col("quantity") * col("price")) \
    .groupBy("category").agg(sum("revenue").alias("total_revenue")).show()

# 13. Customers sorted by total money spent
orders_df.join(products_df, "product_id") \
    .withColumn("spent", col("quantity") * col("price")) \
    .groupBy("customer_id").agg(sum("spent").alias("total_spent")) \
    .orderBy(desc("total_spent")).show()


+----------+--------+-----------+--------+------------+-----------+-----+
|product_id|order_id|customer_id|quantity|product_name|   category|price|
+----------+--------+-----------+--------+------------+-----------+-----+
|       101|    1001|          1|       1|      Laptop|Electronics|55000|
|       101|    1007|          7|       1|      Laptop|Electronics|55000|
|       102|    1002|          2|       2|      Mobile|Electronics|25000|
+----------+--------+-----------+--------+------------+-----------+-----+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
| Stationery|         3500|
|    Fashion|         5000|
|Electronics|       169000|
|  Furniture|         5000|
+-----------+-------------+

+-----------+-----------+
|customer_id|total_spent|
+-----------+-----------+
|          1|      64000|
|          7|      55000|
|          2|      50000|
|          6|       5000|
|          3|       5000|
|          5|       3500|
+-----------+---------

In [6]:
# 14. Register temp views
customers_df.createOrReplaceTempView("customers")
products_df.createOrReplaceTempView("products")
orders_df.createOrReplaceTempView("orders")

# 15. Top 2 cities by total revenue
spark.sql("""
SELECT c.city, SUM(o.quantity * p.price) AS total_revenue
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
JOIN products p ON o.product_id = p.product_id
GROUP BY c.city
ORDER BY total_revenue DESC
LIMIT 2
""").show()

# 16. Customers who spent more than 50,000
spark.sql("""
SELECT c.name, SUM(o.quantity * p.price) AS total_spent
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
JOIN products p ON o.product_id = p.product_id
GROUP BY c.name
HAVING total_spent > 50000
""").show()

# 17. Category contributing the most revenue
spark.sql("""
SELECT p.category, SUM(o.quantity * p.price) AS total_revenue
FROM orders o
JOIN products p ON o.product_id = p.product_id
GROUP BY p.category
ORDER BY total_revenue DESC
LIMIT 1
""").show()


+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        64000|
|    Delhi|        55000|
+---------+-------------+

+------------+-----------+
|        name|total_spent|
+------------+-----------+
|Rahul Sharma|      64000|
+------------+-----------+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|       169000|
+-----------+-------------+

