In [19]:
! pip install pyspark



In [20]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing one is reused,
# otherwise a new one is created) under the application name below.
spark = (
    SparkSession.builder
    .appName("ECommerce Transactions")
    .getOrCreate()
)

In [21]:
#1. Calculate the Total Revenue per Category
# Spark's `sum` is imported under an alias so that it does not shadow
# Python's built-in `sum` in the notebook namespace.
from pyspark.sql.functions import col, sum as spark_sum

# Load the transactions CSV: the first row is the header and column types are inferred.
df_sales = spark.read.csv("/content/sample_data/ecommerce_data.csv", header=True, inferSchema=True)

# Per-row revenue: price * quantity, reduced by the percentage discount.
df_sales = df_sales.withColumn(
    "revenue",
    col("price") * col("quantity") * (1 - col("discount_percentage") / 100),
)

# Add up the revenue of all rows in each category.
total_revenue_per_category = (
    df_sales
    .groupBy("category")
    .agg(spark_sum("revenue").alias("total_revenue"))
)
total_revenue_per_category.show()

+--------------+-------------+
|      category|total_revenue|
+--------------+-------------+
|       Fashion|        168.0|
|   Electronics|       2950.0|
|         Books|         80.0|
|Home Appliance|        756.0|
+--------------+-------------+



In [22]:
#2. Filter Transactions with a Discount Greater Than 10%
# Keep only the rows whose discount percentage is strictly above 10.
df_high_discount = df_sales.where(
    col("discount_percentage") > 10
)
df_high_discount.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|revenue|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|  127.5|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|   48.0|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|  600.0|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+



In [23]:
#3. Find the Most Expensive Product Sold
# Sort by price, highest first, and take the top row.
most_expensive_product = df_sales.orderBy("price", ascending=False).first()
print(most_expensive_product)

Row(transaction_id=1, customer_id=101, product='Laptop', category='Electronics', price=1000, quantity=1, discount_percentage=10, transaction_date=datetime.date(2023, 8, 1), revenue=900.0)


In [24]:
#4. Calculate the Average Quantity of Products Sold per Category
from pyspark.sql.functions import avg

# Average the `quantity` column over the rows of each category.
average_quantity_per_category = (
    df_sales
    .groupBy("category")
    .agg(avg(col("quantity")).alias("average_quantity"))
)
average_quantity_per_category.show()

+--------------+----------------+
|      category|average_quantity|
+--------------+----------------+
|       Fashion|             2.0|
|   Electronics|            1.75|
|         Books|             4.0|
|Home Appliance|             1.0|
+--------------+----------------+



In [25]:
#5. Identify Customers Who Purchased More Than One Product
from pyspark.sql.functions import col

# Keep the transactions in which more than one unit was bought.
# NOTE(review): this filters rows on `quantity`; it does not count distinct
# products per customer -- confirm this is the intended reading of the heading.
df_multiple_products = df_sales.where(
    col("quantity") > 1
)
df_multiple_products.show()

+--------------+-----------+----------+-----------+-----+--------+-------------------+----------------+-------+
|transaction_id|customer_id|   product|   category|price|quantity|discount_percentage|transaction_date|revenue|
+--------------+-----------+----------+-----------+-----+--------+-------------------+----------------+-------+
|             2|        102|Smartphone|Electronics|  700|       2|                  5|      2023-08-01| 1330.0|
|             3|        103|     Shirt|    Fashion|   40|       3|                  0|      2023-08-02|  120.0|
|             5|        101|Headphones|Electronics|  100|       2|                 10|      2023-08-03|  180.0|
|             8|        107|      Book|      Books|   20|       4|                  0|      2023-08-05|   80.0|
|            10|        102|    Tablet|Electronics|  300|       2|                 10|      2023-08-06|  540.0|
+--------------+-----------+----------+-----------+-----+--------+-------------------+----------------+-------+

In [26]:
#6. Find the Top 3 Highest Revenue Transactions
# Discounted revenue of each transaction: price * quantity * (1 - discount_percentage / 100).
df_sales = df_sales.withColumn(
    "transaction_revenue",
    col("price") * col("quantity") * (1 - col("discount_percentage") / 100),
)

# Order by that revenue, largest first, and keep the first three rows.
top_3_revenue_transactions = (
    df_sales
    .sort(col("transaction_revenue").desc())
    .limit(3)
)
top_3_revenue_transactions.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+-------------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|revenue|transaction_revenue|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+-------------------+
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01| 1330.0|             1330.0|
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|  900.0|              900.0|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|  600.0|              600.0|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+-------------------+



In [27]:
#7. Calculate the Total Number of Transactions per Day
from pyspark.sql.functions import count

# Count the `transaction_id` values recorded for each transaction date.
total_transactions_per_day = (
    df_sales
    .groupBy("transaction_date")
    .agg(count(col("transaction_id")).alias("total_transactions"))
)
total_transactions_per_day.show()

+----------------+------------------+
|transaction_date|total_transactions|
+----------------+------------------+
|      2023-08-03|                 2|
|      2023-08-06|                 2|
|      2023-08-01|                 2|
|      2023-08-05|                 2|
|      2023-08-04|                 1|
|      2023-08-02|                 1|
+----------------+------------------+



In [28]:
#8. Find the Customer Who Spent the Most Money
# NOTE(review): this import rebinds the name `sum` to Spark's aggregate function,
# hiding Python's built-in `sum` for every cell that runs afterwards.
from pyspark.sql.functions import sum

# Add up the `revenue` column for each customer.
total_spent_per_customer = (
    df_sales
    .groupBy("customer_id")
    .agg(sum("revenue").alias("total_spent"))
)

# Sort by total spend, largest first, and take the top row.
highest_spender = total_spent_per_customer.orderBy("total_spent", ascending=False).first()
print(highest_spender)

Row(customer_id=102, total_spent=1870.0)


In [29]:
#9. Calculate the Average Discount Given per Product Category
# Average the `discount_percentage` column over the rows of each category.
average_discount_per_category = (
    df_sales
    .groupBy("category")
    .agg(avg(col("discount_percentage")).alias("average_discount"))
)
average_discount_per_category.show()

+--------------+----------------+
|      category|average_discount|
+--------------+----------------+
|       Fashion|            10.0|
|   Electronics|            8.75|
|         Books|             0.0|
|Home Appliance|            15.0|
+--------------+----------------+



In [30]:
#10. Create a New Column for Final Price After Discount
# Price after applying the percentage discount; `quantity` is not part of this value.
df_sales = df_sales.withColumn(
    "final_price",
    col("price") * (1 - col("discount_percentage") / 100),
)
df_sales.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+-------------------+-----------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|revenue|transaction_revenue|final_price|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-------+-------------------+-----------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|  900.0|              900.0|      900.0|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01| 1330.0|             1330.0|      665.0|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|      2023-08-02|  120.0|              120.0|       40.0|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-0