In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
spark=SparkSession.builder.appName("Day 11").getOrCreate()

In [3]:
df=spark.read.csv("/content/sample_data/ecommerce_orders.csv",
inferSchema=True,
header=True)
df.show()

+--------+-----------+----------+-----------+--------------+-------------------+------+--------+------------+---------+
|order_id|customer_id|product_id|   category|payment_method|         order_time| price|quantity|total_amount|   status|
+--------+-----------+----------+-----------+--------------+-------------------+------+--------+------------+---------+
| O898230|       C187|       P40|      Books|    Debit Card|2025-08-23 00:06:00|297.45|       1|      297.45|Completed|
| O483710|       C178|       P44|     Sports|           UPI|2025-08-24 03:01:00|478.34|       3|     1435.02|Completed|
| O556243|       C139|       P46|Electronics|           UPI|2025-08-21 05:18:00|187.59|       4|      750.36|Completed|
| O745017|        C42|       P30|       Home|   Credit Card|2025-08-23 01:32:00|478.53|       4|     1914.12|Completed|
| O986474|        C79|       P23|       NULL|    Debit Card|2025-08-22 09:39:00|224.06|       1|      224.06|Completed|
| O474871|        C99|       P19|   Clot

In [4]:
df=df.withColumn("order_time",to_timestamp(col("order_time"))).withColumn("date",to_date("order_time").alias("date"))

Daily Sales Metrics

Total orders per day\
Total and average revenue per day\
Number of unique customers per day\
Percentage of cancelled orders

In [5]:
daily_metrics=df.groupBy("date").agg(
    count("order_id").alias("total_orders"),
    round(sum("total_amount"),2).alias("total_revenue"),
    round(avg("total_amount"),2).alias("average_revenue_per_order"),
    countDistinct("customer_id").alias("unique_customers"),
    round(100*(sum(when(col("status")=="Cancelled",1).otherwise(0)))/count("order_id"),2).alias("cancelled_pct")
)
daily_metrics.show()

+----------+------------+-------------+-------------------------+----------------+-------------+
|      date|total_orders|total_revenue|average_revenue_per_order|unique_customers|cancelled_pct|
+----------+------------+-------------+-------------------------+----------------+-------------+
|2025-08-20|         300|    234330.63|                    781.1|             163|        10.33|
|2025-08-22|         300|    219296.25|                   730.99|             158|         9.67|
|2025-08-23|         300|     243782.0|                   812.61|             167|         7.33|
|2025-08-21|         300|    223531.08|                    745.1|             164|         9.33|
|2025-08-24|         300|    250588.91|                    835.3|             154|        12.67|
+----------+------------+-------------+-------------------------+----------------+-------------+



Top Products Analysis

Most sold products by quantity\
Highest revenue products per day\
Product category sales distribution

In [9]:
top_products=df.groupBy("date","product_id","category").agg(
                  sum(col("quantity")).alias("total_quantity"),
                  round(sum(col("total_amount")),2).alias("total_revenue")
              ).orderBy("date",desc("total_revenue"))
top_products.show()

+----------+----------+-----------+--------------+-------------+
|      date|product_id|   category|total_quantity|total_revenue|
+----------+----------+-----------+--------------+-------------+
|2025-08-20|       P40|       Home|            18|      6025.22|
|2025-08-20|       P27|Electronics|            13|       5198.5|
|2025-08-20|       P24|      Books|            19|      4240.95|
|2025-08-20|       P40|   Clothing|             9|       3932.8|
|2025-08-20|       P26|       Home|            13|      3856.45|
|2025-08-20|       P31|   Clothing|             9|      3680.52|
|2025-08-20|       P19|     Sports|            13|      3557.63|
|2025-08-20|       P41|Electronics|            11|      3398.16|
|2025-08-20|       P14|      Books|            13|       3365.8|
|2025-08-20|       P30|     Sports|            10|       3349.2|
|2025-08-20|        P9|       Home|             7|      3039.82|
|2025-08-20|       P23|Electronics|             7|      2929.17|
|2025-08-20|       P29|  

Customer Insights

Average order value per customer\
Total orders per customer\
Top paying customers

In [12]:
cust_insight=df.groupBy("customer_id").agg(
    round(avg(col("total_amount")),2).alias("Avg_order"),
    count(col("order_id")).alias("Total_orders"),
    round(sum(col("total_amount")),2).alias("Total_spent")
).orderBy(desc("Total_spent"))
cust_insight.show()

+-----------+---------+------------+-----------+
|customer_id|Avg_order|Total_orders|Total_spent|
+-----------+---------+------------+-----------+
|        C54|   981.77|          15|   14726.59|
|        C63|  1067.23|          12|   12806.81|
|        C33|  1147.86|          11|   12626.49|
|        C45|  1470.33|           8|   11762.67|
|       C108|  1047.57|          11|   11523.32|
|        C94|   677.15|          17|   11511.58|
|        C56|  1109.63|          10|   11096.33|
|       C189|   922.83|          11|   10151.08|
|       C155|   922.03|          11|   10142.31|
|       C160|   996.72|          10|    9967.19|
|       C196|   992.93|          10|    9929.29|
|        C43|  1069.02|           9|    9621.22|
|        C62|   845.19|          11|    9297.09|
|        C76|  1327.67|           7|    9293.72|
|        C44|  1020.65|           9|    9185.83|
|       C147|  1012.51|           9|    9112.59|
|       C175|   905.01|          10|    9050.09|
|        C75|   893.

Payment Method Analysis

Count of orders by payment method\
Revenue split by payment method

In [15]:
payment_df=df.groupBy("date","payment_method").agg(
    count("order_id").alias("order_count"),
    round(sum(col("total_amount")),2).alias("total_revenue")
).orderBy("date",desc("total_revenue"))
payment_df.show()

+----------+--------------+-----------+-------------+
|      date|payment_method|order_count|total_revenue|
+----------+--------------+-----------+-------------+
|2025-08-20|           UPI|         64|     56265.15|
|2025-08-20|   Credit Card|         68|     52360.11|
|2025-08-20|        PayPal|         62|     50144.55|
|2025-08-20|    Debit Card|         56|     38762.11|
|2025-08-20|          Cash|         46|      34073.1|
|2025-08-20|          NULL|          4|      2725.61|
|2025-08-21|           UPI|         66|     53704.38|
|2025-08-21|    Debit Card|         64|     51234.18|
|2025-08-21|   Credit Card|         55|     39282.82|
|2025-08-21|        PayPal|         53|     38300.42|
|2025-08-21|          Cash|         53|     33439.34|
|2025-08-21|          NULL|          9|      7569.94|
|2025-08-22|          Cash|         67|     52474.91|
|2025-08-22|           UPI|         63|     47395.29|
|2025-08-22|   Credit Card|         58|     43246.16|
|2025-08-22|        PayPal| 