In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import col, sum, avg, dense_rank,try_to_timestamp,lit,coalesce,initcap,lower,trim
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [45]:
spark=SparkSession.builder.appName('Marketplace Orders').getOrCreate()

In [None]:
orders_data = [
    ("ORD001","C001","Delhi ","Electronics","Laptop","45000","2024-01-05","Completed"),
    ("ORD002","C002","Mumbai","Electronics","Mobile ","32000","05/01/2024","Completed"),
    ("ORD003","C003","Bangalore","Electronics","Tablet","30000","2024/01/06","Completed"),
    ("ORD004","C004","Delhi","Electronics","Laptop","","2024-01-07","Cancelled"),
    ("ORD005","C005","Chennai","Electronics","Mobile","invalid","2024-01-08","Completed"),
    ("ORD006","C006","Mumbai","Home","Mixer",None,"2024-01-08","Completed"),
    ("ORD007","C001","Delhi","Electronics","Laptop","47000","09-01-2024","Completed"),
    ("ORD008","C007","Bangalore","Home","Vacuum","28000","2024-01-09","Completed"),
    ("ORD009","C002","Mumbai","Electronics","Laptop","55000","2024-01-10","Completed"),
    ("ORD010","C008","Delhi","Home","AirPurifier","38000","2024-01-10","Completed"),
    ("ORD011","C009","Mumbai","Home","Vacuum","29000","2024-01-11","Completed"),
    ("ORD012","C010","Bangalore","Electronics","Mobile","33000","2024-01-11","Completed"),
    ("ORD013","C003","Bangalore","Home","Mixer","21000","2024-01-12","Completed"),
    ("ORD014","C004","Delhi","Electronics","Tablet","26000","2024-01-12","Completed"),
    ("ORD015","C005","Chennai","Electronics","Laptop","62000","2024-01-13","Completed"),
    ("ORD016","C006","Mumbai","Home","AirPurifier","40000","2024-01-13","Completed"),
    ("ORD017","C007","Bangalore","Electronics","Laptop","51000","2024-01-14","Completed"),
    ("ORD018","C008","Delhi","Home","Vacuum","31000","2024-01-14","Completed"),
    ("ORD019","C009","Mumbai","Electronics","Tablet","29000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed")
]


# PHASE 1 — SCHEMA & INGESTION

1. Define an explicit schema

In [46]:
schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("category", StringType(), True),
    StructField("product", StringType(), True),
    StructField("amount", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status", StringType(), True)
])

2. Create a DataFrame using the schema

In [47]:
df_raw=spark.createDataFrame(data=orders_data,schema=schema)

3. Print and verify schema

In [48]:
df_raw.printSchema()
df_raw.show(truncate=False)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id|customer_id|city     |category   |product    |amount |order_date|status   |
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD001  |C001       |Delhi    |Electronics|Laptop     |45000  |2024-01-05|Completed|
|ORD002  |C002       |Mumbai   |Electronics|Mobile     |32000  |05/01/2024|Completed|
|ORD003  |C003       |Bangalore|Electronics|Tablet     |30000  |2024/01/06|Completed|
|ORD004  |C004       |Delhi    |Electronics|Laptop     |       |2024-01-07|Cancelled|
|ORD005  |C005       |Chennai  |Electronics|Mobile     |invalid|2024-01-08|C

# PHASE 2 — DATA CLEANING

4. Trim all string columns

In [49]:
for c in df_raw.columns:
    df_raw=df_raw.withColumn(c,trim(df_raw[c]))


5. Standardize city , category , and product values

In [50]:
df_clean=df_raw\
.withColumn("city",initcap(lower(col("city"))))\
.withColumn("category",initcap(lower(col("category"))))\
.withColumn("product",initcap(lower(col("product"))))



6. Convert amount to IntegerType

In [51]:
df_clean=df_clean.withColumn(
    "amount",
    when(col("amount").rlike(r'^[0-9]+$'),col("amount").cast(IntegerType()))\
    .otherwise(None)
)

7. Handle invalid, empty, and null amount values

In [52]:
df_clean=df_clean.filter(col("amount").isNotNull())

8. Convert order_date into DateType (handle multiple formats)

In [54]:
df_clean=df_clean.withColumn(
    "order_date",
    coalesce(
        try_to_timestamp(col("order_date"), lit("yyyy-MM-dd")),
        try_to_timestamp(col("order_date"), lit("dd-MM-yyyy")),
        try_to_timestamp(col("order_date"), lit("MM-dd-yyyy")),
        try_to_timestamp(col("order_date"), lit("dd/MM/yyyy")),
        try_to_timestamp(col("order_date"), lit("MM/dd/yyyy")),
        try_to_timestamp(col("order_date"), lit("yyyy/MM/dd"))
    ).cast(DateType())
)

9. Remove duplicate order_id records

In [55]:
df_clean=df_clean.dropDuplicates(["order_id"])


10. Keep only Completed orders

In [56]:
df_clean=df_clean.filter(col("status")=="Completed")

# PHASE 3 — DATA VALIDATION

11. Count records before and after cleaning

In [57]:
print("Before: ", df_raw.count())
print("After: ", df_clean.count())

Before:  21
After:  17


12. Verify no nulls in order_id , amount , and order_date

In [None]:
df_clean.select([
    count(when(col(c).isNull(),"*")).alias(c) for c in df_clean.columns
])

13. Confirm correct data types

In [58]:
df_clean.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)



# PHASE 4 — ANALYTICS & AGGREGATIONS

14. Total revenue per city

In [59]:
city_revenue=df_clean.groupBy("city").agg(
    sum("amount").alias("total_revenue")
)
city_revenue.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       217000|
|  Chennai|        62000|
|   Mumbai|       185000|
|    Delhi|       187000|
+---------+-------------+



15. Total revenue per category

In [60]:
category_revenue=df_clean.groupBy("category").agg(
    sum("amount").alias("total_revenue")
)
category_revenue.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|       187000|
|Electronics|       464000|
+-----------+-------------+



16. Total revenue per product

In [61]:
product_revenue=df_clean.groupBy("product").agg(
    sum("amount").alias("total_revenue")
)
product_revenue.show()


+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Vacuum|        88000|
|Airpurifier|        78000|
|     Laptop|       314000|
|      Mixer|        21000|
|     Mobile|        65000|
|     Tablet|        85000|
+-----------+-------------+



17. Average order value per city

In [62]:
avg_order_city=df_clean.groupBy("city").agg(
    avg("amount").alias("avg_order_value")
)
avg_order_city.show()

+---------+------------------+
|     city|   avg_order_value|
+---------+------------------+
|Bangalore|36166.666666666664|
|  Chennai|           62000.0|
|   Mumbai|           37000.0|
|    Delhi|           37400.0|
+---------+------------------+



18. Identify top 3 products by revenue

In [63]:
top_products=product_revenue.orderBy(col("total_revenue").desc()).limit(3)
top_products.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       314000|
| Vacuum|        88000|
| Tablet|        85000|
+-------+-------------+



# PHASE 5 — WINDOW FUNCTIONS

19. Rank cities by total revenue

In [65]:
city_window=Window.orderBy(col("total_revenue").desc())
city_rank=city_revenue\
.withColumn("rank",dense_rank().over(city_window))
city_rank.show()

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|Bangalore|       217000|   1|
|    Delhi|       187000|   2|
|   Mumbai|       185000|   3|
|  Chennai|        62000|   4|
+---------+-------------+----+



20. Rank products within each category by revenue

In [66]:
cat_prod_window=Window.partitionBy("category").orderBy(col("total_revenue").desc())
product_rank=df_clean.groupBy("category","product").agg(
    sum("amount").alias("total_revenue"))\
    .withColumn("rank",dense_rank().over(cat_prod_window))
product_rank.show()

+-----------+-----------+-------------+----+
|   category|    product|total_revenue|rank|
+-----------+-----------+-------------+----+
|Electronics|     Laptop|       314000|   1|
|Electronics|     Tablet|        85000|   2|
|Electronics|     Mobile|        65000|   3|
|       Home|     Vacuum|        88000|   1|
|       Home|Airpurifier|        78000|   2|
|       Home|      Mixer|        21000|   3|
+-----------+-----------+-------------+----+



21. Identify the top product per category


In [67]:
top_product_category=product_rank.filter(col("rank")==1)
top_product_category.show()

+-----------+-------+-------------+----+
|   category|product|total_revenue|rank|
+-----------+-------+-------------+----+
|Electronics| Laptop|       314000|   1|
|       Home| Vacuum|        88000|   1|
+-----------+-------+-------------+----+



# PHASE 6 — PERFORMANCE AWARENESS

22. Cache the cleaned DataFrame

In [68]:
df_clean.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: int, order_date: date, status: string]

In [69]:
df_clean.count()

17

23. Run multiple aggregations and observe behavior

In [70]:
city_revenue.show()
category_revenue.show()
product_revenue.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|    Delhi|       187000|
|Bangalore|       217000|
|   Mumbai|       185000|
|  Chennai|        62000|
+---------+-------------+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|       187000|
|Electronics|       464000|
+-----------+-------------+

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Vacuum|        88000|
|     Laptop|       314000|
|     Tablet|        85000|
|     Mobile|        65000|
|Airpurifier|        78000|
|      Mixer|        21000|
+-----------+-------------+



24. Use explain(True) to inspect shuffle and execution plan

In [71]:
df_clean.explain(True)

== Parsed Logical Plan ==
'Filter '`=`('status, Completed)
+- Deduplicate [order_id#2627]
   +- Project [order_id#2627, customer_id#2628, city#2635, category#2636, product#2637, amount#2638, cast(coalesce(try_to_timestamp(order_date#2633, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS order_date#2639, status#2634]
      +- Filter isnotnull(amount#2638)
         +- Project [order_id#2627, customer_id#2628, city#2635, category#2636, product#2637, CASE WHEN RLIKE(amount#2632, ^[0-9]+$) THEN cast(amount#2632 as int) ELSE cast(n

25. Repartition data by city and explain why

In [72]:
df_partitioned=df_clean.repartition(col("city"))
df_partitioned.explain(True)

== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- Filter (status#2634 = Completed)
   +- Deduplicate [order_id#2627]
      +- Project [order_id#2627, customer_id#2628, city#2635, category#2636, product#2637, amount#2638, cast(coalesce(try_to_timestamp(order_date#2633, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#2633, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS order_date#2639, status#2634]
         +- Filter isnotnull(amount#2638)
            +- Project [order_id#2627, customer_id#2628, city#2635, category#2636, product#2637, CASE WHEN RLIKE(amount#2632, ^[0

# PHASE 7 — FILE FORMAT OUTPUT

26. Write cleaned order-level data to Parquet

In [73]:
df_clean.write.mode("overwrite").parquet("/data/clean_orders_parquet")

27. Write aggregated analytics to ORC

In [74]:
city_revenue.write.mode("overwrite").orc("/data/city_revenue_orc")

28. Read both back and validate schema

In [75]:
parquet_df=spark.read.parquet("/data/clean_orders_parquet")
orc_df=spark.read.orc("/data/city_revenue_orc")

parquet_df.printSchema()
orc_df.printSchema()


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)

root
 |-- city: string (nullable = true)
 |-- total_revenue: long (nullable = true)



# PHASE 8 — DEBUGGING CHECK

29. Explain why this line is incorrect:
df = df.filter(df.amount > 30000).show()

In [76]:
df = df.filter(df.amount > 30000).show()

NameError: name 'df' is not defined

.show() returns None
we are overwriting df with None therefore pipeline breaks

30. Write the corrected version

In [79]:
df = df_clean.filter(col("amount") > 30000)
df.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD018|       C008|    Delhi|       Home|     Vacuum| 31000|2024-01-14|Completed|
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD017|       C007|Bangalore|Electronics|     Laptop| 51000|2024-01-14|Completed|
|  ORD016|       C006|   Mumbai|       Home|Airpurifier| 40000|2024-01-13|Completed|
|  ORD015|       C005|  Chennai|Electronics|     Laptop| 62000|2024-01-13|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|2024-01-11|Completed|
|  ORD020|       C010|Bangalore|Electronics|     Laptop| 54000|2024-01-15|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|20