In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Disable ANSI mode up front, before any transformation is defined: the
# order_date cleaning further down tries several to_date() patterns in turn
# and relies on a non-matching pattern yielding NULL instead of raising.
spark.conf.set("spark.sql.ansi.enabled", "false")

In [2]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [3]:
# Sample order records, one tuple per order in the column order
# (order_id, customer_id, city, category, product, amount, order_date, status).
# Every value is a string (or None). The rows contain data-quality problems,
# presumably on purpose to exercise the cleaning steps: trailing whitespace
# ("Delhi ", "Mobile "), empty / non-numeric / None amounts, four different
# date formats, and a repeated ORD020 row.
orders_data = [
    ("ORD001","C001","Delhi ","Electronics","Laptop","45000","2024-01-05","Completed"),
    ("ORD002","C002","Mumbai","Electronics","Mobile ","32000","05/01/2024","Completed"),
    ("ORD003","C003","Bangalore","Electronics","Tablet","30000","2024/01/06","Completed"),
    ("ORD004","C004","Delhi","Electronics","Laptop","","2024-01-07","Cancelled"),
    ("ORD005","C005","Chennai","Electronics","Mobile","invalid","2024-01-08","Completed"),
    ("ORD006","C006","Mumbai","Home","Mixer",None,"2024-01-08","Completed"),
    ("ORD007","C001","Delhi","Electronics","Laptop","47000","09-01-2024","Completed"),
    ("ORD008","C007","Bangalore","Home","Vacuum","28000","2024-01-09","Completed"),
    ("ORD009","C002","Mumbai","Electronics","Laptop","55000","2024-01-10","Completed"),
    ("ORD010","C008","Delhi","Home","AirPurifier","38000","2024-01-10","Completed"),
    ("ORD011","C009","Mumbai","Home","Vacuum","29000","2024-01-11","Completed"),
    ("ORD012","C010","Bangalore","Electronics","Mobile","33000","2024-01-11","Completed"),
    ("ORD013","C003","Bangalore","Home","Mixer","21000","2024-01-12","Completed"),
    ("ORD014","C004","Delhi","Electronics","Tablet","26000","2024-01-12","Completed"),
    ("ORD015","C005","Chennai","Electronics","Laptop","62000","2024-01-13","Completed"),
    ("ORD016","C006","Mumbai","Home","AirPurifier","40000","2024-01-13","Completed"),
    ("ORD017","C007","Bangalore","Electronics","Laptop","51000","2024-01-14","Completed"),
    ("ORD018","C008","Delhi","Home","Vacuum","31000","2024-01-14","Completed"),
    ("ORD019","C009","Mumbai","Electronics","Tablet","29000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed")
]

In [18]:


# Every column is ingested as a nullable string; amount and order_date are
# converted to proper types later, during cleaning.
raw_column_names = [
    "order_id",
    "customer_id",
    "city",
    "category",
    "product",
    "amount",
    "order_date",
    "status",
]
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in raw_column_names]
)
orders_df_raw = spark.createDataFrame(orders_data, schema)

In [19]:
# Preview the raw rows, then print the schema of the freshly created DataFrame.
orders_df_raw.show()
orders_df_raw.printSchema()

+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|  ORD001|       C001|   Delhi |Electronics|     Laptop|  45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|    Mobile |  32000|05/01/2024|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet|  30000|2024/01/06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|       |2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|invalid|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|   NULL|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop|  47000|09-01-2024|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum|  28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Lapto

In [33]:
# Number of rows in orders_df_raw. The cleaning cells only rewrite columns on
# this frame; rows are dropped from orders_df, not from orders_df_raw.
orders_df_raw.count()

21

In [20]:
# Normalise the free-text columns: strip surrounding whitespace, then
# capitalise the first letter of each word. `category` was previously the only
# column not trimmed; it now gets the same treatment as the other three so a
# stray space cannot split one category into two groups.
# Note: initcap() lower-cases the rest of each word, so "AirPurifier" becomes
# "Airpurifier".
orders_df_raw = (
    orders_df_raw
    .withColumn("city", F.initcap(F.trim(F.col("city"))))
    .withColumn("product", F.initcap(F.trim(F.col("product"))))
    .withColumn("category", F.initcap(F.trim(F.col("category"))))
    .withColumn("status", F.initcap(F.trim(F.col("status"))))
)

In [21]:
# Keep an amount only when it consists entirely of digits; anything else
# (empty string, "invalid", NULL) becomes NULL. Valid values are cast to int.
raw_amount = F.col("amount")
is_digits_only = raw_amount.rlike("^[0-9]+$")
clean_amount = F.when(is_digits_only, raw_amount.cast("int")).otherwise(None)
orders_df_raw = orders_df_raw.withColumn("amount", clean_amount)

In [22]:
# Discard orders whose amount could not be parsed (NULL after the cast above).
orders_df = orders_df_raw.na.drop(subset=["amount"])

In [23]:
# Parse order_date by trying each known pattern in order and keeping the first
# one that yields a date. This relies on to_date() returning NULL (not raising)
# for a non-matching pattern, i.e. on spark.sql.ansi.enabled being false.
order_date_patterns = ["yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy/MM/dd"]
parsed_candidates = [
    F.to_date("order_date", date_pattern) for date_pattern in order_date_patterns
]
orders_df = orders_df.withColumn("order_date", F.coalesce(*parsed_candidates))

In [17]:
# Turn off ANSI SQL mode for this session.
# NOTE(review): the execution count (In [17]) shows this ran before the
# date-parsing cell (In [23]), although it is positioned after it here. The
# multi-pattern to_date() fallback appears to depend on this setting, so it
# should be applied before that cell on a fresh run -- confirm.
spark.conf.set("spark.sql.ansi.enabled", "false")


In [25]:
# Keep a single row per order_id (removes the repeated ORD020 record).
orders_df = orders_df.dropDuplicates(["order_id"])

In [31]:
# Only completed orders count towards the revenue analytics.
is_completed = F.col("status") == "Completed"
orders_df = orders_df.where(is_completed)

In [32]:
# Row count of orders_df after the null-amount drop, de-duplication and status filter.
orders_df.count()

17

In [34]:
# Inspect the cleaned rows and print the resulting schema.
orders_df.show()
orders_df.printSchema()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|2024-01-09|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|2024-01-10|Completed|
|  ORD010|       C008|    Delhi|       Home|Airpurifier| 38000|2024-01-10|Completed|
|  ORD011|       C009|   Mumbai|       Home|     Vacuum| 29000|2024-01-11|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|20

# Total revenue per city

In [35]:
# Sum the order amounts for each city.
orders_by_city = orders_df.groupBy("city")
revenue_per_city = orders_by_city.agg(F.sum("amount").alias("total_revenue"))
revenue_per_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       217000|
|  Chennai|        62000|
|   Mumbai|       185000|
|    Delhi|       187000|
+---------+-------------+



# Total revenue per category

In [36]:
# Sum the order amounts for each product category.
category_revenue = F.sum("amount").alias("total_revenue")
revenue_per_category = orders_df.groupBy("category").agg(category_revenue)
revenue_per_category.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|       187000|
|Electronics|       464000|
+-----------+-------------+



# Total revenue per product

In [37]:
# Sum the order amounts for each product.
orders_by_product = orders_df.groupBy("product")
revenue_per_product = orders_by_product.agg(
    F.sum("amount").alias("total_revenue")
)
revenue_per_product.show()

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Vacuum|        88000|
|Airpurifier|        78000|
|     Laptop|       314000|
|      Mixer|        21000|
|     Mobile|        65000|
|     Tablet|        85000|
+-----------+-------------+



# Average order value per city

In [38]:
# Mean order amount for each city.
average_order = F.avg("amount").alias("average_order")
avg_order_per_city = orders_df.groupBy("city").agg(average_order)
avg_order_per_city.show()

+---------+------------------+
|     city|     average_order|
+---------+------------------+
|Bangalore|36166.666666666664|
|  Chennai|           62000.0|
|   Mumbai|           37000.0|
|    Delhi|           37400.0|
+---------+------------------+



In [41]:
# Three products with the highest total revenue.
products_by_revenue = revenue_per_product.sort(F.desc("total_revenue"))
top_3_products = products_by_revenue.limit(3)
top_3_products.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       314000|
| Vacuum|        88000|
| Tablet|        85000|
+-------+-------------+



# Rank cities by total revenue

In [42]:
from pyspark.sql.window import Window
# Rank all cities against each other, highest revenue first. The window has no
# partitionBy, so the ranking spans the whole (already aggregated) DataFrame.
city_rank_window = Window.orderBy(F.desc("total_revenue"))
city_rank = F.rank().over(city_rank_window)

ranked_cities = revenue_per_city.withColumn("city_rank", city_rank)
ranked_cities.show()

+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|Bangalore|       217000|        1|
|    Delhi|       187000|        2|
|   Mumbai|       185000|        3|
|  Chennai|        62000|        4|
+---------+-------------+---------+



# Rank products within each category by revenue

In [46]:
from pyspark.sql import functions as F

# Revenue for every (category, product) pair; input to the per-category ranking.
orders_by_category_product = orders_df.groupBy("category", "product")
revenue_per_product_cat = orders_by_category_product.agg(
    F.sum("amount").alias("total_revenue")
)
revenue_per_product_cat.show()

+-----------+-----------+-------------+
|   category|    product|total_revenue|
+-----------+-----------+-------------+
|       Home|     Vacuum|        88000|
|Electronics|     Laptop|       314000|
|Electronics|     Tablet|        85000|
|Electronics|     Mobile|        65000|
|       Home|Airpurifier|        78000|
|       Home|      Mixer|        21000|
+-----------+-----------+-------------+



In [48]:
# Rank products inside each category, highest revenue first.
product_rank_window = (
    Window
    .partitionBy("category")
    .orderBy(F.desc("total_revenue"))
)
product_rank = F.rank().over(product_rank_window)
ranked_products = revenue_per_product_cat.withColumn("product_rank", product_rank)

ranked_products.show()

+-----------+-----------+-------------+------------+
|   category|    product|total_revenue|product_rank|
+-----------+-----------+-------------+------------+
|Electronics|     Laptop|       314000|           1|
|Electronics|     Tablet|        85000|           2|
|Electronics|     Mobile|        65000|           3|
|       Home|     Vacuum|        88000|           1|
|       Home|Airpurifier|        78000|           2|
|       Home|      Mixer|        21000|           3|
+-----------+-----------+-------------+------------+



In [50]:
# Keep only the rank-1 product of every category.
is_top_ranked = F.col("product_rank") == 1
top_product_per_category = ranked_products.where(is_top_ranked)

top_product_per_category.show()

+-----------+-------+-------------+------------+
|   category|product|total_revenue|product_rank|
+-----------+-------+-------------+------------+
|Electronics| Laptop|       314000|           1|
|       Home| Vacuum|        88000|           1|
+-----------+-------+-------------+------------+



22. Cache the cleaned DataFrame
23. Run multiple aggregations and observe behavior
24. Use explain(True) to inspect shuffle and execution plan
25. Repartition data by city and explain why

In [51]:
# Mark the cleaned DataFrame for caching. cache() is lazy: the data is only
# materialised when the next action runs on orders_df.
orders_df.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: int, order_date: date, status: string]

In [57]:
# Re-run the per-category revenue aggregation on orders_df after cache() was
# called on it, so the effect of caching can be observed.
revenue_per_category2 = (
    orders_df
    .groupBy("category")
    .agg(F.sum("amount").alias("total_revenue"))
)
# Fixed typo: the original `.show9=()` assigned an empty tuple to an attribute
# named show9 instead of calling show(), so the aggregation never ran.
revenue_per_category2.show()

In [58]:
# Print the extended plan (parsed, analyzed, optimized logical plans and the physical plan).
revenue_per_category2.explain(True)

== Parsed Logical Plan ==
'Aggregate ['category], ['category, 'sum('amount) AS total_revenue#2309]
+- Filter (status#229 = Completed)
   +- Deduplicate [order_id#193]
      +- Project [order_id#193, customer_id#194, city#226, category#228, product#227, amount#230, coalesce(to_date(order_date#199, Some(yyyy-MM-dd), Some(Etc/UTC), false), to_date(order_date#199, Some(dd-MM-yyyy), Some(Etc/UTC), false), to_date(order_date#199, Some(dd/MM/yyyy), Some(Etc/UTC), false), to_date(order_date#199, Some(yyyy/MM/dd), Some(Etc/UTC), false)) AS order_date#231, status#229]
         +- Filter atleastnnonnulls(1, amount#230)
            +- Project [order_id#193, customer_id#194, city#226, category#228, product#227, CASE WHEN RLIKE(amount#198, ^[0-9]+$) THEN cast(amount#198 as int) ELSE cast(null as int) END AS amount#230, order_date#199, status#229]
               +- Project [order_id#193, customer_id#194, city#226, category#228, product#227, amount#198, order_date#199, initcap(trim(status#200, None)) 

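After caching, the plan reads from an `InMemoryTableScan` over the cached `orders_df`: only the required columns are scanned, and the upstream cleaning steps are not recomputed.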

In [54]:
# Second aggregation over orders_df after cache() was called on it.
revenue_per_product2 = (
    orders_df
    .groupBy("product")
    .agg(F.sum("amount").alias("total_revenue"))
)
# Transformations are lazy: without an action this aggregation is never
# executed, so there would be nothing to observe. show() triggers it.
revenue_per_product2.show()

In [56]:

# Redistribute the rows so that all orders of a given city share a partition.
partition_key = F.col("city")
df_repartitioned = orders_df.repartition(partition_key)
df_repartitioned.show()


+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD018|       C008|    Delhi|       Home|     Vacuum| 31000|2024-01-14|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD014|       C004|    Delhi|Electronics|     Tablet| 26000|2024-01-12|Completed|
|  ORD017|       C007|Bangalore|Electronics|     Laptop| 51000|2024-01-14|Completed|
|  ORD016|       C006|   Mumbai|       Home|Airpurifier| 40000|2024-01-13|Completed|
|  ORD013|       C003|Bangalore|       Home|      Mixer| 21000|20

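Repartitioning by `city` places all rows for a given city in the same partition, so later aggregations or joins keyed on `city` (for example `groupBy("city")`) can avoid an extra shuffle. Note that `repartition` itself performs a full shuffle, so it only pays off when the repartitioned DataFrame is reused for several city-keyed operations.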

26. Write cleaned order-level data to Parquet
27. Write aggregated analytics to ORC
28. Read both back and validate schema

In [59]:
# Persist the cleaned order-level data as Parquet, replacing any previous output.
orders_df.write.parquet("data/orders_df", mode="overwrite")

In [60]:
# Persist the per-category revenue aggregate as ORC, replacing any previous output.
revenue_per_category.write.orc("aggregated_revenue_cat_orc", mode="overwrite")

In [63]:
# Read the Parquet output back and print its schema.
df_parquet = spark.read.parquet("data/orders_df")
df_parquet.printSchema()

# Validate instead of eyeballing: column names and data types must match the
# DataFrame that was written (nullability is not compared).
expected_parquet_columns = [(field.name, field.dataType) for field in orders_df.schema.fields]
actual_parquet_columns = [(field.name, field.dataType) for field in df_parquet.schema.fields]
assert actual_parquet_columns == expected_parquet_columns, (
    f"Parquet schema mismatch: expected {expected_parquet_columns}, got {actual_parquet_columns}"
)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)



In [61]:
# Read the ORC output back and print its schema.
df_orc = spark.read.orc("aggregated_revenue_cat_orc")
df_orc.printSchema()

# Validate instead of eyeballing: column names and data types must match the
# aggregate that was written (nullability is not compared).
expected_orc_columns = [(field.name, field.dataType) for field in revenue_per_category.schema.fields]
actual_orc_columns = [(field.name, field.dataType) for field in df_orc.schema.fields]
assert actual_orc_columns == expected_orc_columns, (
    f"ORC schema mismatch: expected {expected_orc_columns}, got {actual_orc_columns}"
)

root
 |-- category: string (nullable = true)
 |-- total_revenue: long (nullable = true)



Debugging

In [None]:
# Intentionally buggy (debugging exercise): show() prints the rows and returns
# None, so `df` is bound to None here rather than to the filtered DataFrame.
df = orders_df.filter(orders_df.amount > 30000).show()

In [66]:
# Raises AttributeError: `df` is None because the previous cell assigned the
# return value of show() to it.
df.show()

AttributeError: 'NoneType' object has no attribute 'show'

`.show()` prints the rows and returns `None`, not a DataFrame, so its result cannot be assigned to a variable and used as a DataFrame afterwards.

Corrected version

In [68]:
# Bind the filtered DataFrame to `df` first, then display it as a separate step.
is_high_value = F.col("amount") > 30000
df = orders_df.where(is_high_value)
df.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD018|       C008|    Delhi|       Home|     Vacuum| 31000|2024-01-14|Completed|
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD017|       C007|Bangalore|Electronics|     Laptop| 51000|2024-01-14|Completed|
|  ORD016|       C006|   Mumbai|       Home|Airpurifier| 40000|2024-01-13|Completed|
|  ORD015|       C005|  Chennai|Electronics|     Laptop| 62000|2024-01-13|Completed|
|  ORD012|       C010|Bangalore|Electronics|     Mobile| 33000|2024-01-11|Completed|
|  ORD020|       C010|Bangalore|Electronics|     Laptop| 54000|2024-01-15|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|20