In [2]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if one exists, otherwise create a new one.
spark=SparkSession.builder.getOrCreate()

In [3]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [1]:
# Raw sample orders: (order_id, city, product, amount, order_date, status).
# Every field is kept as a string (or None) on purpose; the rows contain
# padded text, mixed date formats, empty / non-numeric / missing amounts and
# one exact duplicate so that the cleaning steps have something to fix.
orders_data = [
    ("O001", "Delhi ", "Laptop", "45000", "2024-01-05", "Completed"),   # trailing space in city
    ("O002", "Mumbai", "Mobile ", "32000", "05/01/2024", "Completed"),  # trailing space in product, dd/MM/yyyy
    ("O003", "Bangalore", "Tablet", "30000", "2024/01/06", "Completed"),  # yyyy/MM/dd
    ("O004", "Delhi", "Laptop", "", "2024-01-07", "Cancelled"),         # empty amount
    ("O005", "Mumbai", "Mobile", "invalid", "2024-01-08", "Completed"),  # non-numeric amount
    ("O006", "Chennai", "Tablet", None, "2024-01-08", "Completed"),     # missing amount
    ("O007", "Delhi", "Laptop", "47000", "09-01-2024", "Completed"),    # dd-MM-yyyy
    ("O008", "Bangalore", "Mobile", "28000", "2024-01-09", "Completed"),
    ("O009", "Mumbai", "Laptop", "55000", "2024-01-10", "Completed"),
    ("O009", "Mumbai", "Laptop", "55000", "2024-01-10", "Completed"),   # exact duplicate of the previous row
]

- Define an explicit schema
- Create a DataFrame using the schema
- Print the schema and validate the data types

In [15]:
# Explicit schema for the raw orders. Every column is read as a string so the
# dirty values can be loaded as-is; order_id is the only non-nullable column.
schema = StructType(
    [StructField("order_id", StringType(), nullable=False)]
    + [
        StructField(column_name, StringType(), nullable=True)
        for column_name in ("city", "product", "amount", "order_date", "status")
    ]
)
orders_df_raw = spark.createDataFrame(orders_data, schema=schema)

In [21]:
# Print the rows currently held in orders_df_raw.
# NOTE: orders_df_raw is reassigned by the cleaning cells further down, so
# what is printed depends on which of those cells have already been run.
orders_df_raw.show()

+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O001|    Delhi| Laptop| 45000|2024-01-05|Completed|
|    O002|   Mumbai| Mobile| 32000|2024-01-05|Completed|
|    O003|Bangalore| Tablet| 30000|2024-01-06|Completed|
|    O004|    Delhi| Laptop|  NULL|2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|  NULL|2024-01-08|Completed|
|    O006|  Chennai| Tablet|  NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop| 47000|2024-01-09|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
+--------+---------+-------+------+----------+---------+



In [19]:
# Print column names, data types and nullability of orders_df_raw.
# NOTE: the types shown reflect whichever cleaning cells (amount cast,
# order_date parsing) have already reassigned orders_df_raw.
orders_df_raw.printSchema()

root
 |-- order_id: string (nullable = false)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)



# Convert amount to IntegerType

In [16]:
# Cast amount to int only when the whole string is made of digits.
# Empty, non-numeric and NULL values do not satisfy the predicate and, with no
# otherwise() branch, the when() expression yields NULL for them.
amount_is_numeric = F.col("amount").rlike("^[0-9]+$")
amount_as_int = F.when(amount_is_numeric, F.col("amount").cast("int"))
orders_df_raw = orders_df_raw.withColumn("amount", amount_as_int)

In [17]:
# The fallback chain below relies on to_date returning NULL when a format does
# not match, so that coalesce can move on to the next candidate. With ANSI SQL
# mode enabled, to_date raises on unparseable input instead, so ANSI mode has
# to be switched off *before* the expressions are built. Setting it here makes
# this cell work on a fresh kernel regardless of where other config cells sit.
spark.conf.set("spark.sql.ansi.enabled", "false")

# Normalise order_date to DateType by trying each known input format in turn;
# the first format that parses wins, and rows matching none end up NULL.
orders_df_raw = orders_df_raw.withColumn(
    "order_date",
    F.coalesce(
        F.to_date("order_date", "yyyy-MM-dd"),
        F.to_date("order_date", "dd-MM-yyyy"),
        F.to_date("order_date", "dd/MM/yyyy"),
        F.to_date("order_date", "yyyy/MM/dd")
    )
)

In [13]:
# Turn off ANSI SQL mode for this Spark session.
# NOTE(review): presumably needed so that to_date / cast return NULL on
# malformed input instead of raising — confirm, and make sure this setting is
# applied before those expressions are created when running top to bottom.
spark.conf.set("spark.sql.ansi.enabled", "false")

# Trim and title-case the text columns (city, product, status)

In [20]:
# Strip surrounding whitespace and title-case each free-text column, applying
# the same expression to city, product and status in that order.
for text_column in ("city", "product", "status"):
    orders_df_raw = orders_df_raw.withColumn(
        text_column,
        F.initcap(F.trim(F.col(text_column))),
    )

Handle null values: drop the rows whose amount is missing

In [22]:
# Keep only the rows that have a usable (non-NULL) amount.
orders_df_clean = orders_df_raw.dropna(subset=["amount"])

Remove duplicate orders


In [25]:
# Keep a single row per order_id.
# NOTE(review): when rows sharing an order_id differ in other columns, which
# one survives is not specified — fine here because the duplicate is exact.
orders_df_clean = orders_df_clean.drop_duplicates(subset=["order_id"])

In [26]:
# Inspect the cleaned orders (NULL amounts dropped, one row per order_id).
orders_df_clean.show()

+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O001|    Delhi| Laptop| 45000|2024-01-05|Completed|
|    O002|   Mumbai| Mobile| 32000|2024-01-05|Completed|
|    O003|Bangalore| Tablet| 30000|2024-01-06|Completed|
|    O007|    Delhi| Laptop| 47000|2024-01-09|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
+--------+---------+-------+------+----------+---------+



Keep only Completed orders

In [27]:
# Restrict the cleaned orders to those whose status is exactly "Completed".
is_completed = F.col("status") == "Completed"
completed_orders = orders_df_clean.where(is_completed)

In [29]:
# Inspect the completed orders that feed the aggregations below.
completed_orders.show()

+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O001|    Delhi| Laptop| 45000|2024-01-05|Completed|
|    O002|   Mumbai| Mobile| 32000|2024-01-05|Completed|
|    O003|Bangalore| Tablet| 30000|2024-01-06|Completed|
|    O007|    Delhi| Laptop| 47000|2024-01-09|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
+--------+---------+-------+------+----------+---------+



# Total revenue per city

In [30]:
# Sum the order amounts of completed orders within each city.
orders_by_city = completed_orders.groupBy("city")
revenue_per_city = orders_by_city.agg(
    F.sum("amount").alias("total_revenue")
)
revenue_per_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|   Mumbai|        87000|
|    Delhi|        92000|
+---------+-------------+



# Total revenue per product

In [31]:
# Sum the order amounts of completed orders within each product.
orders_by_product = completed_orders.groupBy("product")
revenue_per_product = orders_by_product.agg(
    F.sum("amount").alias("total_revenue")
)
revenue_per_product.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       147000|
| Mobile|        60000|
| Tablet|        30000|
+-------+-------------+



# Average order value per city

In [32]:
# Mean order amount of completed orders within each city.
mean_amount = F.avg("amount").alias("average_order")
average_order_per_city = completed_orders.groupBy("city").agg(mean_amount)
average_order_per_city.show()

+---------+-------------+
|     city|average_order|
+---------+-------------+
|Bangalore|      29000.0|
|   Mumbai|      43500.0|
|    Delhi|      46000.0|
+---------+-------------+



# Rank cities by total revenue

In [34]:
from pyspark.sql.window import Window

# Rank over the whole result (no partitionBy): revenue_per_city has one row per
# city, so a single global window ordered by revenue is what we want here.
city_rank_window = Window.orderBy(F.col("total_revenue").desc())

# Attach the rank and sort explicitly. Without the orderBy, the row order of
# the result (and therefore what first() and show() return) only happens to
# follow the window's internal sort and is not guaranteed. "city" is a
# tie-breaker so that cities with equal revenue come back in a stable order.
ranked_cities = (
    revenue_per_city
    .withColumn("city_rank", F.rank().over(city_rank_window))
    .orderBy("city_rank", "city")
)

# First row of the sorted result = the rank-1 city (None if there are no rows).
top_city = ranked_cities.first()
ranked_cities.show()

+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|    Delhi|        92000|        1|
|   Mumbai|        87000|        2|
|Bangalore|        58000|        3|
+---------+-------------+---------+



# Identify top-performing city

In [35]:
# top_city is the Row returned by ranked_cities.first(); read its city field.
# NOTE(review): first() returns None for an empty DataFrame, in which case this
# line would raise — confirm the input can never be empty.
print(top_city["city"])

Delhi


- Cache the cleaned DataFrame
- Run two aggregations and observe the behavior
- Use explain(True) to inspect the plan

In [36]:

# cache() only marks the DataFrame for caching; the count() action is what
# actually computes it and populates the cache.
completed_orders.cache().count()


6

In [38]:
# Same per-city revenue aggregation as before, now built on top of the cached
# completed_orders so its plan can be compared with explain().
total_revenue = F.sum("amount").alias("total_revenue")
revenue_per_city2 = completed_orders.groupBy("city").agg(total_revenue)


In [39]:
# Trigger the aggregation; row order is not guaranteed for a groupBy result.
revenue_per_city2.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|   Mumbai|        87000|
|    Delhi|        92000|
|Bangalore|        58000|
+---------+-------------+



In [40]:
# Print the extended plan (parsed, analyzed and optimized logical plans plus
# the physical plan) to check whether the cached data is being used.
revenue_per_city2.explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, 'sum('amount) AS total_revenue#1060]
+- Filter (status#119 = Completed)
   +- Deduplicate [order_id#90]
      +- Filter atleastnnonnulls(1, amount#96)
         +- Project [order_id#90, city#117, product#118, amount#96, order_date#97, initcap(trim(status#95, None)) AS status#119]
            +- Project [order_id#90, city#117, initcap(trim(product#92, None)) AS product#118, amount#96, order_date#97, status#95]
               +- Project [order_id#90, initcap(trim(city#91, None)) AS city#117, product#92, amount#96, order_date#97, status#95]
                  +- Project [order_id#90, city#91, product#92, amount#96, coalesce(to_date(order_date#94, Some(yyyy-MM-dd), Some(Etc/UTC), false), to_date(order_date#94, Some(dd-MM-yyyy), Some(Etc/UTC), false), to_date(order_date#94, Some(dd/MM/yyyy), Some(Etc/UTC), false), to_date(order_date#94, Some(yyyy/MM/dd), Some(Etc/UTC), false)) AS order_date#97, status#95]
                     +- Project [o

The full `explain(True)` output shows `InMemoryRelation` in the optimized logical plan and `InMemoryTableScan [city#117, amount#96]` in the physical plan, which means Spark read the data directly from the cached `completed_orders` DataFrame instead of recomputing all the previous transformations. (The parsed logical plan above still lists the complete lineage, because it only describes how the DataFrame was defined.)\
This saves time and resources: Spark scans only the needed columns from memory and runs the aggregation faster, avoiding expensive reprocessing of the original data.