In [4]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()

In [5]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [6]:
orders_csv=spark.read \
        .option("header",True)\
        .option("inferSchema",False) \
        .csv("orders.csv")
orders_csv.show()

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|      Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|      Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|       Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|

In [8]:
orders_csv.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



In [9]:
orders_csv.count()

300000

Explain why all columns must be treated as StringType initially.

place-holder

#PHASE 2 – Data Cleaning

Remove leading and trailing spaces from:
city
category
product \
Standardize text:
Convert city, category, and product to proper case.

In [10]:
orders_csv = orders_csv.withColumn(
    "city",
    F.initcap(F.trim(F.col("city")))
)
orders_csv = orders_csv.withColumn(
    "product",
    F.initcap(F.trim(F.col("product")))
)
orders_csv = orders_csv.withColumn(
    "category",
     F.initcap(F.trim(F.col("category")))
)

In [27]:
orders_csv.show(30)

+-----------+-----------+---------+-----------+-----------+------+----------+---------+----------------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|order_date_clean|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+----------------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|  NULL|01/01/2024|Cancelled|      2024-01-01|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|      2024-01-02|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|      2024-01-03|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|2024-01-04|Completed|      2024-01-04|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|2024-01-05|Completed|      2024-01-05|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|2024-01-06|Completed|      2024-01-06|
|ORD00000006|    C000006|    Delhi|    Grocery|      Su

3. Clean the amount column: \
Remove commas. \
Replace empty strings and invalid values with null. \
Convert amount into IntegerType. \
Rows with invalid amounts must not crash the pipeline.

In [16]:
orders_csv=orders_csv.withColumn("amount",F.regexp_replace(F.col("amount"),",",""))

In [18]:
orders_csv = orders_csv.withColumn(
    "amount",
    F.when(F.col("amount").rlike("^[0-9]+$"), F.col("amount").cast("int"))
     .otherwise(None)
)

In [22]:
orders_csv.filter(F.col("amount").isNotNull()).count()

274836

9374-"" , 15790: "invalid" ,274836: int values in amount

In [24]:
orders_csv.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



4. Clean the order_date column:\
Support the following formats:\
yyyy-MM-dd\
dd/MM/yyyy\
yyyy/MM/dd\
Create a new column:
order_date_clean \
with DateType.

In [26]:
orders_csv = orders_csv.withColumn(
    "order_date_clean",
    F.coalesce(
        F.to_date("order_date", "yyyy-MM-dd"),
        F.to_date("order_date", "dd/MM/yyyy"),
        F.to_date("order_date", "yyyy/MM/dd")
    )
)

In [25]:
spark.conf.set("spark.sql.ansi.enabled", "false")

In [28]:
orders_csv.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- order_date_clean: date (nullable = true)



PHASE 3 – Data Validation
1. Count how many records had invalid amounts.
2. Count how many records had invalid dates.
3. Identify duplicate order_id values.
4. Remove duplicates using order_id.
5. Filter only records with:

In [34]:
invalid_amount_count=orders_csv.filter(F.col("amount").isNull()).count()
invalid_amount_count

25164

In [38]:
invalid_date_count=orders_csv.filter(F.col("order_date").isNull()).count()
invalid_date_count

0

In [40]:

duplicate_orders = (
    orders_csv.groupBy("order_id")
    .agg(F.count("*").alias("count"))
    .filter(F.col("count") > 1)
)

duplicate_orders.show()


+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



zero duplicate order-id

Filter only records with:

status = "Completed"

In [44]:
orders_completed=orders_csv.filter(F.col("status")=="Completed")
orders_completed.count()

285000

In [47]:
orders_completed_with_amount=orders_completed.dropna(subset=["amount"])

In [48]:
orders_completed_with_amount.count()

261095

 # PHASE 4 :Performance Engineering
1. Check the number of partitions.
2. Run a groupBy on city and calculate total revenue.
3. Use: explain(True) to analyze execution.
4. Identify where shuffle happens.
5. Repartition the dataset by city.
6. Compare execution plans before and after repartition.

In [59]:
orders_completed_with_amount.rdd.getNumPartitions()

2

In [61]:
revenue_by_city= (orders_completed_with_amount.groupBy("city").agg(F.sum("amount").alias("Total_Revenue")).orderBy(F.col("Total_Revenue").desc()))

In [62]:
revenue_by_city.show()

+---------+-------------+
|     city|Total_Revenue|
+---------+-------------+
|     Pune|   1646196535|
|Hyderabad|   1642443340|
|    Delhi|   1639639916|
|  Chennai|   1629865247|
|Bangalore|   1628527093|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
+---------+-------------+



In [None]:
revenue_by_city.explain(True)

  +- Exchange hashpartitioning(city#138, 200), ENSURE_REQUIREMENTS : shuffle for city before aggregation \
 +- Exchange rangepartitioning(Total_Revenue#690L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=1222] : shiffle for total_revenue before sorting


In [64]:
repartition_city=orders_completed_with_amount.repartition("city")

In [66]:
revenue_by_city2= (repartition_city.groupBy("city").agg(F.sum("amount").alias("Total_Revenue")).orderBy(F.col("Total_Revenue").desc()))

In [67]:
revenue_by_city2.show()

+---------+-------------+
|     city|Total_Revenue|
+---------+-------------+
|     Pune|   1646196535|
|Hyderabad|   1642443340|
|    Delhi|   1639639916|
|  Chennai|   1629865247|
|Bangalore|   1628527093|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
+---------+-------------+



In [None]:
revenue_by_city2.explain(True)

# PHASE 5 : Analytics
Using the cleaned dataset:
1. Total revenue per city.
2. Total revenue per category.
3. Average order value per city.
4. Top 10 products by revenue.
5. Cities sorted by revenue descending.

In [69]:
orders_completed_with_amount.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: int, order_date: string, status: string, order_date_clean: date]

In [70]:
revenue_per_city= (
    orders_completed_with_amount.groupBy("city").
    agg(F.sum("amount").alias("Total_Revenue")).
    orderBy(F.col("Total_Revenue").desc()))

In [71]:
revenue_per_city.show()

+---------+-------------+
|     city|Total_Revenue|
+---------+-------------+
|     Pune|   1646196535|
|Hyderabad|   1642443340|
|    Delhi|   1639639916|
|  Chennai|   1629865247|
|Bangalore|   1628527093|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
+---------+-------------+



In [72]:
revenue_per_category= (
    orders_completed_with_amount.groupBy("category").
    agg(F.sum("amount").alias("Total_Revenue")).
    orderBy(F.col("Total_Revenue").desc()))
revenue_per_category.show()

+-----------+-------------+
|   category|Total_Revenue|
+-----------+-------------+
|       Home|   2868467576|
|Electronics|   2867568870|
|    Grocery|   2866272106|
|    Fashion|   2834182172|
+-----------+-------------+



In [73]:
avg_order_val_per_city = (
    orders_completed_with_amount
    .groupBy("city")
    .agg(F.avg("amount").alias("avg_order_val"))
)
avg_order_val_per_city.show()

+---------+------------------+
|     city|     avg_order_val|
+---------+------------------+
|Bangalore|44098.867908689645|
|  Chennai| 43628.27900315863|
|   Mumbai| 43723.75651612556|
|  Kolkata|43709.816662630175|
|     Pune|43930.204013556424|
|    Delhi| 43817.20780331374|
|Hyderabad| 43708.74045293664|
+---------+------------------+



In [75]:
revenue_per_product = (
    orders_completed_with_amount
    .groupBy("product")
    .agg(F.sum("amount").alias("total_revenue"))
)
revenue_per_product.orderBy(F.col("total_revenue").desc()).show(5)

+-------+-------------+
|product|total_revenue|
+-------+-------------+
|    Oil|    963572869|
| Laptop|    962496295|
| Tablet|    960719999|
| Vacuum|    959149427|
|  Mixer|    957140026|
+-------+-------------+
only showing top 5 rows


In [77]:
revenue_per_city_desc= (
    orders_completed_with_amount.groupBy("city").
    agg(F.sum("amount").alias("Total_Revenue")).
    orderBy(F.col("Total_Revenue").desc()))
revenue_per_city_desc.show()


+---------+-------------+
|     city|Total_Revenue|
+---------+-------------+
|     Pune|   1646196535|
|Hyderabad|   1642443340|
|    Delhi|   1639639916|
|  Chennai|   1629865247|
|Bangalore|   1628527093|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
+---------+-------------+



PHASE 6 : Window Functions
1. Rank cities by revenue.
2. Rank products inside each category by revenue.
3. Find the top product for every category.
4. Identify the top 3 performing cities.

In [78]:
from pyspark.sql.window import Window
city_rank_window = Window.orderBy(F.col("total_revenue").desc())

ranked_cities = (
    revenue_per_city
    .withColumn("city_rank", F.rank().over(city_rank_window))
)
ranked_cities.show()

+---------+-------------+---------+
|     city|Total_Revenue|city_rank|
+---------+-------------+---------+
|     Pune|   1646196535|        1|
|Hyderabad|   1642443340|        2|
|    Delhi|   1639639916|        3|
|  Chennai|   1629865247|        4|
|Bangalore|   1628527093|        5|
|   Mumbai|   1625518096|        6|
|  Kolkata|   1624300497|        7|
+---------+-------------+---------+



In [79]:
revenue_per_category_product = (
    orders_completed
    .groupBy("category", "product")
    .agg(F.sum("amount").alias("total_revenue"))
    .orderBy(F.col("category"), F.col("total_revenue").desc())
)


w = Window.partitionBy("category").orderBy(F.col("total_revenue").desc())

ranked = (
    revenue_per_category_product
    .withColumn("rank", F.dense_rank().over(w))
    .orderBy("category", "rank", F.col("total_revenue").desc())
)

ranked.show(truncate=False)

+-----------+-----------+-------------+----+
|category   |product    |total_revenue|rank|
+-----------+-----------+-------------+----+
|Electronics|Laptop     |962496295    |1   |
|Electronics|Tablet     |960719999    |2   |
|Electronics|Mobile     |944352576    |3   |
|Fashion    |Jeans      |951286127    |1   |
|Fashion    |Shoes      |946799102    |2   |
|Fashion    |Tshirt     |936096943    |3   |
|Grocery    |Oil        |963572869    |1   |
|Grocery    |Rice       |954494237    |2   |
|Grocery    |Sugar      |948205000    |3   |
|Home       |Vacuum     |959149427    |1   |
|Home       |Mixer      |957140026    |2   |
|Home       |Airpurifier|952178123    |3   |
+-----------+-----------+-------------+----+



In [80]:
ranked.filter(F.col("rank")==1).show()

+-----------+-------+-------------+----+
|   category|product|total_revenue|rank|
+-----------+-------+-------------+----+
|Electronics| Laptop|    962496295|   1|
|    Fashion|  Jeans|    951286127|   1|
|    Grocery|    Oil|    963572869|   1|
|       Home| Vacuum|    959149427|   1|
+-----------+-------+-------------+----+



In [81]:
ranked_cities.show(3)

+---------+-------------+---------+
|     city|Total_Revenue|city_rank|
+---------+-------------+---------+
|     Pune|   1646196535|        1|
|Hyderabad|   1642443340|        2|
|    Delhi|   1639639916|        3|
+---------+-------------+---------+
only showing top 3 rows


In [83]:
city_data=[
("Delhi","North"),
("Mumbai","West"),
("Bangalore","South"),
("Hyderabad","South"),
("Pune","West"),
("Chennai","South"),
("Kolkata","East"),
]
coloumns=["city","region"]

In [84]:
city_df=spark.createDataFrame(city_data,coloumns)

In [85]:
city_df.show()

+---------+------+
|     city|region|
+---------+------+
|    Delhi| North|
|   Mumbai|  West|
|Bangalore| South|
|Hyderabad| South|
|     Pune|  West|
|  Chennai| South|
|  Kolkata|  East|
+---------+------+



1. Join the orders data with this city-region dataset.
2. Apply broadcast join explicitly.
3. Verify using the physical plan that:

In [86]:

from pyspark.sql import functions as F

joined_df = orders_completed_with_amount.join(F.broadcast(city_df), on="city", how="inner")


In [87]:
joined_df.show(3)

+---------+-----------+-----------+-----------+-------+------+----------+---------+----------------+------+
|     city|   order_id|customer_id|   category|product|amount|order_date|   status|order_date_clean|region|
+---------+-----------+-----------+-----------+-------+------+----------+---------+----------------+------+
|     Pune|ORD00000001|    C000001|    Grocery|  Sugar| 35430|2024-01-02|Completed|      2024-01-02|  West|
|     Pune|ORD00000002|    C000002|Electronics| Mobile| 65358|2024-01-03|Completed|      2024-01-03|  West|
|Bangalore|ORD00000003|    C000003|Electronics| Laptop|  5558|2024-01-04|Completed|      2024-01-04| South|
+---------+-----------+-----------+-----------+-------+------+----------+---------+----------------+------+
only showing top 3 rows


In [88]:
joined_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- Filter atleastnnonnulls(1, amount#216)
:  +- Filter (status#83 = Completed)
:     +- Project [order_id#76, customer_id#77, city#138, category#140, product#139, amount#216, order_date#82, status#83, coalesce(to_date(order_date#82, Some(yyyy-MM-dd), Some(Etc/UTC), false), to_date(order_date#82, Some(dd/MM/yyyy), Some(Etc/UTC), false), to_date(order_date#82, Some(yyyy/MM/dd), Some(Etc/UTC), false)) AS order_date_clean#304]
:        +- Project [order_id#76, customer_id#77, city#138, category#140, product#139, CASE WHEN RLIKE(amount#182, ^[0-9]+$) THEN cast(amount#182 as int) ELSE cast(null as int) END AS amount#216, order_date#82, status#83]
:           +- Project [order_id#76, customer_id#77, city#138, category#140, product#139, regexp_replace(amount#81, ,, , 1) AS amount#182, order_date#82, status#83]
:              +- Project [order_id#76, customer_id#77, city#138, initcap(trim(category#79, None)) AS category#140, product#139, 

 BroadcastHashJoin [city#138], [city#3417], Inner, BuildRight, false confirms broadcast join

Explain why broadcast join is efficient in this case.

Avoids expensive shuffle of the large table across the cluster. \
Broadcasts small table to all executors, enabling local joins. \
Reduces network and disk I/O, improving performance significantly

UDF

In [89]:

from pyspark.sql import functions as F
from pyspark.sql.types import StringType

def classify_amount(amount):
    if amount is None:
        return None
    if amount >= 80000:
        return "High"
    elif amount >= 40000:
        return "Medium"
    else:
        return "Low"

classify_amount_udf = F.udf(classify_amount, StringType())

orders_with_category = orders_completed_with_amount.withColumn(
    "order_value_category",
    classify_amount_udf(F.col("amount"))
)

distribution = (
    orders_with_category.groupBy("order_value_category")
    .agg(F.count("*").alias("count"))
    .orderBy("order_value_category")
)

distribution.show()


+--------------------+------+
|order_value_category| count|
+--------------------+------+
|                High| 27936|
|                 Low|121794|
|              Medium|111365|
+--------------------+------+



PHASE 9 : RDD
1. Convert the cleaned DataFrame to RDD.
2. Compute:
Total revenue using reduce.
Orders per city using map and reduce.
3. Explain why DataFrames are preferred over RDDs for analytics.

In [90]:
orders_rdd = orders_completed_with_amount.rdd

In [91]:

total_revenue = orders_rdd.map(lambda row: row['amount']).reduce(lambda x, y: x + y)
print(f"Total Revenue: {total_revenue}")


Total Revenue: 11436490724


In [92]:

orders_per_city = orders_rdd.map(lambda row: (row['city'], 1)).reduceByKey(lambda x, y: x + y)
print("Orders per City:")
for city, count in orders_per_city.collect():
    print(f"{city}: {count}")


Orders per City:
Pune: 37473
Delhi: 37420
Mumbai: 37177
Hyderabad: 37577
Bangalore: 36929
Kolkata: 37161
Chennai: 37358


. Why DataFrames are Preferred Over RDDs for Analytics

Optimizations: DataFrames use Catalyst optimizer and Tungsten engine for query optimization and efficient execution. \
Ease of Use: High-level APIs (SQL-like) make complex operations simpler compared to verbose RDD transformations. \
Performance: DataFrames leverage columnar storage and optimized execution plans, whereas RDDs operate at a lower level without these optimizations

PHASE 10 :Caching
1. Identify datasets reused in multiple queries.
2. Apply cache().
3. Execute multiple aggregations.
4. Compare performance.
5. Unpersist after use.

1. Write cleaned dataset to Parquet Partitioned by city

In [94]:

orders_completed_with_amount.write \
    .mode("overwrite") \
    .partitionBy("city") \
    .parquet("data/orders_clean.parquet")

In [95]:

revenue_per_city.write \
    .mode("overwrite") \
    .orc("data/revenue_city_orc")

In [98]:
parquet_df=spark.read.parquet("data/orders_clean.parquet")
parquet_df.show(3)
parquet_df.count()

+-----------+-----------+--------+-------+------+----------+---------+----------------+-------+
|   order_id|customer_id|category|product|amount|order_date|   status|order_date_clean|   city|
+-----------+-----------+--------+-------+------+----------+---------+----------------+-------+
|ORD00000027|    C000027| Fashion|  Jeans| 79049|2024-01-28|Completed|      2024-01-28|Chennai|
|ORD00000031|    C000031| Grocery|    Oil| 51151|2024-02-01|Completed|      2024-02-01|Chennai|
|ORD00000032|    C000032|    Home| Vacuum| 75797|2024-02-02|Completed|      2024-02-02|Chennai|
+-----------+-----------+--------+-------+------+----------+---------+----------------+-------+
only showing top 3 rows


261095

In [100]:
orc_df=spark.read.orc("data/revenue_city_orc")
orc_df.show()

+---------+-------------+
|     city|Total_Revenue|
+---------+-------------+
|     Pune|   1646196535|
|Hyderabad|   1642443340|
|    Delhi|   1639639916|
|  Chennai|   1629865247|
|Bangalore|   1628527093|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
+---------+-------------+



most used df: orders_completed_with_amount

PHASE 12 : Debugging
Explain why this breaks:

df = df.filter(df.amount > 50000).show()

And why after this line df is no longer a DataFrame.

show function returns a none output .so df will be none