In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Orders Cleaning').getOrCreate()


# PHASE 1 — INGESTION & FIRST INSPECTION

1. Read the CSV file into a DataFrame

In [2]:
df_csv=spark.read.option('header','true').csv('orders_large_bad.csv')

2. Disable schema inference and read everything as string

In [3]:
df_csv_raw=spark.read.option('header','true').option('inferSchema','false').csv('orders_large_bad.csv')
df_csv_raw.show()


+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|      Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|      Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|       Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|

3. Print schema and record count

In [4]:
df_csv_raw.printSchema()
df_csv_raw.count()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



300000

4. Display 20 random rows

In [5]:
df_csv_raw.orderBy("order_id").sample(0.01).show(20,False)

+-----------+-----------+-----------+-----------+-------+-------+----------+---------+
|order_id   |customer_id|city       |category   |product|amount |order_date|status   |
+-----------+-----------+-----------+-----------+-------+-------+----------+---------+
|ORD00000063|C000063    |Hyderabad  |Grocery    |Sugar  |23023  |2024-01-04|Completed|
|ORD00000133|C000133    |Kolkata    |Home       |Vacuum |invalid|2024-01-14|Completed|
|ORD00000169|C000169    |Mumbai     |Grocery    |Sugar  |52565  |2024/02/19|Completed|
|ORD00000188|C000188    |Bangalore  |Electronics|Tablet |82780  |2024-01-09|Completed|
|ORD00000282|C000282    |Pune       |Grocery    |Rice   |70336  |2024-02-12|Completed|
|ORD00000381|C000381    |Delhi      |Fashion    |Jeans  |79839  |2024-01-22|Completed|
|ORD00000417|C000417    |Delhi      |Home       |Mixer  |23287  |2024-02-27|Completed|
|ORD00000423|C000423    |Chennai    |Home       |Mixer  |88763  |2024-01-04|Completed|
|ORD00000466|C000466    |Hyderabad  |Grocer

5. Identify at least 5 data quality issues by observation

1. Leading/trailing spaces in city, category, product.

2. Mixed case values(Electronics,electronics)

3. Invalid amount values(null,"",invalid, 12,000)

4. Multiple date formats

5. Duplicate order_id

6. Inconsistent status values

6. Read the JSON file and compare schema and row count with CSV

In [6]:
df_json=spark.read.option("multiline", True).json("orders_large_bad.json")
df_json.printSchema()
df_json.count()

root
 |-- amount: string (nullable = true)
 |-- category: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- status: string (nullable = true)



1

# PHASE 2 — SCHEMA ENFORCEMENT & VALIDATION

7. Define an explicit schema using StructType

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

In [8]:
schema=StructType([
    StructField("order_id",StringType(),False),
    StructField("customer_id",StringType(),True),
    StructField("city",StringType(),True),
    StructField("category",StringType(),True),
    StructField("product",StringType(),True),
    StructField("amount",StringType(),True),
    StructField("order_date",StringType(),True),
    StructField("status",StringType(),True)
])

8. Re-read the CSV using the defined schema

In [9]:
df=spark.read.option('header','True').schema(schema).csv('orders_large_bad.csv')


9. Identify rows that fail schema expectations

In [10]:
df.filter(df.order_id.isNull()).count()

0

10.Explain why schema inference is dangerous at scale

1. Different partitions infer different types.

2. Breaks downstream jobs.

3. Expensive full scans

4. Non-deterministic schemas at scale

# PHASE 3 — STRING CLEANING & STANDARDIZATION

11. Trim leading and trailing spaces from all string columns

In [11]:
from pyspark.sql.functions import col,trim

for c in df.columns:
  df=df.withColumn(c,trim(col(c)))

12. Standardize city , category , and product values

In [12]:
from pyspark.sql.functions import lower

In [13]:
df=df.withColumn("city",lower(col("city")))\
.withColumn("category",lower(col("category")))\
.withColumn("product",lower(col("product")))


13. Convert all categorical columns to a consistent case

In [14]:
df=df.withColumn("status",lower(col("status")))

14. Identify how many distinct city values existed before vs after cleaning

In [15]:
df_csv_raw.select("city").distinct().count()

14

In [16]:
df.select("city").distinct().count()

7

# PHASE 4 — AMOUNT CLEANING (CRITICAL)

15. Identify invalid values in the amount column

In [17]:
df.filter(~col("amount").rlike("^[0-9]+$")).count()

28147

16. Remove commas from numeric strings

In [18]:
from pyspark.sql.functions import regexp_replace
df=df.withColumn("amount_clean",regexp_replace(col("amount"),",",""))


17. Convert amount to IntegerType safely

In [20]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
df=df.withColumn("amount_int",F.when(col("amount_clean").rlike("^[0-9]+$"),col("amount_clean").cast(IntegerType())).otherwise(None)
                 )

18. Handle empty, null, and invalid values explicitly

In [21]:
df=df.filter(col("amount_int").isNotNull())

19. Count how many records were affected during amount cleaning

In [22]:
initial_count = df_csv_raw.rdd.count()
cleaned_count = df.count()
print(initial_count - cleaned_count)

25164


# PHASE 5 — DATE PARSING & NORMALIZATION

20. Identify all date formats present in order_date

yyyy-MM-dd

dd/MM/yyyy

yyyy/MM/dd

21. Parse valid dates into DateType

22. Handle invalid dates gracefully

23. Create a clean order_date_clean column


In [23]:
from pyspark.sql.functions import try_to_timestamp,lit,coalesce,col,date_format,to_date
df=df.withColumn("order_date_clean",
                 coalesce(
                      try_to_timestamp(col("order_date"), lit("yyyy-MM-dd")),
                      try_to_timestamp(col("order_date"), lit("dd-MM-yyyy")),
                      try_to_timestamp(col("order_date"), lit("MM-dd-yyyy")),
                      try_to_timestamp(col("order_date"), lit("dd/MM/yyyy")),
                      try_to_timestamp(col("order_date"), lit("MM/dd/yyyy")),
                      try_to_timestamp(col("order_date"), lit("yyyy/MM/dd"))
    ).cast(DateType()))


24. Count records with invalid dates

In [24]:
df.filter(col("order_date_clean").isNull()).count()

2378

# PHASE 6 — BUSINESS FILTERING & DEDUPLICATION

25. Identify duplicate order_id values

In [25]:
df.groupBy("order_id").count().filter(col("count")>1).show()

+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



26. Remove duplicate orders safely

In [26]:
df=df.dropDuplicates(["order_id"])

27. Keep only records with status = Completed

In [27]:
df=df.filter(col("status")=="completed")

28. Validate record counts before and after filtering

In [28]:
df.count()

261095

# PHASE 7 — PERFORMANCE & PARTITION AWARENESS

29. Check the default number of partitions

In [29]:
df.rdd.getNumPartitions()

2

30. Run a heavy groupBy and observe execution time

31. Use explain(True) to identify shuffle stages

33.Compare execution plans before and after repartition

In [30]:
df.groupBy("city").sum("amount_int").explain(True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, unresolvedalias('sum(amount_int#209))]
+- Filter (status#182 = completed)
   +- Deduplicate [order_id#171]
      +- Project [order_id#171, customer_id#172, city#179, category#180, product#181, amount#176, order_date#177, status#182, amount_clean#208, amount_int#209, cast(coalesce(try_to_timestamp(order_date#177, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#177, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#177, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#177, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#177, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(order_date#177, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS order_date_clean#233]
         +- Filter isnotnull(amount_int#209)
            +- Project [order_id#171, customer_id#17

32. Repartition the DataFrame by city

In [31]:
df=df.repartition("city")

# PHASE 8 — ANALYTICS ON LARGE DATA

34. Calculate total revenue per city

In [32]:
from pyspark.sql.functions import sum,avg

In [43]:
from pyspark.sql.functions import sum
rev_city=df.groupBy("city").agg(sum("amount_int").alias("total revenue"))
rev_city.show()

+---------+-------------+
|     city|total revenue|
+---------+-------------+
|  chennai|   1629865247|
|    delhi|   1639639916|
|bangalore|   1628527093|
|hyderabad|   1642443340|
|  kolkata|   1624300497|
|   mumbai|   1625518096|
|     pune|   1646196535|
+---------+-------------+



35. Calculate total revenue per category

In [37]:
rev_category=df.groupBy("category").agg(sum("amount_int").alias("total revenue"))
rev_category.show()

+-----------+-------------+
|   category|total revenue|
+-----------+-------------+
|    grocery|   2866272106|
|electronics|   2867568870|
|       home|   2868467576|
|    fashion|   2834182172|
+-----------+-------------+



36. Calculate total revenue per product

In [38]:
rev_product=df.groupBy("product").agg(sum("amount_int").alias("total revenue"))
rev_product.show()

+-----------+-------------+
|    product|total revenue|
+-----------+-------------+
|      shoes|    946799102|
|     vacuum|    959149427|
|airpurifier|    952178123|
|     mobile|    944352576|
|     tablet|    960719999|
|      sugar|    948205000|
|     laptop|    962496295|
|      mixer|    957140026|
|      jeans|    951286127|
|       rice|    954494237|
|     tshirt|    936096943|
|        oil|    963572869|
+-----------+-------------+



37. Identify top 10 products by revenue

In [39]:
rev_product.orderBy(col("total revenue").desc()).show(10, False)

+-----------+-------------+
|product    |total revenue|
+-----------+-------------+
|oil        |963572869    |
|laptop     |962496295    |
|tablet     |960719999    |
|vacuum     |959149427    |
|mixer      |957140026    |
|rice       |954494237    |
|airpurifier|952178123    |
|jeans      |951286127    |
|sugar      |948205000    |
|shoes      |946799102    |
+-----------+-------------+
only showing top 10 rows


38. Calculate average order value per city

In [40]:
avg_order_value_city=df.groupBy("city").agg(avg("amount_int").alias("average order value"))
avg_order_value_city.show()

+---------+-------------------+
|     city|average order value|
+---------+-------------------+
|  chennai|  43628.27900315863|
|    delhi|  43817.20780331374|
|bangalore| 44098.867908689645|
|hyderabad|  43708.74045293664|
|  kolkata| 43709.816662630175|
|   mumbai|  43723.75651612556|
|     pune| 43930.204013556424|
+---------+-------------------+



# PHASE 9 — WINDOW FUNCTIONS (BIG DATA SAFE)

39. Rank cities by total revenue

In [41]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number,rank,dense_rank

In [44]:
w=Window.orderBy(col("total revenue").desc())
rev_city.withColumn("rank",rank().over(w)).show()

+---------+-------------+----+
|     city|total revenue|rank|
+---------+-------------+----+
|     pune|   1646196535|   1|
|hyderabad|   1642443340|   2|
|    delhi|   1639639916|   3|
|  chennai|   1629865247|   4|
|bangalore|   1628527093|   5|
|   mumbai|   1625518096|   6|
|  kolkata|   1624300497|   7|
+---------+-------------+----+



40. Rank products within each category by revenue

In [45]:
product_revenue_by_category = df.groupBy("category", "product").agg(sum("amount_int").alias("total_revenue"))

window_spec_category = Window.partitionBy("category").orderBy(col("total_revenue").desc())

ranked_products_by_category = product_revenue_by_category.withColumn("rank", rank().over(window_spec_category))
ranked_products_by_category.show()

+-----------+-----------+-------------+----+
|   category|    product|total_revenue|rank|
+-----------+-----------+-------------+----+
|electronics|     laptop|    962496295|   1|
|electronics|     tablet|    960719999|   2|
|electronics|     mobile|    944352576|   3|
|    fashion|      jeans|    951286127|   1|
|    fashion|      shoes|    946799102|   2|
|    fashion|     tshirt|    936096943|   3|
|    grocery|        oil|    963572869|   1|
|    grocery|       rice|    954494237|   2|
|    grocery|      sugar|    948205000|   3|
|       home|     vacuum|    959149427|   1|
|       home|      mixer|    957140026|   2|
|       home|airpurifier|    952178123|   3|
+-----------+-----------+-------------+----+



41. Identify the top product per category

In [46]:
top_product_per_category = ranked_products_by_category.filter(col("rank") == 1)
top_product_per_category.show()

+-----------+-------+-------------+----+
|   category|product|total_revenue|rank|
+-----------+-------+-------------+----+
|electronics| laptop|    962496295|   1|
|    fashion|  jeans|    951286127|   1|
|    grocery|    oil|    963572869|   1|
|       home| vacuum|    959149427|   1|
+-----------+-------+-------------+----+



42. Identify top 3 cities using window functions

In [47]:
w=Window.orderBy(col("total revenue").desc())
top_3_cities = rev_city.withColumn("rank",rank().over(w)).filter(col("rank")<=3)
top_3_cities.show()

+---------+-------------+----+
|     city|total revenue|rank|
+---------+-------------+----+
|     pune|   1646196535|   1|
|hyderabad|   1642443340|   2|
|    delhi|   1639639916|   3|
+---------+-------------+----+



# PHASE 10 — CACHING & REUSE

43. Identify DataFrames reused multiple times

In [48]:
# DataFrames reused multiple times and good candidates for caching:
# 1. `df`: This is the main DataFrame after extensive cleaning and transformations, used in almost all subsequent analysis steps.
# 2. `df_csv_raw`: Used for initial inspection, schema comparison, and calculating the initial record count.
# 3. `rev_city`: Used to calculate total revenue per city and then for ranking cities.
# 4. `rev_product`: Used to calculate total revenue per product and then for identifying top products.
# 5. `product_revenue_by_category`: Used as an intermediate result before ranking products within categories.
# 6. `ranked_products_by_category`: Used for displaying ranked products and then for identifying the top product per category.

# Example of how you would cache a DataFrame:
# df.cache()
# rev_city.cache()
# etc.

44. Apply caching strategically

In [49]:
df.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, amount_clean: string, amount_int: int, order_date_clean: date]

45. Re-run analytics and observe performance

In [50]:
print("Re-running analytics after caching 'df'...")

# 34. Calculate total revenue per city
rev_city=df.groupBy("city").agg(sum("amount_int").alias("total revenue"))
print("\nTotal revenue per city:")
rev_city.show()

# 35. Calculate total revenue per category
rev_category=df.groupBy("category").agg(sum("amount_int").alias("total revenue"))
print("\nTotal revenue per category:")
rev_category.show()

# 36. Calculate total revenue per product
rev_product=df.groupBy("product").agg(sum("amount_int").alias("total revenue"))
print("\nTotal revenue per product:")
rev_product.show()

# 37. Identify top 10 products by revenue
print("\nTop 10 products by revenue:")
rev_product.orderBy(col("total revenue").desc()).show(10, False)

# 38. Calculate average order value per city
avg_order_value_city=df.groupBy("city").agg(avg("amount_int").alias("average order value"))
print("\nAverage order value per city:")
avg_order_value_city.show()

# 39. Rank cities by total revenue
w_city=Window.orderBy(col("total revenue").desc())
ranked_cities = rev_city.withColumn("rank",rank().over(w_city))
print("\nRanked cities by total revenue:")
ranked_cities.show()

# 40. Rank products within each category by revenue
product_revenue_by_category = df.groupBy("category", "product").agg(sum("amount_int").alias("total_revenue"))
window_spec_category = Window.partitionBy("category").orderBy(col("total_revenue").desc())
ranked_products_by_category = product_revenue_by_category.withColumn("rank", rank().over(window_spec_category))
print("\nRanked products within each category by revenue:")
ranked_products_by_category.show()

# 41. Identify the top product per category
top_product_per_category = ranked_products_by_category.filter(col("rank") == 1)
print("\nTop product per category:")
top_product_per_category.show()

# 42. Identify top 3 cities using window functions
w_top_3_cities=Window.orderBy(col("total revenue").desc())
top_3_cities = rev_city.withColumn("rank",rank().over(w_top_3_cities)).filter(col("rank")<=3)
print("\nTop 3 cities:")
top_3_cities.show()

Re-running analytics after caching 'df'...

Total revenue per city:
+---------+-------------+
|     city|total revenue|
+---------+-------------+
|  chennai|   1629865247|
|    delhi|   1639639916|
|bangalore|   1628527093|
|hyderabad|   1642443340|
|  kolkata|   1624300497|
|   mumbai|   1625518096|
|     pune|   1646196535|
+---------+-------------+


Total revenue per category:
+-----------+-------------+
|   category|total revenue|
+-----------+-------------+
|    grocery|   2866272106|
|electronics|   2867568870|
|       home|   2868467576|
|    fashion|   2834182172|
+-----------+-------------+


Total revenue per product:
+-----------+-------------+
|    product|total revenue|
+-----------+-------------+
|      shoes|    946799102|
|     vacuum|    959149427|
|airpurifier|    952178123|
|     mobile|    944352576|
|     tablet|    960719999|
|      sugar|    948205000|
|     laptop|    962496295|
|      mixer|    957140026|
|      jeans|    951286127|
|       rice|    954494237|

# 46. Unpersist when cache is no longer needed

In [51]:
df.unpersist()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, amount_clean: string, amount_int: int, order_date_clean: date]

# PHASE 11 — FILE FORMAT STRATEGY

48. Write the cleaned order-level dataset to Parquet

In [52]:
df.write.mode("overwrite").parquet("cleaned_orders.parquet")

49. Partition the Parquet output by city


In [53]:
df.write.mode("overwrite").partitionBy("city").parquet("cleaned_orders_partitioned.parquet")

49. Partition the Parquet output by city

In [54]:
df.write.mode("overwrite").partitionBy("city").parquet("cleaned_orders_partitioned.parquet")

50. Write aggregated analytics to ORC

In [55]:
rev_city.write.mode("overwrite").orc("rev_city.orc")
rev_category.write.mode("overwrite").orc("rev_category.orc")
rev_product.write.mode("overwrite").orc("rev_product.orc")

51. Read both formats back and validate schema

In [56]:

df_parquet_read = spark.read.parquet("cleaned_orders.parquet")
print("Schema of Parquet file (cleaned_orders.parquet):")
df_parquet_read.printSchema()


rev_city_orc_read = spark.read.orc("rev_city.orc")
print("\nSchema of ORC file (rev_city.orc):")
rev_city_orc_read.printSchema()

rev_category_orc_read = spark.read.orc("rev_category.orc")
print("\nSchema of ORC file (rev_category.orc):")
rev_category_orc_read.printSchema()

rev_product_orc_read = spark.read.orc("rev_product.orc")
print("\nSchema of ORC file (rev_product.orc):")
rev_product_orc_read.printSchema()

Schema of Parquet file (cleaned_orders.parquet):
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)


Schema of ORC file (rev_city.orc):
root
 |-- city: string (nullable = true)
 |-- total revenue: long (nullable = true)


Schema of ORC file (rev_category.orc):
root
 |-- category: string (nullable = true)
 |-- total revenue: long (nullable = true)


Schema of ORC file (rev_product.orc):
root
 |-- product: string (nullable = true)
 |-- total revenue: long (nullable = true)



52. Compare number of output files generated

In [57]:
import os

print("Files for unpartitioned Parquet (cleaned_orders.parquet):")
!ls -l cleaned_orders.parquet

print("\nFiles for partitioned Parquet (cleaned_orders_partitioned.parquet):")
!ls -lR cleaned_orders_partitioned.parquet

print("\nFiles for ORC (rev_city.orc):")
!ls -l rev_city.orc

print("\nFiles for ORC (rev_category.orc):")
!ls -l rev_category.orc

print("\nFiles for ORC (rev_product.orc):")
!ls -l rev_product.orc

Files for unpartitioned Parquet (cleaned_orders.parquet):
total 6984
-rw-r--r-- 1 root root 3054173 Dec 26 06:50 part-00000-875edfed-d4a5-440e-a60c-d7c265f56b11-c000.snappy.parquet
-rw-r--r-- 1 root root 3061343 Dec 26 06:50 part-00001-875edfed-d4a5-440e-a60c-d7c265f56b11-c000.snappy.parquet
-rw-r--r-- 1 root root 1029334 Dec 26 06:50 part-00002-875edfed-d4a5-440e-a60c-d7c265f56b11-c000.snappy.parquet
-rw-r--r-- 1 root root       0 Dec 26 06:50 _SUCCESS

Files for partitioned Parquet (cleaned_orders_partitioned.parquet):
cleaned_orders_partitioned.parquet:
total 28
drwxr-xr-x 2 root root 4096 Dec 26 06:51 'city=bangalore'
drwxr-xr-x 2 root root 4096 Dec 26 06:52 'city=chennai'
drwxr-xr-x 2 root root 4096 Dec 26 06:52 'city=delhi'
drwxr-xr-x 2 root root 4096 Dec 26 06:51 'city=hyderabad'
drwxr-xr-x 2 root root 4096 Dec 26 06:52 'city=kolkata'
drwxr-xr-x 2 root root 4096 Dec 26 06:52 'city=mumbai'
drwxr-xr-x 2 root root 4096 Dec 26 06:52 'city=pune'
-rw-r--r-- 1 root root    0 Dec 26 06:

# PHASE 12 — DEBUGGING & FAILURE SCENARIOS

53. Explain why the following line breaks pipelines:

df = df.filter(df.amount > 50000).show()

In [58]:
# The line `df = df.filter(df.amount > 50000).show()` breaks pipelines for the following reason:
#
# 1.  **`.show()` is an Action, not a Transformation**: In Apache Spark, operations are categorized as transformations or actions.
#     *   **Transformations** create a new DataFrame from an existing one (e.g., `filter()`, `withColumn()`, `groupBy()`). They are lazily evaluated and return a new DataFrame.
#     *   **Actions** trigger the execution of all preceding transformations and return results to the driver program (e.g., `show()`, `count()`, `collect()`, `write()`).
#
# 2.  **Return Value of `.show()`**: The `.show()` method prints the contents of the DataFrame to the console but it *returns `None`*.
#
# 3.  **Variable Reassignment**: When you execute `df = df.filter(df.amount > 50000).show()`, the variable `df` is reassigned the return value of `.show()`, which is `None`.
#
# **Consequence**: After this line, `df` is no longer a Spark DataFrame object; it's `None`. Any subsequent code that attempts to perform DataFrame operations on `df` will fail with an error like `AttributeError: 'NoneType' object has no attribute 'filter'` (or similar), thus breaking the entire pipeline.

54. Create a scenario that produces a NoneType error

In [59]:
from pyspark.sql import SparkSession

df_sample = spark.createDataFrame([
    (1, "apple"), (2, "banana"), (3, "cherry")
], ["id", "fruit"])

print("Original DataFrame:")
df_sample.show()

print("\nAttempting to incorrectly reassign df_sample with .show() result...")
df_sample = df_sample.filter(df_sample.id > 1).show()


print("\nAttempting to perform an operation on the now NoneType df_sample:")
df_sample.count()

Original DataFrame:
+---+------+
| id| fruit|
+---+------+
|  1| apple|
|  2|banana|
|  3|cherry|
+---+------+


Attempting to incorrectly reassign df_sample with .show() result...
+---+------+
| id| fruit|
+---+------+
|  2|banana|
|  3|cherry|
+---+------+


Attempting to perform an operation on the now NoneType df_sample:


AttributeError: 'NoneType' object has no attribute 'count'

55. Identify a transformation that causes a wide shuffle

A common transformation that causes a 'wide shuffle' in Spark is groupBy(). This happens because Spark needs to collect all rows with the same grouping key onto the same partition to perform the aggregation. This redistribution of data across the network and executors is what constitutes a wide shuffle.

56. Explain how you would debug a slow Spark job

1. Check Spark UI – Identify slow stages, skewed tasks, and heavy shuffles.
2. Look for data skew – Uneven key distribution; fix with salting or repartitioning.
3. Optimize transformations – Use reduceByKey instead of groupByKey, filter early, cache reused data.
4. Tune configs – Adjust executor memory/cores, spark.sql.shuffle.partitions.
5. Monitor GC & spills – Increase memory or use Kryo serialization if needed.
6. Avoid slow UDFs – Prefer built-in functions or Pandas UDFs.
Check logs & metrics – Use Spark History Server and executor logs.

# PHASE 13 — FINAL VALIDATION

57. Validate no nulls in critical columns

In [62]:
print("Number of nulls in order_id:", df.filter(col("order_id").isNull()).count())
print("Number of nulls in amount_int:", df.filter(col("amount_int").isNull()).count())
print("Number of nulls in order_date_clean:", df.filter(col("order_date_clean").isNull()).count())

Number of nulls in order_id: 0
Number of nulls in amount_int: 0
Number of nulls in order_date_clean: 2261


58. Confirm correct data types for all columns

In [63]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_clean: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)



59. Validate final record count

In [64]:
df.count()

261095

60. Document three optimization decisions you made

1. Caching the df DataFrame: Caching the main DataFrame (df.cache()) after transformations speeds up repeated analytical queries by avoiding recomputing its lineage.


2. Repartioning by city: Repartitioning the DataFrame by city before groupBy operations reduces shuffle overhead by co-locating data with the same key, improving aggregation efficiency.


3. Writing to Columnar Formats with Partitioning: Saving data to optimized columnar formats like Parquet/ORC and partitioning by city improves read performance for analytical queries, enabling predicate pushdown and reducing I/O.1