In [174]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()

In [175]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [176]:
schema2 = StructType([
    StructField("order_id", StringType(), False),
    StructField("customer_id", StringType(), False),
    StructField("city", StringType(), True),
    StructField("category", StringType(), True),
    StructField("product", StringType(), True),
    StructField("amount",StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status", StringType(), True)
])

 Read the CSV file into a DataFrame

In [177]:
df_csv_1=spark.read \
        .option("header",True)\
        .option("inferSchema",False) \
        .csv("orders_large_bad.csv")
df_csv_1.show()
df_csv_1.printSchema()

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|      Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|      Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|       Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|

In [178]:
df_csv_1.count()

300000

#Identify at least 5 data quality issues by observation

-> invalid value in price coloumn . not int \
-> order_date no consistent date format \
-> inconsistency in case in category coloumn \
-> cancelled orders \
->inconsistent spaces in city column

In [179]:
df_json_1=spark.read.json("orders_large_bad.json")
df_json_1.count()

300000

 Read the JSON le and compare schema and row count with CSV

In [180]:
df_json_1.printSchema()

root
 |-- amount: string (nullable = true)
 |-- category: string (nullable = true)
 |-- city: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- status: string (nullable = true)



 Re-read the CSV using the defined schema




In [181]:
df_csv=spark.read \
        .option("header",True)\
        .schema(schema2) \
        .csv("orders_large_bad.csv")
df_csv.show()
df_csv.printSchema()

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|      Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|      Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|       Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|

#Explain why schema inference is dangerous at scale
Slow and costly : Spark scans data to guess types, which is expensive on large datasets.\
Unpredictable results: Mixed formats across files can lead to wrong types or silent nulls.\
Data loss risk : Invalid values get dropped without warning when inference guesses wrong.

 Trim leading and trailing spaces from all string columns \
. Standardize city , category , and product values\
 Convert all categorical columns to a consistent case\
. Identify how many distinct city values existed before vs after cleaning

In [182]:
df_csv.select("city").distinct().count()

14

In [183]:
df_csv = df_csv.withColumn(
    "city",
    F.initcap(F.trim(F.col("city")))
)
df_csv = df_csv.withColumn(
    "product",
    F.initcap(F.trim(F.col("product")))
)
df_csv = df_csv.withColumn(
    "category",
     F.initcap(F.trim(F.col("category")))
)
df_csv = df_csv.withColumn(
    "status",
    F.initcap(F.trim(F.col("status")))
)

In [184]:
df_csv.show()

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|
|ORD00000008|    C000

distinct city count after cleaning

In [185]:
df_csv.select("city").distinct().count()

7

 Identify invalid values in the amount column \
. Remove commas from numeric strings \
. Convert amount to IntegerType safely \
. Handle empty, null, and invalid values explicitly \
. Count how many records were affected during amount cleaning


In [186]:
df_csv.filter(F.col("amount").isNotNull()).count()

290626

In [187]:
df_csv=df_csv.withColumn("amount",F.regexp_replace(F.col("amount"),",",""))

In [188]:

from pyspark.sql import functions as F

int_regex = r"^[+-]?\d+$"

counts = df_csv.select(
    F.count("*").alias("total_rows"),
    F.sum(F.col("amount").isNull().cast("int")).alias("null_count"),
    F.sum((F.trim(F.col("amount")) == "").cast("int")).alias("empty_count"),
    F.sum(F.regexp_replace(F.trim(F.col("amount")), ",", "").rlike(int_regex).cast("int")).alias("valid_integer_count"),
    F.sum((~F.regexp_replace(F.trim(F.col("amount")), ",", "").rlike(int_regex) &
           F.col("amount").isNotNull() &
           (F.trim(F.col("amount")) != "")).cast("int")).alias("invalid_count")
).collect()[0].asDict()

print(counts)


{'total_rows': 300000, 'null_count': 9374, 'empty_count': 0, 'valid_integer_count': 274836, 'invalid_count': 15790}


In [189]:
df_csv = df_csv.withColumn(
    "amount",
    F.when(F.col("amount").rlike("^[0-9]+$"), F.col("amount").cast("int"))
     .otherwise(None)
)

In [190]:
df_csv.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



 Identify all date formats present in order_date \
. Parse valid dates into DateType \
. Handle invalid dates gracefully \
. Create a clean order_date_clean column \
. Count records with invalid dates

In [191]:
df_csv.filter(F.col("order_date").isNull()).count()

0

In [192]:

invalid_date_count = df_csv.filter(F.lower(F.trim(F.col("order_date"))) == "invalid_date").count()
print(invalid_date_count)


2595


In [193]:
df_csv = df_csv.withColumn(
    "order_date",
    F.coalesce(
        F.to_date("order_date", "yyyy-MM-dd"),
        F.to_date("order_date", "dd-MM-yyyy"),
        F.to_date("order_date", "dd/MM/yyyy"),
        F.to_date("order_date", "yyyy/MM/dd")
    )
)

In [194]:
df_csv.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|  NULL|2024-01-01|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar| 42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|2024-01-08|Completed|
|ORD00000008|    C000008|Bangalo

In [195]:
df_csv.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)



 Identify duplicate order_id values

In [196]:


duplicate_count = (
    df_csv.groupBy("order_id")
    .count()
    .filter(F.col("count") > 1)
    .count()
)

print(duplicate_count)


0


In [197]:

cancelled_count = df_csv.filter(df_csv.status == "Cancelled").count()
print(cancelled_count)


15000


Keep only records with status = Completed

In [198]:
orders_completed=df_csv.filter(F.col("status")=="Completed")

In [199]:
orders_completed.count()

285000

 Check the default number of partitions

In [200]:
print(orders_completed.rdd.getNumPartitions())

2


In [201]:

df_grouped = (
    df_csv.groupBy("city", "category", "product")
          .agg(
              F.sum("amount").alias("total_amount"),
              F.countDistinct("order_id").alias("order_count"),
              F.avg("amount").alias("avg_amount")
          )
)
df_grouped.show()


+---------+-----------+-----------+------------+-----------+------------------+
|     city|   category|    product|total_amount|order_count|        avg_amount|
+---------+-----------+-----------+------------+-----------+------------------+
|     Pune|    Fashion|      Jeans|   143079026|       3587|43555.259056316594|
|Hyderabad|Electronics|     Laptop|   147952416|       3653| 43618.04716981132|
|  Kolkata|       Home|      Mixer|   141288194|       3576|  43273.5663093415|
|Bangalore|       Home|      Mixer|   140729963|       3527|44074.526464140305|
|    Delhi|Electronics|     Tablet|   148649921|       3701| 44280.58415251713|
|Bangalore|       Home|Airpurifier|   145064582|       3613|  43510.6724655069|
|Bangalore|    Grocery|       Rice|   140049951|       3443| 44530.98600953895|
|     Pune|Electronics|     Mobile|   142884549|       3557| 43735.70523415978|
|Hyderabad|    Grocery|       Rice|   143881300|       3560| 43986.94588810761|
|    Delhi|       Home|      Mixer|   14

In [202]:
df_grouped.explain(True)

== Parsed Logical Plan ==
'Aggregate ['city, 'category, 'product], ['city, 'category, 'product, 'sum('amount) AS total_amount#3902, 'count(distinct 'order_id) AS order_count#3903, 'avg('amount) AS avg_amount#3904]
+- Project [order_id#3654, customer_id#3655, city#3702, category#3704, product#3703, amount#3788, coalesce(to_date(order_date#3660, Some(yyyy-MM-dd), Some(Etc/UTC), false), to_date(order_date#3660, Some(dd-MM-yyyy), Some(Etc/UTC), false), to_date(order_date#3660, Some(dd/MM/yyyy), Some(Etc/UTC), false), to_date(order_date#3660, Some(yyyy/MM/dd), Some(Etc/UTC), false)) AS order_date#3815, status#3705]
   +- Project [order_id#3654, customer_id#3655, city#3702, category#3704, product#3703, CASE WHEN RLIKE(amount#3758, ^[0-9]+$) THEN cast(amount#3758 as int) ELSE cast(null as int) END AS amount#3788, order_date#3660, status#3705]
      +- Project [order_id#3654, customer_id#3655, city#3702, category#3704, product#3703, regexp_replace(amount#3659, ,, , 1) AS amount#3758, order_dat

In [203]:
orders_completed_1= orders_completed.repartition("city")

In [204]:

df_grouped2 = (
    orders_completed_1.groupBy("city", "category", "product")
          .agg(
              F.sum("amount").alias("total_amount"),
              F.countDistinct("order_id").alias("order_count"),
              F.avg("amount").alias("avg_amount")
          )
)
df_grouped2.show()

+---------+-----------+-----------+------------+-----------+------------------+
|     city|   category|    product|total_amount|order_count|        avg_amount|
+---------+-----------+-----------+------------+-----------+------------------+
|Bangalore|Electronics|     Laptop|   137366576|       3424| 44155.11925425908|
|Bangalore|    Fashion|      Jeans|   139503946|       3439| 44188.76971808679|
|Bangalore|    Grocery|      Sugar|   136603472|       3332|44395.018524536885|
|Bangalore|    Grocery|        Oil|   134988449|       3347| 44477.24843492587|
|Bangalore|       Home|      Mixer|   133642196|       3336| 44296.38581372224|
|Bangalore|       Home|Airpurifier|   136819482|       3415| 43489.98156389066|
|Bangalore|    Grocery|       Rice|   133182359|       3270|44587.331436223634|
|Bangalore|    Fashion|      Shoes|   136455773|       3396| 43693.81139929555|
|Bangalore|       Home|     Vacuum|   135421563|       3337| 44154.40593413759|
|Bangalore|Electronics|     Tablet|   13

 Calculate total revenue per city \
. Calculate total revenue per category \
. Calculate total revenue per product \
. Identify top 10 products by revenue \
. Calculate average order value per city

In [205]:
revenue_per_city = (
    orders_completed_1
    .groupBy("city")
    .agg(F.sum("amount").alias("total_revenue"))
)
revenue_per_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|   1628527093|
|  Chennai|   1629865247|
|   Mumbai|   1625518096|
|  Kolkata|   1624300497|
|     Pune|   1646196535|
|    Delhi|   1639639916|
|Hyderabad|   1642443340|
+---------+-------------+



In [206]:
revenue_per_category = (
    orders_completed_1
    .groupBy("category")
    .agg(F.sum("amount").alias("total_revenue"))
)
revenue_per_category.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|   2868467576|
|    Fashion|   2834182172|
|    Grocery|   2866272106|
|Electronics|   2867568870|
+-----------+-------------+



In [207]:
revenue_per_product = (
    orders_completed_1
    .groupBy("product")
    .agg(F.sum("amount").alias("total_revenue"))
)
revenue_per_product.show()

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Vacuum|    959149427|
|        Oil|    963572869|
|Airpurifier|    952178123|
|     Laptop|    962496295|
|       Rice|    954494237|
|     Tshirt|    936096943|
|      Shoes|    946799102|
|      Mixer|    957140026|
|     Mobile|    944352576|
|     Tablet|    960719999|
|      Sugar|    948205000|
|      Jeans|    951286127|
+-----------+-------------+



In [208]:
revenue_per_product.orderBy(F.col("total_revenue").desc()).show(10)

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|        Oil|    963572869|
|     Laptop|    962496295|
|     Tablet|    960719999|
|     Vacuum|    959149427|
|      Mixer|    957140026|
|       Rice|    954494237|
|Airpurifier|    952178123|
|      Jeans|    951286127|
|      Sugar|    948205000|
|      Shoes|    946799102|
+-----------+-------------+
only showing top 10 rows


In [209]:
avg_order_val_per_city = (
    orders_completed_1
    .groupBy("city")
    .agg(F.avg("amount").alias("avg_order_val"))
)
avg_order_val_per_city.show()

+---------+------------------+
|     city|     avg_order_val|
+---------+------------------+
|Bangalore|44098.867908689645|
|  Chennai| 43628.27900315863|
|   Mumbai| 43723.75651612556|
|  Kolkata|43709.816662630175|
|     Pune|43930.204013556424|
|    Delhi| 43817.20780331374|
|Hyderabad| 43708.74045293664|
+---------+------------------+



 Rank cities by total revenue

In [217]:
from pyspark.sql.window import Window
city_rank_window = Window.orderBy(F.col("total_revenue").desc())

ranked_cities = (
    revenue_per_city
    .withColumn("city_rank", F.rank().over(city_rank_window))
)
ranked_cities.show()

+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|     Pune|   1646196535|        1|
|Hyderabad|   1642443340|        2|
|    Delhi|   1639639916|        3|
|  Chennai|   1629865247|        4|
|Bangalore|   1628527093|        5|
|   Mumbai|   1625518096|        6|
|  Kolkata|   1624300497|        7|
+---------+-------------+---------+



 Rank products within each category by revenue

In [220]:


revenue_per_category_product = (
    orders_completed_1
    .groupBy("category", "product")
    .agg(F.sum("amount").alias("total_revenue"))
    .orderBy(F.col("category"), F.col("total_revenue").desc())
)


w = Window.partitionBy("category").orderBy(F.col("total_revenue").desc())

ranked = (
    revenue_per_category_product
    .withColumn("rank", F.dense_rank().over(w))
    .orderBy("category", "rank", F.col("total_revenue").desc())
)

ranked.show(truncate=False)



+-----------+-----------+-------------+----+
|category   |product    |total_revenue|rank|
+-----------+-----------+-------------+----+
|Electronics|Laptop     |962496295    |1   |
|Electronics|Tablet     |960719999    |2   |
|Electronics|Mobile     |944352576    |3   |
|Fashion    |Jeans      |951286127    |1   |
|Fashion    |Shoes      |946799102    |2   |
|Fashion    |Tshirt     |936096943    |3   |
|Grocery    |Oil        |963572869    |1   |
|Grocery    |Rice       |954494237    |2   |
|Grocery    |Sugar      |948205000    |3   |
|Home       |Vacuum     |959149427    |1   |
|Home       |Mixer      |957140026    |2   |
|Home       |Airpurifier|952178123    |3   |
+-----------+-----------+-------------+----+



In [222]:

top_per_category = (
    ranked.filter(F.col("rank") == 1)
           .drop("rank")
)
top_per_category.show()


+-----------+-------+-------------+
|   category|product|total_revenue|
+-----------+-------+-------------+
|Electronics| Laptop|    962496295|
|    Fashion|  Jeans|    951286127|
|    Grocery|    Oil|    963572869|
|       Home| Vacuum|    959149427|
+-----------+-------+-------------+



In [223]:

w = Window.orderBy(F.col("total_revenue").desc())

top_3_cities = (
    revenue_per_city
    .withColumn("rank", F.row_number().over(w))
    .filter(F.col("rank") <= 3)
    .drop("rank")
)

top_3_cities.show(truncate=False)


+---------+-------------+
|city     |total_revenue|
+---------+-------------+
|Pune     |1646196535   |
|Hyderabad|1642443340   |
|Delhi    |1639639916   |
+---------+-------------+



In [232]:
orders_completed_1.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: int, order_date: date, status: string]

In [233]:

revenue_per_category_product = (
    orders_completed_1
    .groupBy("category", "product")
    .agg(F.sum("amount").alias("total_revenue"))
    .orderBy(F.col("category"), F.col("total_revenue").desc())
)
revenue_per_category_product.show()

+-----------+-----------+-------------+
|   category|    product|total_revenue|
+-----------+-----------+-------------+
|Electronics|     Laptop|    962496295|
|Electronics|     Tablet|    960719999|
|Electronics|     Mobile|    944352576|
|    Fashion|      Jeans|    951286127|
|    Fashion|      Shoes|    946799102|
|    Fashion|     Tshirt|    936096943|
|    Grocery|        Oil|    963572869|
|    Grocery|       Rice|    954494237|
|    Grocery|      Sugar|    948205000|
|       Home|     Vacuum|    959149427|
|       Home|      Mixer|    957140026|
|       Home|Airpurifier|    952178123|
+-----------+-----------+-------------+



In [227]:
orders_completed_1.unpersist()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: int, order_date: date, status: string]

#Explain why over-caching is dangerous
Consumes memory:Cached data hogs RAM and can crash jobs.\
Hurts speed \
Hard cleanup :Forgetting unpersist() leads to memory leaks.

In [211]:
df_completed_clean=orders_completed.dropna(subset=["amount"])

In [212]:
df_completed_clean.count()

261095

In [213]:
df_completed_clean=df_completed_clean.dropna(subset=["order_date"])

In [214]:
df_completed_clean.filter(F.col("order_date").isNull()).count()

0

 Write the cleaned order-level dataset to Parquet
. Partition the Parquet output by city

In [234]:
df_completed_clean.write.mode("overwrite").partitionBy("city").parquet("data/clean_orders")

. Write aggregated analytics to ORC
. Read both formats back and validate schema



In [235]:
revenue_per_category_product.write.mode("overwrite").orc("data/revenue_orders")

In [236]:
df_parquet=spark.read.parquet("data/clean_orders")

In [239]:
df_parquet.show(5)
df_parquet.printSchema()

+-----------+-----------+-----------+-----------+------+----------+---------+---------+
|   order_id|customer_id|   category|    product|amount|order_date|   status|     city|
+-----------+-----------+-----------+-----------+------+----------+---------+---------+
|ORD00000023|    C000023|Electronics|     Mobile| 12000|2024-01-24|Completed|Hyderabad|
|ORD00000043|    C000043|       Home|Airpurifier| 82061|2024-02-13|Completed|Hyderabad|
|ORD00000051|    C000051|    Grocery|        Oil| 29497|2024-02-21|Completed|Hyderabad|
|ORD00000053|    C000053|       Home|Airpurifier| 74634|2024-02-23|Completed|Hyderabad|
|ORD00000056|    C000056|    Grocery|      Sugar| 39461|2024-02-26|Completed|Hyderabad|
+-----------+-----------+-----------+-----------+------+----------+---------+---------+
only showing top 5 rows
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: inte

In [240]:
df_orc=spark.read.orc("data/revenue_orders")

In [241]:
df_orc.show()
df_orc.printSchema()

+-----------+-----------+-------------+
|   category|    product|total_revenue|
+-----------+-----------+-------------+
|Electronics|     Laptop|    962496295|
|Electronics|     Tablet|    960719999|
|Electronics|     Mobile|    944352576|
|    Fashion|      Jeans|    951286127|
|    Fashion|      Shoes|    946799102|
|    Fashion|     Tshirt|    936096943|
|    Grocery|        Oil|    963572869|
|    Grocery|       Rice|    954494237|
|    Grocery|      Sugar|    948205000|
|       Home|     Vacuum|    959149427|
|       Home|      Mixer|    957140026|
|       Home|Airpurifier|    952178123|
+-----------+-----------+-------------+

root
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- total_revenue: long (nullable = true)



NO of files parquet 7 since 7 distinct cities(partition by city) \
orc no of files:1

group by can cause wide shuffle

 Explain why the following line breaks pipelines:
df = df.filter(df.amount > 50000).show()

it breaks since show output is none . it cant be assigned

 Create a scenario that produces a NoneType error

In [215]:
df = df_csv.filter(df_csv.amount > 50000).show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|
|ORD00000009|    C000009|  Kolkata|Electronics|     Laptop| 63715|2024-01-10|Completed|
|ORD00000010|    C000010|Bangalore|    Grocery|      Sugar| 66576|2024-01-11|Completed|
|ORD00000011|    C000011|  Kolkata|Electronics|     Tablet| 50318|2024-01-12|Completed|
|ORD00000012|    C000012|Bangalore|    Grocery|      Sugar| 84768|2024-01-13|Completed|
|ORD00000013|    C000013|     Pune|    Fashion|     Tshirt| 79121|2024-01-14|Completed|
|ORD00000014|    C000014|   Mumbai|Electronics|     Tablet| 79469|2024-01-15|Completed|
|ORD00000015|    C000015|     Pune|Electronics|     Mobile| 81018|2024-01-16|Completed|
|ORD00000016|    C000016|   Mumb

In [216]:
df.show()

AttributeError: 'NoneType' object has no attribute 'show'

# Explain how you would debug a slow Spark job

->check partitioning \
->reading / writting excessive data \
-> over caching
->data skews \
->using spark ui identify where the process is slow

In [244]:
df_completed_clean.count()


258834

In [245]:
df_completed_clean.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)



In [246]:
df_completed_clean.filter(F.col("order_date").isNull()).count()

0

In [247]:
df_completed_clean.filter(F.col("amount").isNull()).count()

0

# Document three optimization decisions you made

->Used df.repartition("city") before groupBy to minimize shuffle and improve parallelism for city-based aggregations \
->Cached the cleaned orders DataFrame since it was reused across multiple transformations, reducing recomputation and improving performance. \
->Applied filters (e.g., selecting completed orders) before expensive operations like aggregations and writes, minimizing the amount of data processed