# DATASET 1 — SALES TRANSACTIONS

In [3]:
sales_data = [
    ("TXN001","Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("TXN002","Mumbai","Mobile ","electronics","32000","05/01/2024","Completed"),
    ("TXN003","Bangalore","Tablet"," Electronics ","30000","2024/01/06","Completed"),
    ("TXN004","Delhi","Laptop","Electronics","","2024-01-07","Cancelled"),
    ("TXN005","Chennai","Mobile","Electronics","invalid","2024-01-08","Completed"),
    ("TXN006","Mumbai","Tablet","Electronics",None,"2024-01-08","Completed"),
    ("TXN007","Delhi","Laptop","electronics","45000","09-01-2024","Completed"),
    ("TXN008","Bangalore","Mobile","Electronics","28000","2024-01-09","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed")
]

# DATASET 2 — CUSTOMER MASTER

In [1]:
customer_data = [
("C001","Delhi","Premium"),
("C002","Mumbai","Standard"),
("C003","Bangalore","Premium"),
("C004","Chennai","Standard"),
("C005","Mumbai","Premium")
]

# DATASET 3 — CITY CLASSIFICATION

In [7]:
city_data = [
("Delhi","Tier-1"),
("Mumbai","Tier-1"),
("Bangalore","Tier-1"),
("Chennai","Tier-2")
]

# PHASE 1 — DATA INGESTION & SCHEMA MANAGEMENT

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark=SparkSession.builder.appName("capstone").getOrCreate()

In [5]:
sales_schema=StructType([
    StructField("txn_id",StringType(),True),
    StructField("city",StringType(),True),
    StructField("product",StringType(),True),
    StructField("category",StringType(),True),
    StructField("amount",StringType(),True),
    StructField("txn_date",StringType(),True),
    StructField("status",StringType(),True)
])


customer_schema=StructType([
    StructField("customer_id",StringType(),True),
    StructField("city",StringType(),True),
    StructField("segment",StringType(),True)
])


city_schema=StructType([
    StructField("city",StringType(),True),
    StructField("tier",StringType(),True)
])



In [8]:
sales_df=spark.createDataFrame(data=sales_data,schema=sales_schema)
customer_df=spark.createDataFrame(data=customer_data,schema=customer_schema)
city_df=spark.createDataFrame(data=city_data,schema=city_schema)


In [9]:
sales_df.filter(
    col("amount").isNull() | ~col("amount").rlike(r'^[0-9]+$')
).show()

+------+-------+-------+-----------+-------+----------+---------+
|txn_id|   city|product|   category| amount|  txn_date|   status|
+------+-------+-------+-----------+-------+----------+---------+
|TXN004|  Delhi| Laptop|Electronics|       |2024-01-07|Cancelled|
|TXN005|Chennai| Mobile|Electronics|invalid|2024-01-08|Completed|
|TXN006| Mumbai| Tablet|Electronics|   NULL|2024-01-08|Completed|
+------+-------+-------+-----------+-------+----------+---------+



# PHASE 2 — DATA CLEANING & TRANSFORMATION

In [10]:
clean_df=sales_df\
.withColumn("city",trim(col("city")))\
.withColumn("product",trim(col("product")))\
.withColumn("category",trim(col("category")))\
.withColumn("status",trim(col("status")))


In [11]:
clean_df=clean_df.withColumn(
    "amount_int",
    when(col("amount").rlike("^[0-9]+$"),col("amount").cast(IntegerType()))
)

In [13]:
clean_df=clean_df.filter(col("amount_int").isNotNull())

In [21]:
clean_df=clean_df.withColumn("txn_date_parsed",coalesce(
                             try_to_timestamp(col("txn_date"), lit("yyyy-MM-dd")),
                             try_to_timestamp(col("txn_date"), lit("dd-MM-yyyy")),
                             try_to_timestamp(col("txn_date"), lit("MM-dd-yyyy")),
                             try_to_timestamp(col("txn_date"), lit("dd/MM/yyyy")),
                             try_to_timestamp(col("txn_date"), lit("MM/dd/yyyy")),
                             try_to_timestamp(col("txn_date"), lit("yyyy/MM/dd"))
                                                          ).cast(DateType()))

In [16]:
clean_df=clean_df.filter(col("status").like("Comple%"))

In [22]:
clean_df.show()

+------+---------+-------+-----------+------+----------+---------+----------+---------------+
|txn_id|     city|product|   category|amount|  txn_date|   status|amount_int|txn_date_parsed|
+------+---------+-------+-----------+------+----------+---------+----------+---------------+
|TXN001|    Delhi| Laptop|Electronics| 45000|2024-01-05|Completed|     45000|     2024-01-05|
|TXN002|   Mumbai| Mobile|electronics| 32000|05/01/2024|Completed|     32000|     2024-01-05|
|TXN003|Bangalore| Tablet|Electronics| 30000|2024/01/06|Completed|     30000|     2024-01-06|
|TXN007|    Delhi| Laptop|electronics| 45000|09-01-2024|Completed|     45000|     2024-01-09|
|TXN008|Bangalore| Mobile|Electronics| 28000|2024-01-09|Completed|     28000|     2024-01-09|
|TXN009|   Mumbai| Laptop|Electronics| 55000|2024-01-10|Completed|     55000|     2024-01-10|
|TXN009|   Mumbai| Laptop|Electronics| 55000|2024-01-10|Completed|     55000|     2024-01-10|
+------+---------+-------+-----------+------+----------+----

# PHASE 3 — DATA ENRICHMENT & JOINS

In [23]:
enriched_df=clean_df.join(broadcast(city_df),on="city",how="left")

In [24]:
enriched_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Project [txn_id#10, city#44, product#45, category#46, amount#14, txn_date#15, status#47, amount_int#48, cast(coalesce(try_to_timestamp(txn_date#15, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#15, Some(dd-MM-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#15, Some(MM-dd-yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#15, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#15, Some(MM/dd/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(txn_date#15, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) as date) AS txn_date_parsed#108]
:  +- Filter status#47 LIKE Comple%
:     +- Project [txn_id#10, city#44, product#45, category#46, amount#14, txn_date#15, status#47, amount_int#48, coalesce(to_date(txn_date#15, Some(yyyy-MM-dd), Some(Etc/UTC), true), to_date(txn_date#15, Some(dd-MM-yyyy), Some(Etc/UT

In [25]:
enriched_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- txn_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- txn_date_parsed: date (nullable = true)
 |-- tier: string (nullable = true)



# PHASE 4 — ANALYTICS & WINDOW FUNCTIONS

In [26]:
revenue_city=enriched_df\
.groupBy("city")\
.agg(sum("amount_int").alias("total_revenue"))

In [27]:
revenue_product=enriched_df\
.groupBy("product")\
.agg(sum("amount_int").alias("total_revenue"))

In [28]:
city_rank_window=Window.orderBy(desc("total_revenue"))
revenue_city=revenue_city.withColumn("city_rank",rank().over(city_rank_window))

In [29]:
product_city_window=Window.partitionBy("city").orderBy(desc("amount_int"))
ranked_products=enriched_df.withColumn(
    "product_rank",
    rank().over(product_city_window)
)

In [31]:
daily_city_window=Window.partitionBy("txn_date_parsed")\
.orderBy(desc("amount_int"))

top_city_daily=enriched_df.withColumn(
    "rank",
    rank().over(daily_city_window)
).filter(col("rank")==1)

# PHASE 5 — CACHING, PARTITIONS & OPTIMIZATION

In [32]:
enriched_df.cache()
enriched_df.count()

7

In [34]:
partitioned_df=enriched_df.repartition("city")

# PHASE 6 — FILE FORMAT STRATEGY

In [35]:
partitioned_df.write\
.mode("overwrite")\
.partitionBy("city")\
.parquet("/data/clean_sales_parquet")

In [37]:
revenue_city.write\
.mode("overwrite")\
.orc("/data/agg_revenue_orc")

29. In this notebook, Avro was not used because Parquet and ORC formats were chosen for data storage. These are both columnar storage formats, which are highly optimized for analytical queries (like the aggregations and window functions performed here). They offer excellent compression, predicate pushdown, and efficient read performance for analytical workloads.

While Avro is also a robust data serialization format, often favored for schema evolution and row-oriented processing (e.g., streaming data or messaging systems), Parquet and ORC are generally preferred for batch analytics and data warehousing scenarios due to their columnar nature. For the tasks demonstrated in this notebook, the benefits offered by Parquet and ORC align well with the analytical requirements.

# PHASE 7 — DEBUGGING & ERROR HANDLING

In [39]:
sales_df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [txn_id#10, city#11, product#12, category#13, amount#14, txn_date#15, status#16], false

== Analyzed Logical Plan ==
txn_id: string, city: string, product: string, category: string, amount: string, txn_date: string, status: string
LogicalRDD [txn_id#10, city#11, product#12, category#13, amount#14, txn_date#15, status#16], false

== Optimized Logical Plan ==
LogicalRDD [txn_id#10, city#11, product#12, category#13, amount#14, txn_date#15, status#16], false

== Physical Plan ==
*(1) Scan ExistingRDD[txn_id#10,city#11,product#12,category#13,amount#14,txn_date#15,status#16]



In [40]:
customer_df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [customer_id#17, city#18, segment#19], false

== Analyzed Logical Plan ==
customer_id: string, city: string, segment: string
LogicalRDD [customer_id#17, city#18, segment#19], false

== Optimized Logical Plan ==
LogicalRDD [customer_id#17, city#18, segment#19], false

== Physical Plan ==
*(1) Scan ExistingRDD[customer_id#17,city#18,segment#19]



In [41]:
city_df.explain(True)

== Parsed Logical Plan ==
LogicalRDD [city#20, tier#21], false

== Analyzed Logical Plan ==
city: string, tier: string
LogicalRDD [city#20, tier#21], false

== Optimized Logical Plan ==
LogicalRDD [city#20, tier#21], false

== Physical Plan ==
*(1) Scan ExistingRDD[city#20,tier#21]



# PHASE 8 — FINAL VALIDATION & DELIVERABLES

In [45]:
clean_df.count()
enriched_df.count()

7

In [46]:
enriched_df.select([
    count(when(col(c).isNull(),c)).alias(c)
    for c in ["txn_id","city","amount_int","txn_date_parsed"]
]).show()


+------+----+----------+---------------+
|txn_id|city|amount_int|txn_date_parsed|
+------+----+----------+---------------+
|     0|   0|         0|              0|
+------+----+----------+---------------+



In [47]:
enriched_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- txn_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- amount_int: integer (nullable = true)
 |-- txn_date_parsed: date (nullable = true)
 |-- tier: string (nullable = true)

