In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder \
.appName('Customer Purchase Behavior & Loyalty Analysis using PySpark') \
.getOrCreate()

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

#PHASE 1 – Ingestion & Cleaning



1. Read orders.csv as all StringType.


In [3]:
orders_raw = spark.read \
.option("header", "true") \
.option("inferSchema", "false") \
.csv("orders.csv")

2. Trim text columns.


In [4]:
orders_clean_text = orders_raw\
.withColumn("city_clean", trim(col("city")))\
.withColumn("category_clean", trim(col("category")))\
.withColumn("product_clean", trim(col("product")))
orders_clean_text.show()

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|city_clean|category_clean|product_clean|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled| hyderabad|       grocery|          Oil|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|      Pune|       Grocery|        Sugar|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|      Pune|   Electronics|       Mobile|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed| Bangalore|   Electronics|       Laptop|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Com

3. Normalize city, category, product.


In [7]:
orders_clean_text = orders_clean_text \
    .withColumn("city_clean", lower(col("city_clean"))) \
    .withColumn("category_clean", lower(col("category_clean"))) \
    .withColumn("product_clean", lower(col("product_clean")))
orders_clean_text.show()

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|city_clean|category_clean|product_clean|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled| hyderabad|       grocery|          oil|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|      pune|       grocery|        sugar|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|      pune|   electronics|       mobile|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed| bangalore|   electronics|       laptop|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Com

4. Clean amount:
Remove commas
Convert to IntegerType
Handle invalid values safely.


In [6]:
orders_amount_clean = orders_clean_text\
.withColumn("amount_clean", regexp_replace(col("amount"), ",", ""))\
.withColumn("amount_clean",
    when(col("amount_clean").rlike("^[0-9]+$"), col("amount_clean").cast(IntegerType()))\
    .otherwise(None)
)
orders_amount_clean.show(5)

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+------------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|city_clean|category_clean|product_clean|amount_clean|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+------------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled| hyderabad|       grocery|          Oil|        NULL|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|      Pune|       Grocery|        Sugar|       35430|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|      Pune|   Electronics|       Mobile|       65358|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed| Bangalore|   Electronics|       Laptop|      

5. Parse order_date into DateType → order_date_clean .


In [8]:
orders_date_clean = orders_amount_clean\
.withColumn(
    "order_date_clean",
    coalesce(
    try_to_timestamp(col("order_date"), lit("yyyy-MM-dd")).cast(DateType()),
    try_to_timestamp(col("order_date"), lit("dd/MM/yyyy")).cast(DateType()),
    try_to_timestamp(col("order_date"), lit("yyyy/MM/dd")).cast(DateType())
))
orders_date_clean.show(5)

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+------------+----------------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|city_clean|category_clean|product_clean|amount_clean|order_date_clean|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+----------+--------------+-------------+------------+----------------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled| hyderabad|       grocery|          Oil|        NULL|      2024-01-01|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|      Pune|       Grocery|        Sugar|       35430|      2024-01-02|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|      Pune|   Electronics|       Mobile|       65358|      2024-01-03|
|ORD00000003|    C000003|  Bangalo

6. Remove duplicate order_id.


In [11]:
orders_deduped = orders_date_clean.groupBy("order_id").count().filter(col("count") > 1)
orders_deduped.show()

+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



7. Keep only Completed orders.
From this point onward, the dataset is considered clean_orders_df.

In [13]:
clean_orders_df = orders_date_clean.filter(col("status") == "Completed")\
    .dropDuplicates(["order_id"])
clean_orders_df.show(5)

+-----------+-----------+---------+-----------+-------+------+----------+---------+----------+--------------+-------------+------------+----------------+
|   order_id|customer_id|     city|   category|product|amount|order_date|   status|city_clean|category_clean|product_clean|amount_clean|order_date_clean|
+-----------+-----------+---------+-----------+-------+------+----------+---------+----------+--------------+-------------+------------+----------------+
|ORD00000001|    C000001|     Pune|    Grocery|  Sugar| 35430|2024-01-02|Completed|      Pune|       Grocery|        Sugar|       35430|      2024-01-02|
|ORD00000007|    C000007|     Pune|    Grocery|   Rice| 45362|2024-01-08|Completed|      Pune|       Grocery|         Rice|       45362|      2024-01-08|
|ORD00000008|    C000008|Bangalore|    Fashion|  Jeans| 10563|2024-01-09|Completed| Bangalore|       Fashion|        Jeans|       10563|      2024-01-09|
|ORD00000010|    C000010|Bangalore|    Grocery|  Sugar| 66576|2024-01-11|Com

#PHASE 2 – Customer Metrics




1. Total number of orders.


In [14]:
total_orders = clean_orders_df.count()
print(f"Total number of orders: {total_orders}")

Total number of orders: 285000


2. Total spending.


In [15]:
total_spending = clean_orders_df.select(sum(col("amount_clean"))).collect()[0][0]
print(f"Total spending: {total_spending}")

Total spending: 11436490724


3. Average order value.


In [16]:
average_order_value = total_spending / total_orders
print(f"Average order value: {average_order_value:.2f}")

Average order value: 40128.04


4. First purchase date.


In [17]:
first_purchase_date = clean_orders_df.select(min(col("order_date_clean"))).collect()[0][0]
print(f"First purchase date: {first_purchase_date}")

First purchase date: 2024-01-02


5. Last purchase date.


In [18]:
last_purchase_date = clean_orders_df.select(max(col("order_date_clean"))).collect()[0][0]
print(f"Last purchase date: {last_purchase_date}")

Last purchase date: 2024-02-29


6. Number of distinct cities ordered from.


In [19]:
distinct_cities = clean_orders_df.select("city_clean").distinct().count()
print(f"Number of distinct cities ordered from: {distinct_cities}")

Number of distinct cities ordered from: 14


7. Number of distinct categories ordered from.

In [20]:
distinct_categories = clean_orders_df.select("category_clean").distinct().count()
print(f"Number of distinct categories ordered from: {distinct_categories}")

Number of distinct categories ordered from: 8


#PHASE 3 – Customer Segmentation


Create customer segments using business logic:

Total Spend >= 200000 AND Orders >= 5 → "VIP"
Total Spend >= 100000 → "Premium"
Else → "Regular"

Add a column:

customer_segment

Count customers in each segment.

In [21]:
customer_metrics = clean_orders_df.groupBy("customer_id").agg(
    sum("amount_clean").alias("total_spend"),
    count("order_id").alias("total_orders")
)

customer_segments = customer_metrics.withColumn("customer_segment",
    when((col("total_spend") >= 200000) & (col("total_orders") >= 5), "VIP")
    .when(col("total_spend") >= 100000, "Premium")
    .otherwise("Regular")
)

print("Customer segments and their counts:")
customer_segments.groupBy("customer_segment").count().show()

Customer segments and their counts:
+----------------+-----+
|customer_segment|count|
+----------------+-----+
|         Premium|12485|
|         Regular|  623|
|             VIP|34392|
+----------------+-----+



#PHASE 4 – Window Functions



1. Rank customers by total spending (overall).


In [22]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, dense_rank

window_spec_overall = Window.orderBy(col("total_spend").desc())

customer_ranking_overall = customer_metrics.withColumn("overall_rank", dense_rank().over(window_spec_overall))

print("Customers ranked by total spending (overall):")
customer_ranking_overall.orderBy("overall_rank").show(10)

Customers ranked by total spending (overall):
+-----------+-----------+------------+------------+
|customer_id|total_spend|total_orders|overall_rank|
+-----------+-----------+------------+------------+
|    C043076|     493949|           6|           1|
|    C034689|     486879|           6|           2|
|    C039985|     484057|           6|           3|
|    C026691|     477147|           6|           4|
|    C038979|     477138|           6|           5|
|    C020762|     474717|           6|           6|
|    C044654|     471304|           6|           7|
|    C014292|     468617|           6|           8|
|    C019565|     467523|           6|           9|
|    C045487|     467050|           6|          10|
+-----------+-----------+------------+------------+
only showing top 10 rows


2. Rank customers inside each city by total spending.


In [23]:
customer_city_metrics = clean_orders_df.groupBy("customer_id", "city_clean").agg(
    sum("amount_clean").alias("total_spend"),
    count("order_id").alias("total_orders")
)

window_spec_city = Window.partitionBy("city_clean").orderBy(col("total_spend").desc())

customer_ranking_city = customer_city_metrics.withColumn("city_rank", dense_rank().over(window_spec_city))

print("Customers ranked by total spending within each city:")
customer_ranking_city.orderBy("city_clean", "city_rank").show(10)

Customers ranked by total spending within each city:
+-----------+----------+-----------+------------+---------+
|customer_id|city_clean|total_spend|total_orders|city_rank|
+-----------+----------+-----------+------------+---------+
|    C011518| Bangalore|     332527|           5|        1|
|    C024935| Bangalore|     315622|           4|        2|
|    C025451| Bangalore|     303208|           4|        3|
|    C008486| Bangalore|     300843|           5|        4|
|    C039191| Bangalore|     294970|           4|        5|
|    C006114| Bangalore|     290915|           4|        6|
|    C028773| Bangalore|     285105|           4|        7|
|    C045363| Bangalore|     283538|           4|        8|
|    C043646| Bangalore|     272357|           4|        9|
|    C032542| Bangalore|     262148|           4|       10|
+-----------+----------+-----------+------------+---------+
only showing top 10 rows


3. Identify top 3 customers per city.


In [24]:
top_3_customers_per_city = customer_ranking_city.filter(col("city_rank") <= 3)

print("Top 3 customers per city:")
top_3_customers_per_city.orderBy("city_clean", "city_rank").show()

Top 3 customers per city:
+-----------+----------+-----------+------------+---------+
|customer_id|city_clean|total_spend|total_orders|city_rank|
+-----------+----------+-----------+------------+---------+
|    C011518| Bangalore|     332527|           5|        1|
|    C024935| Bangalore|     315622|           4|        2|
|    C025451| Bangalore|     303208|           4|        3|
|    C028121|   Chennai|     340890|           5|        1|
|    C027841|   Chennai|     287392|           5|        2|
|    C030712|   Chennai|     284466|           4|        3|
|    C016309|     Delhi|     325001|           5|        1|
|    C022599|     Delhi|     314625|           4|        2|
|    C018688|     Delhi|     306692|           4|        3|
|    C032833| Hyderabad|     318097|           5|        1|
|    C023269| Hyderabad|     292791|           5|        2|
|    C013263| Hyderabad|     291679|           4|        3|
|    C032246|   Kolkata|     304480|           4|        1|
|    C028450| 

4. Identify top 10 customers across all cities.
This phase must use:

Window.partitionBy()

In [25]:
window_spec_global = Window.partitionBy(lit(1)).orderBy(col("total_spend").desc())

top_10_customers_overall = customer_metrics.withColumn("global_rank", dense_rank().over(window_spec_global))

print("Top 10 customers across all cities:")
top_10_customers_overall.filter(col("global_rank") <= 10).orderBy("global_rank").show()

Top 10 customers across all cities:
+-----------+-----------+------------+-----------+
|customer_id|total_spend|total_orders|global_rank|
+-----------+-----------+------------+-----------+
|    C043076|     493949|           6|          1|
|    C034689|     486879|           6|          2|
|    C039985|     484057|           6|          3|
|    C026691|     477147|           6|          4|
|    C038979|     477138|           6|          5|
|    C020762|     474717|           6|          6|
|    C044654|     471304|           6|          7|
|    C014292|     468617|           6|          8|
|    C019565|     467523|           6|          9|
|    C045487|     467050|           6|         10|
+-----------+-----------+------------+-----------+



#PHASE 5 – Customer Loyalty Analysis
Define loyalty:
A loyal customer is one who:
Has purchases on at least 3 different dates
Has ordered from at least 2 different categories
Tasks:


1. Identify loyal customers.


In [27]:
from pyspark.sql.functions import countDistinct, col

customer_loyalty_metrics = clean_orders_df.groupBy("customer_id").agg(
    countDistinct("order_date_clean").alias("distinct_purchase_dates"),
    countDistinct("category_clean").alias("distinct_categories_ordered")
)

loyal_customers = customer_loyalty_metrics.filter(
    (col("distinct_purchase_dates") >= 3) & (col("distinct_categories_ordered") >= 2)
)

print("Loyal Customers:")
loyal_customers.show(5)
print(f"Total loyal customers: {loyal_customers.count()}")

Loyal Customers:
+-----------+-----------------------+---------------------------+
|customer_id|distinct_purchase_dates|distinct_categories_ordered|
+-----------+-----------------------+---------------------------+
|    C041802|                      3|                          4|
|    C027664|                      3|                          3|
|    C030828|                      3|                          3|
|    C030695|                      3|                          4|
|    C041085|                      3|                          4|
+-----------+-----------------------+---------------------------+
only showing top 5 rows
Total loyal customers: 47450


2. Count loyal customers per city.


In [28]:
customer_city_map = clean_orders_df.select("customer_id", "city_clean").distinct()

loyal_customers_with_city = loyal_customers.join(customer_city_map, "customer_id", "inner")

loyal_customers_per_city = loyal_customers_with_city.groupBy("city_clean").count().alias("loyal_customer_count")

print("Loyal Customers per City:")
loyal_customers_per_city.orderBy(col("count").desc()).show()

Loyal Customers per City:
+----------+-----+
|city_clean|count|
+----------+-----+
| Hyderabad|27683|
|     Delhi|27669|
|      Pune|27617|
|   Chennai|27519|
|   Kolkata|27477|
|    Mumbai|27454|
| Bangalore|27405|
| hyderabad| 2486|
|   chennai| 2415|
|      pune| 2391|
|   kolkata| 2390|
|     delhi| 2379|
|    mumbai| 2348|
| bangalore| 2345|
+----------+-----+



3. Compare loyal vs non-loyal customer revenue contribution.

In [29]:
loyal_customer_ids = loyal_customers.select("customer_id")

revenue_loyal = customer_metrics.join(loyal_customer_ids, "customer_id", "inner")\
    .select(sum("total_spend")).collect()[0][0]

revenue_non_loyal = customer_metrics.join(loyal_customer_ids, "customer_id", "left_anti")\
    .select(sum("total_spend")).collect()[0][0]

print(f"Total revenue from loyal customers: {revenue_loyal}")
print(f"Total revenue from non-loyal customers: {revenue_non_loyal}")
print(f"Difference (Loyal - Non-Loyal): {revenue_loyal - revenue_non_loyal}")

Total revenue from loyal customers: 11423898941
Total revenue from non-loyal customers: 12591783
Difference (Loyal - Non-Loyal): 11411307158


#PHASE 6 – Time-Based Analysis
Using order_date_clean:


1. Compute monthly revenue per city.


In [31]:
monthly_revenue_city = clean_orders_df.filter(col("order_date_clean").isNotNull())\
    .withColumn("city_clean", lower(col("city_clean")))\
    .withColumn("order_month", date_trunc("month", col("order_date_clean")))\
    .groupBy("order_month", "city_clean").agg(sum("amount_clean").alias("monthly_revenue"))

print("Monthly Revenue per City:")
monthly_revenue_city.orderBy("order_month", col("monthly_revenue").desc()).show(10)

Monthly Revenue per City:
+-------------------+----------+---------------+
|        order_month|city_clean|monthly_revenue|
+-------------------+----------+---------------+
|2024-01-01 00:00:00|      pune|      833507124|
|2024-01-01 00:00:00| hyderabad|      833063605|
|2024-01-01 00:00:00|   kolkata|      824920456|
|2024-01-01 00:00:00| bangalore|      822339117|
|2024-01-01 00:00:00|   chennai|      818567389|
|2024-01-01 00:00:00|     delhi|      817332633|
|2024-01-01 00:00:00|    mumbai|      816636150|
|2024-02-01 00:00:00|     delhi|      805877007|
|2024-02-01 00:00:00|      pune|      797779557|
|2024-02-01 00:00:00|   chennai|      796361427|
+-------------------+----------+---------------+
only showing top 10 rows


2. Compute monthly order count per category.


In [33]:
monthly_order_count_category = clean_orders_df.filter(col("order_date_clean").isNotNull())\
    .withColumn("order_month", date_trunc("month", col("order_date_clean")))\
    .withColumn("category_clean", lower(col("category_clean")))\
    .groupBy("order_month", "category_clean").agg(count("order_id").alias("monthly_order_count"))

print("Monthly Order Count per Category:")
monthly_order_count_category.orderBy("order_month", col("monthly_order_count").desc()).show(10)

Monthly Order Count per Category:
+-------------------+--------------+-------------------+
|        order_month|category_clean|monthly_order_count|
+-------------------+--------------+-------------------+
|2024-01-01 00:00:00|          home|              36163|
|2024-01-01 00:00:00|       grocery|              36018|
|2024-01-01 00:00:00|   electronics|              35994|
|2024-01-01 00:00:00|       fashion|              35571|
|2024-02-01 00:00:00|   electronics|              34766|
|2024-02-01 00:00:00|       fashion|              34720|
|2024-02-01 00:00:00|       grocery|              34672|
|2024-02-01 00:00:00|          home|              34631|
+-------------------+--------------+-------------------+



3. Identify growth or decline trends.

In [34]:
window_spec_month_city = Window.partitionBy("city_clean").orderBy("order_month")

revenue_trends = monthly_revenue_city.withColumn("previous_month_revenue", lag("monthly_revenue", 1).over(window_spec_month_city))

revenue_trends = revenue_trends.withColumn("revenue_change_percent",
    when(col("previous_month_revenue").isNotNull(),
        (col("monthly_revenue") - col("previous_month_revenue")) / col("previous_month_revenue") * 100
    ).otherwise(None)
)

print("Monthly Revenue Growth/Decline Trends:")
revenue_trends.orderBy("city_clean", "order_month").show()

window_spec_month_category = Window.partitionBy("category_clean").orderBy("order_month")

order_count_trends = monthly_order_count_category.withColumn("previous_month_orders", lag("monthly_order_count", 1).over(window_spec_month_category))

order_count_trends = order_count_trends.withColumn("order_count_change_percent",
    when(col("previous_month_orders").isNotNull(),
        (col("monthly_order_count") - col("previous_month_orders")) / col("previous_month_orders") * 100
    ).otherwise(None)
)

print("\nMonthly Order Count Growth/Decline Trends:")
order_count_trends.orderBy("category_clean", "order_month").show()

Monthly Revenue Growth/Decline Trends:
+-------------------+----------+---------------+----------------------+----------------------+
|        order_month|city_clean|monthly_revenue|previous_month_revenue|revenue_change_percent|
+-------------------+----------+---------------+----------------------+----------------------+
|2024-01-01 00:00:00| bangalore|      822339117|                  NULL|                  NULL|
|2024-02-01 00:00:00| bangalore|      792163305|             822339117|   -3.6695094975033276|
|2024-01-01 00:00:00|   chennai|      818567389|                  NULL|                  NULL|
|2024-02-01 00:00:00|   chennai|      796361427|             818567389|   -2.7127836142028374|
|2024-01-01 00:00:00|     delhi|      817332633|                  NULL|                  NULL|
|2024-02-01 00:00:00|     delhi|      805877007|             817332633|   -1.4015867637576633|
|2024-01-01 00:00:00| hyderabad|      833063605|                  NULL|                  NULL|
|2024-02-01

#PHASE 7 – Performance Engineering

1. Identify which DataFrames are reused.


In [35]:
print("Reused DataFrames:")
print("- `clean_orders_df`: Used for various customer metrics, customer segmentation, loyalty analysis, and time-based analysis.")
print("- `customer_metrics`: Used for customer segmentation, overall customer ranking, and loyal vs. non-loyal customer revenue comparison.")


Reused DataFrames:
- `clean_orders_df`: Used for various customer metrics, customer segmentation, loyalty analysis, and time-based analysis.
- `customer_metrics`: Used for customer segmentation, overall customer ranking, and loyal vs. non-loyal customer revenue comparison.


2. Apply caching.


In [36]:
clean_orders_df.cache()
customer_metrics.cache()
print("DataFrames `clean_orders_df` and `customer_metrics` have been cached.")

DataFrames `clean_orders_df` and `customer_metrics` have been cached.


3. Use explain(True) on:
Customer aggregation
Window ranking


In [37]:
print("Execution plan for Customer Aggregation (customer_metrics):")
customer_metrics.explain(True)

print("\nExecution plan for Overall Window Ranking (customer_ranking_overall):")
customer_ranking_overall.explain(True)

print("\nExecution plan for City-wise Window Ranking (customer_ranking_city):")
customer_ranking_city.explain(True)

Execution plan for Customer Aggregation (customer_metrics):
== Parsed Logical Plan ==
'Aggregate ['customer_id], ['customer_id, 'sum('amount_clean) AS total_spend#699, 'count('order_id) AS total_orders#700]
+- Deduplicate [order_id#17]
   +- Filter (status#24 = Completed)
      +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_clean#26, category_clean#27, product_clean#28, amount_clean#72, coalesce(cast(try_to_timestamp(order_date#23, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#23, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#23, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false) as date)) AS order_date_clean#163]
         +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_clean#26, category_clean#27, product_clean#28, CASE WHEN RLIKE(amount_clea

4. Identify shuffle stages.


In [38]:
print("Shuffle stages (indicated by 'Exchange hashpartitioning' or 'Exchange SinglePartition'):")
print("1. Customer Aggregation (`customer_metrics`):")
print("   - `Exchange hashpartitioning(customer_id#18, 200)`: This shuffle happens before the final hash aggregation to ensure all data for a given customer_id is on the same partition, allowing `sum` and `count` to be computed correctly.")
print("   - `Exchange hashpartitioning(order_id#17, 200)`: This shuffle happens during the deduplication process within `clean_orders_df` creation to group records by order_id.")

print("\n2. Overall Window Ranking (`customer_ranking_overall`):")
print("   - `Exchange SinglePartition`: This shuffle occurs before the window function application when ordering globally. All data is collected into a single partition to allow for a global ranking based on `total_spend`. This can be a bottleneck for very large datasets.")

print("\n3. City-wise Window Ranking (`customer_ranking_city`):")
print("   - `Exchange hashpartitioning(city_clean#26, 200)`: This shuffle occurs before the window function application to group data by `city_clean`. This ensures that the ranking for each city happens independently within its own partition.")
print("   - `Exchange hashpartitioning(customer_id#18, city_clean#26, 200)`: This shuffle happens during the aggregation for `customer_city_metrics` to group data by both `customer_id` and `city_clean`.")
print("   - Additional `Exchange hashpartitioning(order_id#17, 200)` stages originating from `clean_orders_df` as it's reused.")


Shuffle stages (indicated by 'Exchange hashpartitioning' or 'Exchange SinglePartition'):
1. Customer Aggregation (`customer_metrics`):
   - `Exchange hashpartitioning(customer_id#18, 200)`: This shuffle happens before the final hash aggregation to ensure all data for a given customer_id is on the same partition, allowing `sum` and `count` to be computed correctly.
   - `Exchange hashpartitioning(order_id#17, 200)`: This shuffle happens during the deduplication process within `clean_orders_df` creation to group records by order_id.

2. Overall Window Ranking (`customer_ranking_overall`):
   - `Exchange SinglePartition`: This shuffle occurs before the window function application when ordering globally. All data is collected into a single partition to allow for a global ranking based on `total_spend`. This can be a bottleneck for very large datasets.

3. City-wise Window Ranking (`customer_ranking_city`):
   - `Exchange hashpartitioning(city_clean#26, 200)`: This shuffle occurs before the

5. Justify any repartitioning strategy.

In [39]:
print("Justification of Repartitioning Strategy:")
print("Spark performs repartitioning (shuffles) for several key reasons, primarily to ensure data correctness and enable parallel processing for certain operations. The identified shuffle stages are necessary as follows:")
print("\n1. **Hash Partitioning (e.g., `Exchange hashpartitioning(customer_id, 200)`):**")
print("   - **Purpose**: Used for operations that require grouping data by one or more keys, such as aggregations (sum, count) and window functions partitioned by specific columns. By hashing the key(s), Spark ensures that all data points belonging to a particular key are co-located on the same partition. This is fundamental for correctly computing results for each group.")
print("   - **Examples in our pipeline**:")
print("     - Aggregation for `customer_metrics` (`hashpartitioning(customer_id)`): Ensures all orders for a given customer are on the same partition to accurately sum their spending and count their orders.")
print("     - Deduplication within `clean_orders_df` (`hashpartitioning(order_id)`): Groups identical `order_id`s together to facilitate efficient removal of duplicates.")
print("     - City-wise window ranking (`hashpartitioning(city_clean)` and `hashpartitioning(customer_id, city_clean)`): Ensures that all data for a specific city, or customer within a city, is processed together, allowing for correct city-specific aggregations and rankings.")

print("\n2. **Single Partition (e.g., `Exchange SinglePartition`):**")
print("   - **Purpose**: This strategy is employed when an operation requires a global ordering or processing of the entire dataset as a single unit, such as an overall ranking (`dense_rank()` across all customers). All data is collected into one partition to establish a single, coherent order.")
print("   - **Example in our pipeline**:")
print("     - Overall Window Ranking (`customer_ranking_overall`): To rank customers by `total_spend` globally, all data must be sorted and processed together. While ensuring correctness, this can be a performance bottleneck for very large datasets as it limits parallelism to a single executor.")

print("\nIn summary, shuffles are a trade-off between network I/O and disk I/O (for data movement) and computational correctness. While they can be expensive, they are often indispensable for complex data transformations and aggregations in distributed systems like Spark.")

Justification of Repartitioning Strategy:
Spark performs repartitioning (shuffles) for several key reasons, primarily to ensure data correctness and enable parallel processing for certain operations. The identified shuffle stages are necessary as follows:

1. **Hash Partitioning (e.g., `Exchange hashpartitioning(customer_id, 200)`):**
   - **Purpose**: Used for operations that require grouping data by one or more keys, such as aggregations (sum, count) and window functions partitioned by specific columns. By hashing the key(s), Spark ensures that all data points belonging to a particular key are co-located on the same partition. This is fundamental for correctly computing results for each group.
   - **Examples in our pipeline**:
     - Aggregation for `customer_metrics` (`hashpartitioning(customer_id)`): Ensures all orders for a given customer are on the same partition to accurately sum their spending and count their orders.
     - Deduplication within `clean_orders_df` (`hashpartitio

#PHASE 8 – Broadcast Join (Light Use)


Create a small lookup:

segment_code,segment_label
1,VIP
2,Premium
3,Regular

Map:

VIP → 1
Premium → 2
Regular → 3



1. Create this as a small DataFrame.


In [40]:
segment_data = [
    (1, "VIP"),
    (2, "Premium"),
    (3, "Regular")
]

segment_lookup_df = spark.createDataFrame(segment_data, ["segment_code", "segment_label"])
segment_lookup_df.show()

+------------+-------------+
|segment_code|segment_label|
+------------+-------------+
|           1|          VIP|
|           2|      Premium|
|           3|      Regular|
+------------+-------------+



2. Join with customer segmentation output.


In [41]:
from pyspark.sql.functions import broadcast

customer_segments_with_lookup = customer_segments.join(
    broadcast(segment_lookup_df),
    customer_segments["customer_segment"] == segment_lookup_df["segment_label"],
    "inner"
)

print("Customer segments joined with lookup:")
customer_segments_with_lookup.show(5)

Customer segments joined with lookup:
+-----------+-----------+------------+----------------+------------+-------------+
|customer_id|total_spend|total_orders|customer_segment|segment_code|segment_label|
+-----------+-----------+------------+----------------+------------+-------------+
|    C007013|     241427|           6|             VIP|           1|          VIP|
|    C016502|     318813|           6|             VIP|           1|          VIP|
|    C030046|     276423|           6|             VIP|           1|          VIP|
|    C036809|     284063|           6|             VIP|           1|          VIP|
|    C022166|     266454|           6|             VIP|           1|          VIP|
+-----------+-----------+------------+----------------+------------+-------------+
only showing top 5 rows


3. Force broadcast join.


In [42]:
customer_segments_with_lookup_forced_broadcast = customer_segments.join(
    broadcast(segment_lookup_df),
    customer_segments["customer_segment"] == segment_lookup_df["segment_label"],
    "inner"
)
print("Broadcast join applied successfully (if segment_lookup_df is small enough).")

Broadcast join applied successfully (if segment_lookup_df is small enough).


4. Verify BroadcastHashJoin in plan.

In [43]:
print("Execution plan for Broadcast Join verification:")
customer_segments_with_lookup_forced_broadcast.explain(True)

Execution plan for Broadcast Join verification:
== Parsed Logical Plan ==
Join Inner, (customer_segment#716 = segment_label#3143)
:- Project [customer_id#18, total_spend#699L, total_orders#700L, CASE WHEN ((total_spend#699L >= cast(200000 as bigint)) AND (total_orders#700L >= cast(5 as bigint))) THEN VIP WHEN (total_spend#699L >= cast(100000 as bigint)) THEN Premium ELSE Regular END AS customer_segment#716]
:  +- Aggregate [customer_id#18], [customer_id#18, sum(amount_clean#72) AS total_spend#699L, count(order_id#17) AS total_orders#700L]
:     +- Deduplicate [order_id#17]
:        +- Filter (status#24 = Completed)
:           +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_clean#26, category_clean#27, product_clean#28, amount_clean#72, coalesce(cast(try_to_timestamp(order_date#23, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false) as date), cast(try_to_timestamp(order_date#23, Some(dd/MM/yyyy), TimestampType, Som

#PHASE 9 – Sorting & Set Operations


1. Sort customers by:

Total spend descending
Order count descending


In [44]:
customer_metrics_sorted = customer_metrics.orderBy(col("total_spend").desc(), col("total_orders").desc())

print("Customers sorted by total spend (descending) and order count (descending):")
customer_metrics_sorted.show(10)

Customers sorted by total spend (descending) and order count (descending):
+-----------+-----------+------------+
|customer_id|total_spend|total_orders|
+-----------+-----------+------------+
|    C043076|     493949|           6|
|    C034689|     486879|           6|
|    C039985|     484057|           6|
|    C026691|     477147|           6|
|    C038979|     477138|           6|
|    C020762|     474717|           6|
|    C044654|     471304|           6|
|    C014292|     468617|           6|
|    C019565|     467523|           6|
|    C045487|     467050|           6|
+-----------+-----------+------------+
only showing top 10 rows


2. Create two sets:
Customers who bought Electronics
Customers who bought Grocery


In [45]:
customers_electronics = clean_orders_df.filter(col("category_clean") == "electronics").select("customer_id").distinct()
customers_grocery = clean_orders_df.filter(col("category_clean") == "grocery").select("customer_id").distinct()

print("Customers who bought Electronics (first 5):")
customers_electronics.show(5)
print("Customers who bought Grocery (first 5):")
customers_grocery.show(5)


Customers who bought Electronics (first 5):
+-----------+
|customer_id|
+-----------+
|    C006671|
|    C010704|
|    C003239|
|    C015794|
|    C024319|
+-----------+
only showing top 5 rows
Customers who bought Grocery (first 5):
+-----------+
|customer_id|
+-----------+
|    C042451|
|    C042423|
|    C020869|
|    C044407|
|    C007669|
+-----------+
only showing top 5 rows


3. Find:
Customers in both sets
Customers in only one set

In [46]:
customers_both_sets = customers_electronics.intersect(customers_grocery)
print("Customers who bought both Electronics and Grocery (first 5):")
customers_both_sets.show(5)
print(f"Total customers who bought both: {customers_both_sets.count()}")

customers_only_electronics = customers_electronics.exceptAll(customers_grocery)
customers_only_grocery = customers_grocery.exceptAll(customers_electronics)

customers_only_one_set = customers_only_electronics.union(customers_only_grocery)
print("\nCustomers who bought only one of Electronics or Grocery (first 5):")
customers_only_one_set.show(5)
print(f"Total customers who bought only one set: {customers_only_one_set.count()}")

Customers who bought both Electronics and Grocery (first 5):
+-----------+
|customer_id|
+-----------+
+-----------+

Total customers who bought both: 0

Customers who bought only one of Electronics or Grocery (first 5):
+-----------+
|customer_id|
+-----------+
|    C047101|
|    C031251|
|    C003053|
|    C026830|
|    C035805|
+-----------+
only showing top 5 rows
Total customers who bought only one set: 4537


#PHASE 10 – Storage Strategy


1. Write customer master dataset to:

Parquet

Partitioned by:

customer_segment



In [47]:
customer_master_df = customer_segments.alias("customer_master")

customer_master_df.write.mode("overwrite").partitionBy("customer_segment").parquet("customer_master.parquet")

print("Customer master dataset written to Parquet, partitioned by customer_segment.")

Customer master dataset written to Parquet, partitioned by customer_segment.


2. Write monthly analytics to:

ORC



In [48]:
monthly_revenue_city.write.mode("overwrite").orc("monthly_revenue_city.orc")
print("Monthly revenue per city written to ORC.")

monthly_order_count_category.write.mode("overwrite").orc("monthly_order_count_category.orc")
print("Monthly order count per category written to ORC.")

Monthly revenue per city written to ORC.
Monthly order count per category written to ORC.


3. Read back and validate.

In [49]:
customer_master_read = spark.read.parquet("customer_master.parquet")
print("Customer Master (read from Parquet):")
customer_master_read.show(5)

monthly_revenue_city_read = spark.read.orc("monthly_revenue_city.orc")
print("Monthly Revenue per City (read from ORC):")
monthly_revenue_city_read.show(5)

monthly_order_count_category_read = spark.read.orc("monthly_order_count_category.orc")
print("Monthly Order Count per Category (read from ORC):")
monthly_order_count_category_read.show(5)

Customer Master (read from Parquet):
+-----------+-----------+------------+----------------+
|customer_id|total_spend|total_orders|customer_segment|
+-----------+-----------+------------+----------------+
|    C017846|     259374|           6|             VIP|
|    C031570|     265242|           6|             VIP|
|    C008283|     329252|           6|             VIP|
|    C035348|     258062|           6|             VIP|
|    C042917|     291846|           6|             VIP|
+-----------+-----------+------------+----------------+
only showing top 5 rows
Monthly Revenue per City (read from ORC):
+-------------------+----------+---------------+
|        order_month|city_clean|monthly_revenue|
+-------------------+----------+---------------+
|2024-02-01 00:00:00| bangalore|      792163305|
|2024-01-01 00:00:00| bangalore|      822339117|
|2024-01-01 00:00:00| hyderabad|      833063605|
|2024-01-01 00:00:00|   kolkata|      824920456|
|2024-01-01 00:00:00|    mumbai|      816636150|
+

#PHASE 11 – Debugging


Explain why this is dangerous:

df = df.groupBy("customer_id").sum("amount").show()

Explain:
What df becomes
Why pipeline breaks
Correct approach

In [None]:
# Explain why this is dangerous:
# df = df.groupBy("customer_id").sum("amount").show()

# What df becomes:
# The `.show()` method is an action in Spark DataFrames. It triggers the computation and prints the results to the console. However, it returns `None`. Therefore, `df` would become `None`.

# Why pipeline breaks:
# If `df` becomes `None`, any subsequent operations or transformations attempted on `df` will raise an `AttributeError` (e.g., 'NoneType' object has no attribute 'withColumn') because `df` is no longer a DataFrame object.

# Correct approach:
# The action (`.show()`) should not be chained with an assignment back to the DataFrame variable if you intend to continue using the DataFrame. Instead, separate the action:
#
# # Option 1: Assign the result of transformations to a new DataFrame variable
# aggregated_df = df.groupBy("customer_id").sum("amount")
# aggregated_df.show()
#
# # Option 2: Perform the action as a separate step if you intend to reuse the original DataFrame reference (not recommended if the aggregation is the desired next step in the pipeline)
# df_aggregated = df.groupBy("customer_id").sum("amount")
# df_aggregated.show()