PHASE 1 : Ingestion & Cleaning
Use the same cleaning logic as Case Study 1:
1. Read orders.csv as all StringType.
2. Trim text columns.
3. Normalize city, category, product.
4. Clean amount:
Remove commas
Convert to IntegerType
Handle invalid values safely.
5. Parse order_date into DateType → order_date_clean.
6. Remove duplicate order_id.
7. Keep only Completed orders.


In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [6]:
# Load the raw orders file. Schema inference is off, so every column is read
# as a string and typed explicitly in later cells.
orders_csv = spark.read.csv("orders.csv", header=True, inferSchema=False)
orders_csv.show(5)

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
only showing top 5 rows


In [5]:
# Inspect the column types and the number of rows that were loaded.
orders_csv.printSchema()
orders_csv.count()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



300000

In [7]:
# Trim every string-typed column. The brief asks for all text columns to be
# trimmed, and stray whitespace in `status`, `amount` or `order_date` would
# otherwise make the later equality filter, digit check and date parsing
# reject good rows. Selecting by dtype keeps the cell safe to re-run after
# `amount` has been cast to an integer.
string_columns = [name for name, dtype in orders_csv.dtypes if dtype == "string"]
for column_name in string_columns:
    orders_csv = orders_csv.withColumn(column_name, F.trim(F.col(column_name)))

# Normalise the casing of the descriptive columns, e.g. "hyderabad" -> "Hyderabad".
for column_name in ("city", "product", "category"):
    orders_csv = orders_csv.withColumn(column_name, F.initcap(F.col(column_name)))

In [8]:
# Remove comma separators from the amount text, e.g. "1,234" -> "1234".
amount_without_commas = F.regexp_replace(F.col("amount"), ",", "")
orders_csv = orders_csv.withColumn("amount", amount_without_commas)

In [9]:
# Keep only all-digit amounts and cast them to integers; anything else
# (such as the literal "invalid") becomes NULL.
amount_text = F.col("amount")
amount_as_int = F.when(
    amount_text.rlike("^[0-9]+$"),
    amount_text.cast("int"),
).otherwise(None)
orders_csv = orders_csv.withColumn("amount", amount_as_int)

# Number of rows that ended up with a usable amount.
orders_csv.where(F.col("amount").isNotNull()).count()

274836

In [12]:
# The multi-format parse below relies on to_date() returning NULL when a
# pattern does not match, so that coalesce() can fall through to the next
# pattern. Under ANSI mode a mismatch raises instead, so ANSI mode is switched
# off here, before the expressions are built, rather than in a later cell.
# This keeps the notebook working when it is run top to bottom.
spark.conf.set("spark.sql.ansi.enabled", "false")

# Patterns are tried in order; the first one that parses wins.
date_patterns = ["yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd"]
orders_csv = orders_csv.withColumn(
    "order_date_clean",
    F.coalesce(*[F.to_date("order_date", pattern) for pattern in date_patterns]),
)


In [11]:
# Disable ANSI SQL mode for this session.
# NOTE(review): this cell was executed before the order_date_clean cell
# (In [11] vs In [12]) but sits after it in the notebook - confirm the
# intended order for a top-to-bottom run.
spark.conf.set("spark.sql.ansi.enabled", "false")

In [14]:
# Preview the cleaned rows and confirm the column types after cleaning.
orders_csv.show(3)
orders_csv.printSchema()

+-----------+-----------+---------+-----------+-------+------+----------+---------+----------------+
|   order_id|customer_id|     city|   category|product|amount|order_date|   status|order_date_clean|
+-----------+-----------+---------+-----------+-------+------+----------+---------+----------------+
|ORD00000000|    C000000|Hyderabad|    Grocery|    Oil|  NULL|01/01/2024|Cancelled|      2024-01-01|
|ORD00000001|    C000001|     Pune|    Grocery|  Sugar| 35430|2024-01-02|Completed|      2024-01-02|
|ORD00000002|    C000002|     Pune|Electronics| Mobile| 65358|2024-01-03|Completed|      2024-01-03|
+-----------+-----------+---------+-----------+-------+------+----------+---------+----------------+
only showing top 3 rows
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: string (nu

In [15]:
# Keep only completed orders, then enforce one row per order_id (the brief's
# "remove duplicate order_id" step, which no other cell applies).
# Filtering first means a Completed row is never discarded in favour of a
# non-completed duplicate of the same order_id.
orders_completed = (
    orders_csv
    .filter(F.col("status") == "Completed")
    .dropDuplicates(["order_id"])
)
orders_completed.count()

285000

In [16]:
# Discard completed orders whose amount is NULL (it failed the digit check).
clean_orders_df = orders_completed.na.drop(subset=["amount"])
clean_orders_df.count()

261095

In [18]:

# Number of cleaned orders placed by each customer.
order_count = F.count("order_id").alias("total_orders")
orders_per_customer_df = clean_orders_df.groupBy("customer_id").agg(order_count)
orders_per_customer_df.show()

+-----------+------------+
|customer_id|total_orders|
+-----------+------------+
|    C000111|           5|
|    C000142|           6|
|    C000299|           5|
|    C000433|           6|
|    C000805|           6|
|    C000884|           6|
|    C001019|           6|
|    C001115|           5|
|    C001154|           6|
|    C001875|           5|
|    C002484|           6|
|    C002512|           6|
|    C002646|           5|
|    C002837|           6|
|    C003053|           5|
|    C003128|           5|
|    C003194|           5|
|    C003484|           6|
|    C003563|           5|
|    C003702|           5|
+-----------+------------+
only showing top 20 rows


In [19]:

from pyspark.sql import functions as F

# Per-customer summary: order volume, spend, purchase window, and how many
# different cities and categories the customer bought in.
profile_metrics = [
    F.count("order_id").alias("total_orders"),
    F.sum("amount").alias("total_spend"),
    F.min("order_date_clean").alias("first_purchase_date"),
    F.max("order_date_clean").alias("last_purchase_date"),
    F.countDistinct("city").alias("distinct_cities"),
    F.countDistinct("category").alias("distinct_categories"),
]
customer_profile_df = clean_orders_df.groupBy("customer_id").agg(*profile_metrics)


In [22]:
# Preview the aggregated per-customer profile.
customer_profile_df.show(3)

+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+
|customer_id|total_orders|total_spend|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|
+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+
|    C034669|           5|     170151|         2024-01-10|        2024-02-19|              5|                  4|
|    C035805|           6|     272408|         2024-01-06|        2024-02-15|              5|                  4|
|    C049309|           5|     272741|         2024-01-10|        2024-02-19|              4|                  4|
+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+
only showing top 3 rows


In [23]:
# Average order value = total spend divided by the number of orders.
avg_order_value = F.col("total_spend") / F.col("total_orders")
customer_profile_df = customer_profile_df.withColumn("avg_order_value", avg_order_value)


In [24]:
# Preview the profile again, now including avg_order_value.
customer_profile_df.show(3)

+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+------------------+
|customer_id|total_orders|total_spend|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|   avg_order_value|
+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+------------------+
|    C034669|           5|     170151|         2024-01-10|        2024-02-19|              5|                  4|           34030.2|
|    C035805|           6|     272408|         2024-01-06|        2024-02-15|              5|                  4|45401.333333333336|
|    C049309|           5|     272741|         2024-01-10|        2024-02-19|              4|                  4|           54548.2|
+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+------------------+
only showing top 3 rows


In [26]:

# Customer profile sorted by customer_id, shown without truncating values.
customer_profile = customer_profile_df.sort("customer_id")
customer_profile.show(n=20, truncate=False)


+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+------------------+
|customer_id|total_orders|total_spend|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|avg_order_value   |
+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+------------------+
|C000001    |6           |221917     |2024-01-02         |2024-02-11        |4              |4                  |36986.166666666664|
|C000002    |5           |155424     |2024-01-03         |2024-02-12        |4              |4                  |31084.8           |
|C000003    |6           |254432     |2024-01-04         |2024-02-13        |5              |4                  |42405.333333333336|
|C000004    |6           |204420     |2024-01-05         |2024-02-14        |4              |3                  |34070.0           |
|C000005    |5           |214367     |2024-01-06         |2024-02-15 

Rank customers by total spending (overall).

In [29]:
from pyspark.sql.window import Window

# Single unpartitioned window ordered by spend, so ranks are global.
# (Despite its name, this window is not partitioned by city.)
city_rank_window = Window.orderBy(F.desc("total_spend"))

# rank() gives tied customers the same rank and leaves gaps after ties.
overall_rank = F.rank().over(city_rank_window)
ranked_customers = customer_profile.withColumn("customer_rank", overall_rank)
ranked_customers.select("customer_id", "total_spend", "customer_rank").show()

+-----------+-----------+-------------+
|customer_id|total_spend|customer_rank|
+-----------+-----------+-------------+
|    C043076|     493949|            1|
|    C034689|     486879|            2|
|    C039985|     484057|            3|
|    C026691|     477147|            4|
|    C038979|     477138|            5|
|    C020762|     474717|            6|
|    C044654|     471304|            7|
|    C014292|     468617|            8|
|    C019565|     467523|            9|
|    C045487|     467050|           10|
|    C046747|     464951|           11|
|    C004490|     463923|           12|
|    C038296|     463147|           13|
|    C005286|     463098|           14|
|    C022754|     462987|           15|
|    C047145|     458602|           16|
|    C026261|     458067|           17|
|    C042247|     457936|           18|
|    C029753|     457258|           19|
|    C037602|     456932|           20|
+-----------+-----------+-------------+
only showing top 20 rows


Rank customers inside each city by total spending.

In [40]:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Spend and order count for every (city, customer) pair.
city_customer_keys = ["city", "customer_id"]
city_customer_spend_df = clean_orders_df.groupBy(*city_customer_keys).agg(
    F.sum("amount").alias("total_spend_in_city"),
    F.count("order_id").alias("orders_in_city"),
)

# Within each city: highest spend first, with customer_id as a tie-breaker so
# that row_number() is assigned deterministically.
w_city = Window.partitionBy("city").orderBy(
    F.desc("total_spend_in_city"),
    F.asc("customer_id"),
)
position_in_city = F.row_number().over(w_city)
ranked_in_city_df = city_customer_spend_df.withColumn(
    "rank_in_city_rownum", position_in_city
)

# Top three spenders in each city.
top3_per_city_df = ranked_in_city_df.where(F.col("rank_in_city_rownum") <= 3)




In [41]:
# Preview the per-city ranking.
ranked_in_city_df.show(5)

+---------+-----------+-------------------+--------------+-------------------+
|     city|customer_id|total_spend_in_city|orders_in_city|rank_in_city_rownum|
+---------+-----------+-------------------+--------------+-------------------+
|Bangalore|    C011518|             332527|             5|                  1|
|Bangalore|    C024935|             315622|             4|                  2|
|Bangalore|    C025451|             303208|             4|                  3|
|Bangalore|    C008486|             300843|             5|                  4|
|Bangalore|    C039191|             294970|             4|                  5|
+---------+-----------+-------------------+--------------+-------------------+
only showing top 5 rows


In [42]:
# Show the top three spenders for each city.
top3_per_city_df.show()

+---------+-----------+-------------------+--------------+-------------------+
|     city|customer_id|total_spend_in_city|orders_in_city|rank_in_city_rownum|
+---------+-----------+-------------------+--------------+-------------------+
|Bangalore|    C011518|             332527|             5|                  1|
|Bangalore|    C024935|             315622|             4|                  2|
|Bangalore|    C025451|             303208|             4|                  3|
|  Chennai|    C028121|             340890|             5|                  1|
|  Chennai|    C027841|             287392|             5|                  2|
|  Chennai|    C030712|             284466|             4|                  3|
|    Delhi|    C016309|             325001|             5|                  1|
|    Delhi|    C022599|             314625|             4|                  2|
|    Delhi|    C018688|             306692|             4|                  3|
|Hyderabad|    C032833|             318097|         

In [46]:
# Customers whose overall spend rank is 10 or better. Because rank() assigns
# equal ranks to ties, this can return more than ten rows.
top_ranked = ranked_customers.where(F.col("customer_rank") <= 10)
top_ranked.show()

+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+-----------------+-------------+
|customer_id|total_orders|total_spend|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|  avg_order_value|customer_rank|
+-----------+------------+-----------+-------------------+------------------+---------------+-------------------+-----------------+-------------+
|    C043076|           6|     493949|         2024-01-17|        2024-02-26|              5|                  4|82324.83333333333|            1|
|    C034689|           6|     486879|         2024-01-10|        2024-02-19|              4|                  3|          81146.5|            2|
|    C039985|           6|     484057|         2024-01-06|        2024-02-15|              3|                  4|80676.16666666667|            3|
|    C026691|           6|     477147|         2024-01-12|        2024-02-21|              4|                  3|          7

In [47]:

# Per-customer inputs for the loyalty rule.
# Distinct purchase dates are counted on the parsed `order_date_clean` column:
# the raw `order_date` strings arrive in several formats, so the same calendar
# day written two ways would otherwise be counted as two dates.
cust_loyalty_metrics_df = (
    clean_orders_df.groupBy("customer_id")
    .agg(
        F.countDistinct("order_date_clean").alias("distinct_purchase_dates"),
        F.countDistinct("category").alias("distinct_categories"),
        F.sum("amount").alias("total_spend"),
        F.count("order_id").alias("total_orders"),
    )
)

# A customer is flagged loyal when they purchased on at least 3 distinct dates
# and in at least 2 distinct categories.
is_loyal = (F.col("distinct_purchase_dates") >= 3) & (F.col("distinct_categories") >= 2)
cust_loyalty_df = (
    cust_loyalty_metrics_df
    .withColumn("loyal_flag", F.when(is_loyal, F.lit(1)).otherwise(F.lit(0)))
    .withColumn(
        "loyal_label",
        F.when(F.col("loyal_flag") == 1, "Loyal").otherwise("Non-loyal"),
    )
)

# Only the customers that satisfy the loyalty rule.
loyal_customers_df = cust_loyalty_df.filter(F.col("loyal_flag") == 1)


In [49]:
# Number of customers flagged as loyal.
loyal_customers_df.count()

47140

In [50]:
# Total number of profiled customers, for comparison with the loyal count.
# `customer_profile_sorted_df` is not defined anywhere in this notebook; the
# sorted profile DataFrame is named `customer_profile`.
customer_profile.count()

47500