# Retail Orders & Customer Transactions

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark=SparkSession.builder.appName("retail").getOrCreate()


In [2]:
raw_orders = [
    ("ORD001","C001","Ravi"," Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("ORD002","C002","Sneha","Mumbai"," Mobile ","Electronics","32000","05/01/2024","Completed"),
    ("ORD003","C003","Aman","Bangalore","Laptop","Electronics","55000","2024/01/06","Completed"),
    ("ORD004","C004","Pooja","Delhi","Tablet"," Electronics ","","2024-01-07","Cancelled"),
    ("ORD005","C005","Neha","Chennai","Laptop","Electronics","48000","invalid_date","Completed"),
    ("ORD006","C006","Rahul","Mumbai","Mobile","Electronics",None,"2024-01-08","Completed"),
    ("ORD007","C007","Kiran","Bangalore","Tablet","Electronics","30000","2024-01-08","Completed"),
    ("ORD008","C008","Amit","Delhi","Laptop","electronics","45000","2024-01-09","Completed"),
    ("ORD009","C009","Priya"," Pune","Mobile","Electronics","28000","09-01-2024","Completed"),
    ("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),  # duplicate
    ("ORD011","C011","Meena","Chennai","Tablet","Electronics","31000","2024-01-11","Completed"),
    ("ORD012","C012","Arjun","Delhi","Mobile","Electronics","27000","2024/01/11","Completed"),
    ("ORD013","C013","Nikhil","Bangalore","Laptop","Electronics","60000","2024-01-12","Completed"),
    ("ORD014","C014","Rohit","Mumbai","Mobile","Electronics","invalid_price","2024-01-12","Completed"),
    ("ORD015","C015","Anita","Delhi","Tablet","Electronics","29000","2024-01-13","Completed"),
    ("ORD016","C016","Vikas","Chennai","Laptop","Electronics","52000","2024-01-13","Completed"),
    ("ORD017","C017","Sunita","Mumbai","Mobile","Electronics","33000","2024-01-14","Completed"),
    ("ORD018","C018","Deepak","Bangalore","Laptop","Electronics","58000","2024-01-14","Completed"),
    ("ORD019","C019","Pallavi","Delhi","Mobile","Electronics","26000","2024-01-15","Completed"),
    ("ORD020","C020","Manish","Mumbai","Tablet","Electronics","34000","2024-01-15","Completed")
]


In [3]:
columns=["order_id","customer_id","customer_name","city","product","category","price","order_date","order_status"]

In [60]:
df=spark.createDataFrame(data=raw_orders,schema=columns)
df.show()

+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|order_id|customer_id|customer_name|     city| product|     category|        price|  order_date|order_status|
+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|        45000|  2024-01-05|   Completed|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|        32000|  05/01/2024|   Completed|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|        55000|  2024/01/06|   Completed|
|  ORD004|       C004|        Pooja|    Delhi|  Tablet| Electronics |             |  2024-01-07|   Cancelled|
|  ORD005|       C005|         Neha|  Chennai|  Laptop|  Electronics|        48000|invalid_date|   Completed|
|  ORD006|       C006|        Rahul|   Mumbai|  Mobile|  Electronics|         NULL|  2024-01-08|   Completed|
|  ORD007|

# CLEANING & TRANSFORMATION TASKS

Column Operations

1. Rename all columns to snake_case

In [61]:
import re

def to_snake_case(name):
    name = re.sub(r'\s+', '_', name)
    name = re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
    return name

old_columns = df.columns

new_columns = [to_snake_case(col_name) for col_name in old_columns]

for old_name, new_name in zip(old_columns, new_columns):
    df = df.withColumnRenamed(old_name, new_name)

df.printSchema()
df.show(5)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_status: string (nullable = true)

+--------+-----------+-------------+---------+--------+-------------+-----+------------+------------+
|order_id|customer_id|customer_name|     city| product|     category|price|  order_date|order_status|
+--------+-----------+-------------+---------+--------+-------------+-----+------------+------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|45000|  2024-01-05|   Completed|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|32000|  05/01/2024|   Completed|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|55000|  2024/01/06|   Completed|
|  ORD004|      

2. Add a column price_with_tax (18%)

In [62]:
from pyspark.sql.functions import when

In [63]:
from pyspark.sql.functions import when, col
from pyspark.sql.types import DoubleType

df = df.withColumn("price",
                 when(col("price").rlike("^[0-9]+$"),col("price").cast(DoubleType())).otherwise(None))

if "price_with_tax" in df.columns:
    df = df.drop("price_with_tax")

df = df.withColumn("price_with_tax", col("price") * 1.18)

df.show(5)

+--------+-----------+-------------+---------+--------+-------------+-------+------------+------------+--------------+
|order_id|customer_id|customer_name|     city| product|     category|  price|  order_date|order_status|price_with_tax|
+--------+-----------+-------------+---------+--------+-------------+-------+------------+------------+--------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|45000.0|  2024-01-05|   Completed|       53100.0|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|32000.0|  05/01/2024|   Completed|       37760.0|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|55000.0|  2024/01/06|   Completed|       64900.0|
|  ORD004|       C004|        Pooja|    Delhi|  Tablet| Electronics |   NULL|  2024-01-07|   Cancelled|          NULL|
|  ORD005|       C005|         Neha|  Chennai|  Laptop|  Electronics|48000.0|invalid_date|   Completed|       56640.0|
+--------+-----------+-------------+---------+--

3. Add a column price_category (Low / Medium / High)

In [64]:
df=df.withColumn("price_category",
                 when(col("price")<30000,"Low")
                 .when(col("price").between(30000,50000),"Medium")
                 .otherwise("High")
                 )
df.show()

+--------+-----------+-------------+---------+--------+-------------+-------+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city| product|     category|  price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+--------+-------------+-------+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|45000.0|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|32000.0|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|55000.0|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi|  Tablet| Electronics |   NULL|  2024-01-07|   Cancelled|          NULL|          High|
|  ORD005|       C005|         Neha|  Chennai|  Laptop|  Elect

# Data Cleaning

4. Trim and standardize city , product , category

In [65]:
from pyspark.sql.functions import trim,lower
df=df.withColumn("city",trim(lower(col("city")))).withColumn("product",trim(lower(col("product")))).withColumn("category",trim(lower(col("category"))))
df.show()


+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|  price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-------+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    delhi| laptop|electronics|45000.0|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   mumbai| mobile|electronics|32000.0|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|bangalore| laptop|electronics|55000.0|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    delhi| tablet|electronics|   NULL|  2024-01-07|   Cancelled|          NULL|          High|
|  ORD005|       C005|         Neha|  chennai| laptop|electronics|48000.0|invalid_d

5. Convert price to integer

In [66]:
from pyspark.sql.types import IntegerType

df = df.withColumn("price", col("price").cast(IntegerType()))
df.show(5)
df.printSchema()

+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    delhi| laptop|electronics|45000|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   mumbai| mobile|electronics|32000|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|bangalore| laptop|electronics|55000|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    delhi| tablet|electronics| NULL|  2024-01-07|   Cancelled|          NULL|          High|
|  ORD005|       C005|         Neha|  chennai| laptop|electronics|48000|invalid_date|   Completed

6. Handle invalid and null prices

In [67]:
from pyspark.sql.functions import mean, col, when
from pyspark.sql.types import IntegerType, DoubleType

mean_price = df.agg(mean(col("price"))).collect()[0][0]

df = df.withColumn("price", when(col("price").isNull(), mean_price).otherwise(col("price")))

df = df.withColumn("price", col("price").cast(IntegerType()))

df = df.withColumn("price_with_tax", col("price").cast(DoubleType()) * 1.18)

df = df.withColumn("price_category",
                 when(col("price") < 30000, "Low")
                 .when(col("price").between(30000, 50000), "Medium")
                 .otherwise("High"))

df.show(5)
df.printSchema()

+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|  order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+------------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    delhi| laptop|electronics|45000|  2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   mumbai| mobile|electronics|32000|  05/01/2024|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|bangalore| laptop|electronics|55000|  2024/01/06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    delhi| tablet|electronics|41277|  2024-01-07|   Cancelled|      48706.86|        Medium|
|  ORD005|       C005|         Neha|  chennai| laptop|electronics|48000|invalid_date|   Completed

7. Normalize all dates into DateType

In [44]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType
from datetime import datetime

In [55]:
from pyspark.sql.functions import coalesce, to_date

In [69]:
from pyspark.sql.functions import udf, col, trim
from pyspark.sql.types import DateType
from datetime import datetime

def parse_date_py(date_str):
    if date_str is None:
        return None
    date_str = date_str.strip()
    formats = ["%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%Y/%m/%d"]
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).date()
        except ValueError:
            continue
    return None

parse_date_udf = udf(parse_date_py, DateType())

df = df.withColumn("order_date", parse_date_udf(col("order_date")))

df.show(truncate=False)
df.printSchema()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|order_id|customer_id|customer_name|city     |product|category   |price|order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|ORD001  |C001       |Ravi         |delhi    |laptop |electronics|45000|2024-01-05|Completed   |53100.0       |Medium        |
|ORD002  |C002       |Sneha        |mumbai   |mobile |electronics|32000|2024-05-01|Completed   |37760.0       |Medium        |
|ORD003  |C003       |Aman         |bangalore|laptop |electronics|55000|2024-01-06|Completed   |64900.0       |High          |
|ORD004  |C004       |Pooja        |delhi    |tablet |electronics|41277|2024-01-07|Cancelled   |48706.86      |Medium        |
|ORD005  |C005       |Neha         |chennai  |laptop |electronics|48000|NULL      |Completed   |56640.0       |

8. Remove duplicate orders

In [70]:
df = df.dropDuplicates(["order_id"])
df.show()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    delhi| laptop|electronics|45000|2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   mumbai| mobile|electronics|32000|2024-05-01|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|bangalore| laptop|electronics|55000|2024-01-06|   Completed|       64900.0|          High|
|  ORD004|       C004|        Pooja|    delhi| tablet|electronics|41277|2024-01-07|   Cancelled|      48706.86|        Medium|
|  ORD005|       C005|         Neha|  chennai| laptop|electronics|48000|      NULL|   Completed|       56640.0|

9. Filter only Completed orders

In [72]:
df=df.filter(col("order_status")=="Completed")
df.show()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|order_status|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+
|  ORD001|       C001|         Ravi|    delhi| laptop|electronics|45000|2024-01-05|   Completed|       53100.0|        Medium|
|  ORD002|       C002|        Sneha|   mumbai| mobile|electronics|32000|2024-05-01|   Completed|       37760.0|        Medium|
|  ORD003|       C003|         Aman|bangalore| laptop|electronics|55000|2024-01-06|   Completed|       64900.0|          High|
|  ORD005|       C005|         Neha|  chennai| laptop|electronics|48000|      NULL|   Completed|       56640.0|        Medium|
|  ORD006|       C006|        Rahul|   mumbai| mobile|electronics|41277|2024-01-08|   Completed|      48706.86|

# Data Transformation

10. Create order_year , order_month


In [74]:
from pyspark.sql.functions import year,month
df=df.withColumn("order_year",year(col("order_date"))).withColumn("order_month",month(col("order_date")))
df.show()

+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+----------+-----------+
|order_id|customer_id|customer_name|     city|product|   category|price|order_date|order_status|price_with_tax|price_category|order_year|order_month|
+--------+-----------+-------------+---------+-------+-----------+-----+----------+------------+--------------+--------------+----------+-----------+
|  ORD001|       C001|         Ravi|    delhi| laptop|electronics|45000|2024-01-05|   Completed|       53100.0|        Medium|      2024|          1|
|  ORD002|       C002|        Sneha|   mumbai| mobile|electronics|32000|2024-05-01|   Completed|       37760.0|        Medium|      2024|          5|
|  ORD003|       C003|         Aman|bangalore| laptop|electronics|55000|2024-01-06|   Completed|       64900.0|          High|      2024|          1|
|  ORD005|       C005|         Neha|  chennai| laptop|electronics|48000|      NULL|   Completed|    

11. Aggregate total revenue per city


In [76]:
from pyspark.sql.functions import sum
revenue_city=df.groupBy("city").agg(sum("price").alias("total_revenue"))
revenue_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|  chennai|       131000|
|    delhi|       172000|
|bangalore|       203000|
|   mumbai|       236554|
|     pune|        28000|
+---------+-------------+



12. Aggregate total revenue per product

In [77]:
revenue_product=df.groupBy("product").agg(sum("price").alias("total_revenue"))
revenue_product.show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| mobile|       228554|
| tablet|       124000|
| laptop|       418000|
+-------+-------------+



13. Identify top 3 cities by revenue


In [78]:
top_cities=revenue_city.orderBy(col("total_revenue").desc()).limit(3)
top_cities.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|   mumbai|       236554|
|bangalore|       203000|
|    delhi|       172000|
+---------+-------------+



14. Identify products with average price above threshold

In [81]:
avg_price_threshold=40000

In [83]:
from pyspark.sql.functions import avg

premium_products=df.groupBy("product").agg(avg("price").alias("average_price")).filter(col("average_price")> avg_price_threshold)
premium_products=premium_products.orderBy(col("average_price").desc())
premium_products.show()

+-------+-------------+
|product|average_price|
+-------+-------------+
| laptop|      52250.0|
+-------+-------------+



# File Format Operations

15. Write cleaned data to Parquet

In [85]:
df=df.repartition(2)
df.write.parquet("cleaned_data.parquet")

16. Read Parquet back and verify schema

In [86]:
parquet_df=spark.read.parquet("cleaned_data.parquet")
parquet_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- order_status: string (nullable = true)
 |-- price_with_tax: double (nullable = true)
 |-- price_category: string (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_month: integer (nullable = true)



17. Write the same data to ORC

In [87]:
df.write.mode("overwrite").orc("cleaned_data.orc")

# Performance & Validation

19. Check number of partitions

In [88]:
df.rdd.getNumPartitions()

2

20. Repartition before writing

In [89]:
df_repartitioned = df.repartition(4)
df_repartitioned.write.mode("overwrite").orc("cleaned_data_repartitioned.orc")
print(f"DataFrame repartitioned to {df_repartitioned.rdd.getNumPartitions()} partitions and written to 'cleaned_data_repartitioned.orc'")

DataFrame repartitioned to 4 partitions and written to 'cleaned_data_repartitioned.orc'


21. Compare file counts between Parquet and ORC

In [90]:
spark.read.parquet("cleaned_data.parquet").inputFiles()
spark.read.orc("cleaned_data.orc").inputFiles()

['file:///content/cleaned_data.orc/part-00001-675d9895-a51e-4810-b8c0-a8ccf2913e55-c000.zstd.orc',
 'file:///content/cleaned_data.orc/part-00000-675d9895-a51e-4810-b8c0-a8ccf2913e55-c000.zstd.orc']

22. Run explain(True) on final pipeline

In [91]:
df.explain(True)

== Parsed Logical Plan ==
Repartition 2, true
+- Project [order_id#1721, customer_id#1722, customer_name#1723, city#1826, product#1827, category#1828, price#1918, order_date#1957, order_status#1729, price_with_tax#1919, price_category#1920, order_year#2465, month(order_date#1957) AS order_month#2466]
   +- Project [order_id#1721, customer_id#1722, customer_name#1723, city#1826, product#1827, category#1828, price#1918, order_date#1957, order_status#1729, price_with_tax#1919, price_category#1920, year(order_date#1957) AS order_year#2465]
      +- Filter (order_status#1729 = Completed)
         +- Filter (order_status#1729 = Completed)
            +- Deduplicate [order_id#1721]
               +- Project [order_id#1721, customer_id#1722, customer_name#1723, city#1826, product#1827, category#1828, price#1918, parse_date_py(order_date#1728)#1956 AS order_date#1957, order_status#1729, price_with_tax#1919, price_category#1920]
                  +- Project [order_id#1721, customer_id#1722, cust