In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.appName('sales data')\
.getOrCreate()

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
raw_sales = [
    ("TXN001", "Delhi", "Laptop", "Electronics", 45000, "2024-01-05", "Complete"),
    ("TXN002", "Mumbai", "Mobile", "Electronics", 32000, "2024-01-05", "Complete"),
    ("TXN003", "Bangalore", "Tablet", "Electronics", 30000, "2024-01-06", "Complete"),
    ("TXN004", "Delhi", "Laptop", "Electronics", 0, "2024-01-07", "Cancelled"),
    ("TXN005", "Chennai", "Mobile", "Electronics", 0, "2024-01-08", "Complete"),
    ("TXN006", "Mumbai", "Tablet", "Electronics", 0, "2024-01-08", "Complete"),
    ("TXN007", "Delhi", "Laptop", "Electronics", 45000, "2024-01-09", "Complete"),
    ("TXN008", "Bangalore", "Mobile", "Electronics", 28000, "2024-01-09", "Complete"),
    ("TXN009", "Mumbai", "Laptop", "Electronics", 55000, "2024-01-10", "Complete")
]

columns = ["transaction_id", "city", "product", "category", "amount", "transaction_date", "status"]

In [4]:
schema = StructType([
    StructField("transaction_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("product", StringType(), True),
    StructField("category", StringType(), True),
    StructField("amount", StringType(), True),
    StructField("transaction_date", StringType(), True),
    StructField("status", StringType(), True)
])
raw_df = spark.createDataFrame(data=raw_sales, schema=schema)
raw_df.show()

+--------------+---------+-------+-----------+------+----------------+---------+
|transaction_id|     city|product|   category|amount|transaction_date|   status|
+--------------+---------+-------+-----------+------+----------------+---------+
|        TXN001|    Delhi| Laptop|Electronics| 45000|      2024-01-05| Complete|
|        TXN002|   Mumbai| Mobile|Electronics| 32000|      2024-01-05| Complete|
|        TXN003|Bangalore| Tablet|Electronics| 30000|      2024-01-06| Complete|
|        TXN004|    Delhi| Laptop|Electronics|     0|      2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|Electronics|     0|      2024-01-08| Complete|
|        TXN006|   Mumbai| Tablet|Electronics|     0|      2024-01-08| Complete|
|        TXN007|    Delhi| Laptop|Electronics| 45000|      2024-01-09| Complete|
|        TXN008|Bangalore| Mobile|Electronics| 28000|      2024-01-09| Complete|
|        TXN009|   Mumbai| Laptop|Electronics| 55000|      2024-01-10| Complete|
+--------------+---------+--

#PART 1 — DATA CLEANING & TRANSFORMATION

1. Trim and standardize all string columns

In [5]:
df = raw_df.select([trim(col(c)).alias(c) for c in raw_df.columns])
df.show()

+--------------+---------+-------+-----------+------+----------------+---------+
|transaction_id|     city|product|   category|amount|transaction_date|   status|
+--------------+---------+-------+-----------+------+----------------+---------+
|        TXN001|    Delhi| Laptop|Electronics| 45000|      2024-01-05| Complete|
|        TXN002|   Mumbai| Mobile|Electronics| 32000|      2024-01-05| Complete|
|        TXN003|Bangalore| Tablet|Electronics| 30000|      2024-01-06| Complete|
|        TXN004|    Delhi| Laptop|Electronics|     0|      2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|Electronics|     0|      2024-01-08| Complete|
|        TXN006|   Mumbai| Tablet|Electronics|     0|      2024-01-08| Complete|
|        TXN007|    Delhi| Laptop|Electronics| 45000|      2024-01-09| Complete|
|        TXN008|Bangalore| Mobile|Electronics| 28000|      2024-01-09| Complete|
|        TXN009|   Mumbai| Laptop|Electronics| 55000|      2024-01-10| Complete|
+--------------+---------+--

2. Convert category to uppercase

In [6]:
df = df.withColumn("category", upper(col("category")))
df.show()

+--------------+---------+-------+-----------+------+----------------+---------+
|transaction_id|     city|product|   category|amount|transaction_date|   status|
+--------------+---------+-------+-----------+------+----------------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05| Complete|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05| Complete|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06| Complete|
|        TXN004|    Delhi| Laptop|ELECTRONICS|     0|      2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09| Complete|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09| Complete|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10| Complete|
+--------------+---------+--

3. Convert amount to integer

In [7]:
df = df.withColumn("amount", col("amount").cast(IntegerType()))
df.show()

+--------------+---------+-------+-----------+------+----------------+---------+
|transaction_id|     city|product|   category|amount|transaction_date|   status|
+--------------+---------+-------+-----------+------+----------------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05| Complete|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05| Complete|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06| Complete|
|        TXN004|    Delhi| Laptop|ELECTRONICS|     0|      2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09| Complete|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09| Complete|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10| Complete|
+--------------+---------+--

4. Handle invalid, empty, and null amount values

In [8]:
df = df.withColumn("amount", when(col("amount").isNull(), 0).otherwise(col("amount")))
df.show()

+--------------+---------+-------+-----------+------+----------------+---------+
|transaction_id|     city|product|   category|amount|transaction_date|   status|
+--------------+---------+-------+-----------+------+----------------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05| Complete|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05| Complete|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06| Complete|
|        TXN004|    Delhi| Laptop|ELECTRONICS|     0|      2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09| Complete|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09| Complete|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10| Complete|
+--------------+---------+--

5. Convert transaction_date into DateType

In [9]:
df = df.withColumn("transaction_date",
                   coalesce(to_date(col("transaction_date"), "yyyy-MM-dd"),
                            to_date(col("transaction_date"), "dd-MM-yyyy"),
                            to_date(col("transaction_date"), "MM-dd-yyyy")
                            ))
df.show()

+--------------+---------+-------+-----------+------+----------------+---------+
|transaction_id|     city|product|   category|amount|transaction_date|   status|
+--------------+---------+-------+-----------+------+----------------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05| Complete|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05| Complete|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06| Complete|
|        TXN004|    Delhi| Laptop|ELECTRONICS|     0|      2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09| Complete|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09| Complete|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10| Complete|
+--------------+---------+--

6. Remove duplicate transactions

In [10]:
df = df.dropDuplicates(["transaction_id"])
df.show()

+--------------+---------+-------+-----------+------+----------------+---------+
|transaction_id|     city|product|   category|amount|transaction_date|   status|
+--------------+---------+-------+-----------+------+----------------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05| Complete|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05| Complete|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06| Complete|
|        TXN004|    Delhi| Laptop|ELECTRONICS|     0|      2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08| Complete|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09| Complete|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09| Complete|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10| Complete|
+--------------+---------+--

7. Keep only Completed transactions

In [11]:
df = df.filter(col("status") == "Complete")
df.show()

+--------------+---------+-------+-----------+------+----------------+--------+
|transaction_id|     city|product|   category|amount|transaction_date|  status|
+--------------+---------+-------+-----------+------+----------------+--------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05|Complete|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05|Complete|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06|Complete|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08|Complete|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08|Complete|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09|Complete|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09|Complete|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10|Complete|
+--------------+---------+-------+-----------+------+----------------+--------+



8. Rename all columns to snake_case

In [12]:
import re

def to_snake_case(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\\1_\\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\\1_\\2', s1).lower()

original_columns = df.columns
for col_name in original_columns:
    df = df.withColumnRenamed(col_name, to_snake_case(col_name))

df.show()

+--------------+---------+-------+-----------+------+----------------+--------+
|transaction_id|     city|product|   category|amount|transaction_date|  status|
+--------------+---------+-------+-----------+------+----------------+--------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05|Complete|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05|Complete|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06|Complete|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08|Complete|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08|Complete|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09|Complete|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09|Complete|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10|Complete|
+--------------+---------+-------+-----------+------+----------------+--------+



#PART 2 — COLUMN OPERATIONS

9. Add a column tax_amount (18% of amount)

In [13]:
df = df.withColumn("tax_amount", col("amount")*0.18)
df.show()

+--------------+---------+-------+-----------+------+----------------+--------+----------+
|transaction_id|     city|product|   category|amount|transaction_date|  status|tax_amount|
+--------------+---------+-------+-----------+------+----------------+--------+----------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05|Complete|    8100.0|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05|Complete|    5760.0|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06|Complete|    5400.0|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08|Complete|       0.0|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08|Complete|       0.0|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09|Complete|    8100.0|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-01-09|Complete|    5040.0|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|      2024-01-10|Complete|    9900.0|

10. Add a column total_amount (amount + tax)

In [14]:
df = df.withColumn("total_amount", col("amount") + col("tax_amount"))
df.show()

+--------------+---------+-------+-----------+------+----------------+--------+----------+------------+
|transaction_id|     city|product|   category|amount|transaction_date|  status|tax_amount|total_amount|
+--------------+---------+-------+-----------+------+----------------+--------+----------+------------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05|Complete|    8100.0|     53100.0|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05|Complete|    5760.0|     37760.0|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06|Complete|    5400.0|     35400.0|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08|Complete|       0.0|         0.0|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08|Complete|       0.0|         0.0|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09|Complete|    8100.0|     53100.0|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-

11. Replace city names with standardized values

In [16]:
from pyspark.sql.functions import when, col
df = df.withColumn("city",
                    when(col("city") == "Delhi", "Delhi")
                   .when(col("city") == "Mumbai", "Mumbai")
                   .when(col("city") == "Bangalore", "Bangalore")
                   .when(col("city") == "Chennai", "Chennai")
                   # Add more standardization rules here if needed
                   .otherwise(col("city"))) # Keep original if no match

df.show()

+--------------+---------+-------+-----------+------+----------------+--------+----------+------------+
|transaction_id|     city|product|   category|amount|transaction_date|  status|tax_amount|total_amount|
+--------------+---------+-------+-----------+------+----------------+--------+----------+------------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-05|Complete|    8100.0|     53100.0|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|      2024-01-05|Complete|    5760.0|     37760.0|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|      2024-01-06|Complete|    5400.0|     35400.0|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|      2024-01-08|Complete|       0.0|         0.0|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|      2024-01-08|Complete|       0.0|         0.0|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      2024-01-09|Complete|    8100.0|     53100.0|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|      2024-

12. Rename transaction_date to order_date

In [17]:
df = df.withColumnRenamed("transaction_date", "order_date")
df.show()

+--------------+---------+-------+-----------+------+----------+--------+----------+------------+
|transaction_id|     city|product|   category|amount|order_date|  status|tax_amount|total_amount|
+--------------+---------+-------+-----------+------+----------+--------+----------+------------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|2024-01-05|Complete|    8100.0|     53100.0|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|2024-01-05|Complete|    5760.0|     37760.0|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|2024-01-06|Complete|    5400.0|     35400.0|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|2024-01-08|Complete|       0.0|         0.0|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|2024-01-08|Complete|       0.0|         0.0|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|2024-01-09|Complete|    8100.0|     53100.0|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|2024-01-09|Complete|    5040.0|     33040.0|
|        TXN009|   M

#PART 3 — ANALYTICS TRANSFORMATIONS

13. Total revenue per city

In [18]:
df.groupBy("city").agg(sum("amount").alias("total_revenue")).show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|        58000|
|  Chennai|            0|
|   Mumbai|        87000|
|    Delhi|        90000|
+---------+-------------+



14. Total revenue per product

In [19]:
df.groupBy("product").agg(sum("amount").alias("total_revenue")).show()

+-------+-------------+
|product|total_revenue|
+-------+-------------+
| Laptop|       145000|
| Mobile|        60000|
| Tablet|        30000|
+-------+-------------+



15. Average order value per city

In [20]:
df.groupBy("city").agg(avg("amount").alias("avg_order_value")).show()

+---------+---------------+
|     city|avg_order_value|
+---------+---------------+
|Bangalore|        29000.0|
|  Chennai|            0.0|
|   Mumbai|        29000.0|
|    Delhi|        45000.0|
+---------+---------------+



16. Top 3 cities by revenue

In [21]:
df.groupBy("city").agg(sum("amount").alias("total_revenue")).orderBy(col("total_revenue").desc()).limit(3).show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|    Delhi|        90000|
|   Mumbai|        87000|
|Bangalore|        58000|
+---------+-------------+



17. Identify products with average amount > 40,000

In [22]:
df.groupBy("product").agg(avg("amount").alias("avg_amount")).filter(col("avg_amount") > 40000).show()

+-------+------------------+
|product|        avg_amount|
+-------+------------------+
| Laptop|48333.333333333336|
+-------+------------------+



#PART 4 — PARTITIONS & PERFORMANCE

18. Check current number of partitions

In [23]:
df.rdd.getNumPartitions()

1

19. Repartition data by city

In [24]:
df = df.repartition("city")
df.rdd.getNumPartitions()

1

20. Explain why repartitioning is needed

In [26]:
# Repartitioning is crucial in Apache Spark for several reasons, primarily related to performance optimization
# and efficient resource utilization in a distributed computing environment:
#
# 1.  **Optimize Shuffle Operations and Prevent Data Skew**: When performing wide transformations like `groupBy`,
#     `join`, or `orderBy`, Spark shuffles data across the network to group it by keys. If data is unevenly
#     distributed across partitions (data skew), some partitions will have significantly more data than others.
#     This leads to a few tasks taking a much longer time to complete, becoming bottlenecks and slowing down
#     the entire Spark job. Repartitioning can redistribute the data more evenly across the cluster, mitigating
#     data skew and improving the performance of shuffle operations.
#
# 2.  **Improve Parallelism**: If the number of partitions is too low, Spark might not be able to fully utilize
#     all available CPU cores and executors in the cluster. Increasing the number of partitions (within reasonable
#     limits) allows more tasks to run in parallel, thereby speeding up computations.
#
# 3.  **Reduce Task Overhead for Small Partitions**: Conversely, if there are too many very small partitions,
#     the overhead of managing and scheduling a large number of tiny tasks can outweigh the benefits of parallelism.
#     Each task incurs a certain overhead, so having an optimal number of partitions, where each partition is a
#     manageable size (e.g., 128MB-256MB), is important.
#
# 4.  **Achieve Data Locality**: Repartitioning by a specific key that will be used in subsequent operations (like
#     a `join` or `filter`) can help ensure that all data related to that key resides on the same physical node.
#     This reduces network I/O, as Spark can perform operations on locally available data instead of fetching
#     it from other nodes.
#
# In summary, while repartitioning involves a costly shuffle operation itself, it is often a necessary step
# to improve the overall performance of subsequent transformations, especially on large datasets where data
# distribution and parallelism are critical factors.

21. Use explain(True) and observe the plan

In [25]:
df.explain(True)

== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- Project [transaction_id#314, city#667, product#316, category#317, amount#318, transaction_date#319 AS order_date#756, status#320, tax_amount#403, total_amount#489]
   +- Project [transaction_id#314, CASE WHEN (city#578 = Delhi) THEN Delhi WHEN (city#578 = Mumbai) THEN Mumbai WHEN (city#578 = Bangalore) THEN Bangalore WHEN (city#578 = Chennai) THEN Chennai ELSE city#578 END AS city#667, product#316, category#317, amount#318, transaction_date#319, status#320, tax_amount#403, total_amount#489]
      +- Project [transaction_id#314, CASE WHEN (city#315 = Delhi) THEN Delhi WHEN (city#315 = Mumbai) THEN Mumbai WHEN (city#315 = Bangalore) THEN Bangalore WHEN (city#315 = Chennai) THEN Chennai ELSE city#315 END AS city#578, product#316, category#317, amount#318, transaction_date#319, status#320, tax_amount#403, total_amount#489]
         +- Project [transaction_id#314, city#315, product#316, category#317, amount#318, transaction_date#

#PART 5 — FILE FORMAT STORAGE (HANDS-ON)

22. Write cleaned data to Parquet

data/parquet/sales

In [27]:
df.write.mode("overwrite").parquet("data/parquet/sales")

23. Read Parquet back and validate schema

In [28]:
parquet_df = spark.read.parquet("data/parquet/sales")
parquet_df.printSchema()
parquet_df.show()

root
 |-- transaction_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)
 |-- tax_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)

+--------------+---------+-------+-----------+------+----------+--------+----------+------------+
|transaction_id|     city|product|   category|amount|order_date|  status|tax_amount|total_amount|
+--------------+---------+-------+-----------+------+----------+--------+----------+------------+
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|2024-01-06|Complete|    5400.0|     35400.0|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|2024-01-09|Complete|    5040.0|     33040.0|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|2024-01-08|Complete|       0.0|         0.0|
|        TXN002|   Mumbai| Mobile|ELECTRON

24. Write the same data to ORC

data/orc/sales

In [29]:
df.write.mode("overwrite").orc("data/orc/sales")

In [31]:
df_orc = spark.read.orc("data/orc/sales")
df_orc.show()

+--------------+---------+-------+-----------+------+----------+--------+----------+------------+
|transaction_id|     city|product|   category|amount|order_date|  status|tax_amount|total_amount|
+--------------+---------+-------+-----------+------+----------+--------+----------+------------+
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|2024-01-06|Complete|    5400.0|     35400.0|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|2024-01-09|Complete|    5040.0|     33040.0|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|2024-01-08|Complete|       0.0|         0.0|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|2024-01-05|Complete|    5760.0|     37760.0|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|2024-01-08|Complete|       0.0|         0.0|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|2024-01-10|Complete|    9900.0|     64900.0|
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|2024-01-05|Complete|    8100.0|     53100.0|
|        TXN007|    

25. Compare file sizes and number of output files

In [30]:
import os

# Function to get directory size and file count
def get_directory_info(path):
    total_size = 0
    file_count = 0
    if os.path.exists(path):
        for dirpath, dirnames, filenames in os.walk(path):
            for f in filenames:
                fp = os.path.join(dirpath, f)
                # Skip metadata files if they exist
                if not f.startswith('.') and not f.startswith('_'):
                    total_size += os.path.getsize(fp)
                    file_count += 1
    return total_size, file_count

parquet_path = "data/parquet/sales"
orc_path = "data/orc/sales"

parquet_size, parquet_files = get_directory_info(parquet_path)
orc_size, orc_files = get_directory_info(orc_path)

print(f"Parquet directory: {parquet_path}")
print(f"  Total size: {parquet_size / (1024*1024):.2f} MB")
print(f"  Number of data files: {parquet_files}")

print(f"\nORC directory: {orc_path}")
print(f"  Total size: {orc_size / (1024*1024):.2f} MB")
print(f"  Number of data files: {orc_files}")


Parquet directory: data/parquet/sales
  Total size: 0.00 MB
  Number of data files: 1

ORC directory: data/orc/sales
  Total size: 0.00 MB
  Number of data files: 1
