In [0]:
#Spark Session
from pyspark.sql import SparkSession
# Build the session for this notebook, or reuse one that is already running.
spark = SparkSession.builder.appName("Reading Complex Data Formats").getOrCreate()

In [0]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [0]:
# Load the Parquet sales data; the '*' glob picks up every .parquet file
# in the input directory.
df_parquet = (
    spark
    .read
    .parquet("/data/input/*.parquet")
)
df_parquet.printSchema()

root
 |-- sale_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- sale_date: date (nullable = true)



In [0]:
# Print the contents of df_parquet as a text table.
df_parquet.show()

+-------+------------+--------+------+----------+
|sale_id|product_name|quantity| price| sale_date|
+-------+------------+--------+------+----------+
|      4|   Product D|      15| 49.99|2024-12-31|
|      5|   Product E|       8| 89.99|2025-02-14|
|      9|   Product I|      14|199.99|2025-08-30|
|     10|   Product J|       6| 79.99|2024-04-18|
|      1|   Product A|      10| 99.99|2024-01-15|
|      2|   Product B|       5|149.99|2024-06-20|
|      3|   Product C|      20| 19.99|2024-09-05|
|      6|   Product F|      12|129.99|2025-07-22|
|      7|   Product G|       7| 59.99|2025-03-10|
|      8|   Product H|       9| 39.99|2024-11-11|
+-------+------------+--------+------+----------+



In [0]:
# Load the ORC sales data from a single file and inspect its schema.
df_orc = (
    spark
    .read
    .orc("/data/input/sales_data1.orc")
)
df_orc.printSchema()

root
 |-- sale_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- sale_date: date (nullable = true)



In [0]:
# Benefits of Columnar Storage
# Creating a simple Python decorator (getTime) to measure execution time
import time
def getTime(fxn):
    """Run ``fxn`` once immediately, print how long it took, and return ``fxn``.

    This is intentionally not a lazy wrapper: when used as ``@getTime`` the
    decorated function executes at definition time, which is what makes the
    timing appear as the cell's output.

    Args:
        fxn: A callable that takes no arguments.

    Returns:
        The original ``fxn`` unchanged, so the decorated name stays callable
        instead of being rebound to ``None``.
    """
    def innerGetTime() -> str:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        fxn()
        end_time = time.perf_counter()
        return f"Execution time: {end_time - start_time}"

    print(innerGetTime())
    return fxn

In [0]:
@getTime
def x():
    """Count the rows of the Parquet input without selecting any column."""
    spark.read.parquet("/data/input/*.parquet").count()

Execution time: 1.4046027660369873


In [0]:
@getTime
def x():
    """Count the rows of the Parquet input after selecting only ``sale_id``."""
    # Parquet is a columnar data format, so a query takes less time to
    # execute when it reads only the columns it actually needs.
    sale_ids = spark.read.parquet("/data/input/*.parquet").select("sale_id")
    sale_ids.count()

Execution time: 1.2969348430633545


In [0]:
BONUS TIP
RECURSIVE read using 'recursiveFileLookup'

sales_recursive/
└── sales1/
    ├── 1.parquet
    └── sales2/
        └── 2.parquet


In [0]:
# Show each nested file on its own before reading them together.
for parquet_path in (
    "/data/input/sales_recursive/sales1/1.parquet",
    "/data/input/sales_recursive/sales1/sales2/2.parquet",
):
    spark.read.parquet(parquet_path).show()

+-------+--------------+--------+-------+----------+
|sale_id|  product_name|quantity|  price| sale_date|
+-------+--------------+--------+-------+----------+
|     15|Samsung Z-Fold|       5|98000.5|2025-05-15|
+-------+--------------+--------+-------+----------+

+-------+------------+--------+-------+----------+
|sale_id|product_name|quantity|  price| sale_date|
+-------+------------+--------+-------+----------+
|     36|Google Pixel|      10|83000.9|2025-06-28|
+-------+------------+--------+-------+----------+



- 'recursiveFileLookup' instructs Spark to start from the given folder and read the files in it and in all of its subfolders recursively
- Both files are therefore read together into a single DataFrame

In [0]:
# Turn on recursive lookup so files in nested subfolders are read as well.
(
    spark.read
    .option("recursiveFileLookup", True)
    .parquet("/data/input/sales_recursive/")
    .show()
)

+-------+--------------+--------+-------+----------+
|sale_id|  product_name|quantity|  price| sale_date|
+-------+--------------+--------+-------+----------+
|     15|Samsung Z-Fold|       5|98000.5|2025-05-15|
|     36|  Google Pixel|      10|83000.9|2025-06-28|
+-------+--------------+--------+-------+----------+

