In [7]:
# Spark Session
from pyspark.sql import SparkSession

# Obtain the session for this notebook: getOrCreate() hands back an existing
# session when there is one, otherwise it builds a new one with this app name
# and the local[*] master.
spark = SparkSession.builder.appName("Reading Complex Data Formats").master("local[*]").getOrCreate()

# Bare expression as the last statement so the notebook renders the session.
spark

In [14]:
# Load every Parquet file matching the glob into a single DataFrame.
df_paraquet = spark.read.load("data/input/*.parquet", format="parquet")

# Preview the rows, then print the schema of the loaded DataFrame.
df_paraquet.show()
df_paraquet.printSchema()

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|
|           Merc 280|19.2|  6|167.6|123|3.92| 3.44| 18.3|  1|  0|   4|   4|
|          M

In [20]:
# Load every ORC file matching the glob into a single DataFrame.
df_orc = spark.read.load("data/input/*.orc", format="orc")

# Only two rows are previewed; the schema is printed afterwards.
df_orc.show(2)
df_orc.printSchema()

+--------+-----+------+-----+-------------------+------+-------+----------------+-------+--------------------+--------------------+--------------------+-------------------+-------------------+
|boolean1|byte1|short1| int1|              long1|float1|double1|          bytes1|string1|              middle|                list|                 map|                 ts|           decimal1|
+--------+-----+------+-----+-------------------+------+-------+----------------+-------+--------------------+--------------------+--------------------+-------------------+-------------------+
|   false|    1|  1024|65536|9223372036854775807|   1.0|  -15.0|[00 01 02 03 04]|     hi|{[{1, bye}, {2, s...|[{3, good}, {4, b...|                  {}|2000-03-12 15:00:00|12345678.6547456000|
|    true|  100|  2048|65536|9223372036854775807|   2.0|   -5.0|              []|    bye|{[{1, bye}, {2, s...|[{100000000, cat}...|{chani -> {5, cha...|2000-03-12 15:00:01|12345678.6547457000|
+--------+-----+------+-----+------

In [27]:
# Benefits of Columnar Storage
import time

def get_time(func):
    """Run ``func`` once, print how long it took, and return ``func``.

    Intended to be used as a decorator on a zero-argument function: the
    function is executed immediately at decoration time and a line of the
    form ``Execution time: <milliseconds> ms`` is printed.

    Args:
        func: A callable that takes no arguments. Its return value is
            discarded.

    Returns:
        ``func`` itself, unchanged, so the decorated name stays bound to a
        callable (previously nothing was returned and the decorated name
        ended up bound to ``None``).
    """
    def inner_get_time() -> str:
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        func()
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        return f"Execution time: {elapsed_ms} ms"

    # The measurement happens right here, while the decorator is applied.
    print(inner_get_time())
    return func

@get_time
def x():
    """Read the Parquet input files and count the rows of the full DataFrame.

    Because of the ``get_time`` decorator, this body runs as soon as the cell
    is executed and the elapsed time is printed.
    """
    spark.read.load("data/input/*.parquet", format="parquet").count()

Execution time: 181.51092529296875 ms


In [28]:
@get_time
def x():
    """Read the Parquet input files and count rows after selecting only ``disp``.

    Because of the ``get_time`` decorator, this body runs as soon as the cell
    is executed and the elapsed time is printed.
    """
    # NOTE(review): count() may not need to read any column data, so the
    # timing gap versus the full-frame version might not reflect column
    # pruning — confirm with an action that materialises values.
    parquet_df = spark.read.load("data/input/*.parquet", format="parquet")
    parquet_df.select("disp").count()

Execution time: 172.96910285949707 ms


In [33]:
# Same Parquet read as before, with the recursiveFileLookup reader option
# switched on; the option is passed as a keyword argument to load().
df_1 = spark.read.load(
    "data/input/*.parquet",
    format="parquet",
    recursiveFileLookup=True,
)
df_1.show()

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|          Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|      Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|   4|   2|
|           Merc 280|19.2|  6|167.6|123|3.92| 3.44| 18.3|  1|  0|   4|   4|
|          M