# Demo and Comparison of Big Data File Formats

## 1. CSV and JSON
Old data formats that were not designed for big data and scaling  
**Typical feature:** human-readable

## 2. Avro, ORC, Parquet
First generation of dedicated big data formats that allow fast writes, fast reads, or both  
**Typical features:** splittable, compressible, data skipping and predicate pushdown, data schema included



## 3. Delta, Iceberg, Hudi
Latest generation of big data formats that support ACID transactions, audit-safe transaction logs, and time travel  
**Typical features:** enhance the first-generation formats with additional metadata and read/write procedures.

In [1]:
#################################################################################
# Load all relevant modules
#################################################################################
import pyspark
from pyspark.sql.functions import *
import json
import csv
from datetime import datetime
from delta import *
import delta

# Let the notebook cells use the full browser width (the CSS rule sets 100%).
# `display` and `HTML` are imported from the public IPython.display module;
# the IPython.core.display import path is deprecated.
from IPython.display import display, HTML
# The <style> element must be terminated with </style>; otherwise the tag stays open.
display(HTML("<style>.container {width:100% !important; }</style>"))

In [2]:
# Prerequisite for local usage: pip install delta-spark

# Local import keeps this cell self-contained; os.pathsep is the platform's
# classpath separator (":" on Linux/macOS, ";" on Windows).
import os

# Directory that holds the extra connector jars.
# NOTE(review): machine-specific absolute path — adjust for your environment.
SPARK_JARS_DIR = "/Users/alor/opt/spark/jars"
EXTRA_JAR_NAMES = [
    "spark-sql-kafka-0-10_2.12-3.3.1.jar",
    "kafka-clients-3.3.1.jar",
    "spark-avro_2.12-3.3.1.jar",
]
extra_jars = [os.path.join(SPARK_JARS_DIR, jar_name) for jar_name in EXTRA_JAR_NAMES]

# spark.jars expects a comma-separated list, whereas the extraClassPath settings
# are JVM classpath strings and therefore use the platform path separator.
jars_csv = ",".join(extra_jars)
jars_classpath = os.pathsep.join(extra_jars)

# Each option is set exactly once (spark.sql.extensions used to be configured twice).
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars", jars_csv)
    .config("spark.driver.extraClassPath", jars_classpath)
    .config("spark.executor.extraClassPath", jars_classpath)
)

# Let delta-spark extend the builder configuration, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Bare expression as the last line of the cell: the notebook renders the session object.
spark

## Create sample data

In [5]:
# Sample account transactions as (id, account, dt_transaction, balance) tuples.
account_data1 = [
    (1, "alex", "2019-01-01", 1000),
    (2, "alex", "2019-02-01", 1500),
    (3, "alex", "2019-03-01", 1700),
    (4, "maria", "2020-01-01", 5000),
]

account_data2 = [
    (1, "alex", "2019-03-01", 3300),
    (2, "peter", "2021-01-01", 100),
]

# Third batch has one additional trailing field (the "new" column).
account_data3 = [
    (1, "otto", "2019-10-01", 4444, "neue Spalte 1"),
]

# Column names for the four-column batches and for the batch with the extra column.
schema = ["id", "account", "dt_transaction", "balance"]
schema3 = [*schema, "new"]


def _build_account_df(rows, column_names):
    """Create a DataFrame from `rows`, cast dt_transaction to date, and repartition to 3."""
    frame = spark.createDataFrame(data=rows, schema=column_names)
    frame = frame.withColumn("dt_transaction", col("dt_transaction").cast("date"))
    return frame.repartition(3)


df1 = _build_account_df(account_data1, schema)
df2 = _build_account_df(account_data2, schema)
df3 = _build_account_df(account_data3, schema3)

print("++ create new dataframe and show schema and data")
print("################################################")

# Schema and rows of the first batch, followed by the rows of the wider third batch.
df1.printSchema()
df1.show(truncate=False)
df3.show(truncate=False)

++ create new dataframe and show schema and data
################################################
root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)

+---+-------+--------------+-------+
|id |account|dt_transaction|balance|
+---+-------+--------------+-------+
|2  |alex   |2019-02-01    |1500   |
|3  |alex   |2019-03-01    |1700   |
|1  |alex   |2019-01-01    |1000   |
|4  |maria  |2020-01-01    |5000   |
+---+-------+--------------+-------+

+---+-------+--------------+-------+-------------+
|id |account|dt_transaction|balance|new          |
+---+-------+--------------+-------+-------------+
|1  |otto   |2019-10-01    |4444   |neue Spalte 1|
+---+-------+--------------+-------+-------------+



## CSV

In [6]:
# Report df1's partition count, then write df1 to output/csv as CSV.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# mode="overwrite" replaces existing data at the path; use "append" to add to it instead.
write_csv = df1.write.save("output/csv", format="csv", mode="overwrite")


Number of Partitions: 3


In [7]:
# List the contents of the CSV output directory.
!ls output/csv/

_SUCCESS
part-00000-5d0e1e96-96dd-4ba5-987f-360613bc7cb6-c000.csv
part-00001-5d0e1e96-96dd-4ba5-987f-360613bc7cb6-c000.csv
part-00002-5d0e1e96-96dd-4ba5-987f-360613bc7cb6-c000.csv


In [8]:
! cat output/csv/part-00000-9414046c-9453-4865-861e-120d6f5b7468-c000.csv

cat: output/csv/part-00000-9414046c-9453-4865-861e-120d6f5b7468-c000.csv: No such file or directory


In [9]:
# Read the whole output/csv directory back as CSV and inspect schema and rows.
read_csv = spark.read.load("output/csv", format="csv")

read_csv.printSchema()
read_csv.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+-----+----------+----+
|_c0|  _c1|       _c2| _c3|
+---+-----+----------+----+
|  2| alex|2019-02-01|1500|
|  3| alex|2019-03-01|1700|
|  4|maria|2020-01-01|5000|
|  1| alex|2019-01-01|1000|
+---+-----+----------+----+



In [10]:
# Append df3 (the frame with the additional "new" column) to the existing CSV output.
csv_appender = df3.write.format("csv").mode("append")
write_csv = csv_appender.save("output/csv")

In [11]:
# List the CSV output directory again, now after the append.
!ls output/csv/

_SUCCESS
part-00000-5d0e1e96-96dd-4ba5-987f-360613bc7cb6-c000.csv
part-00000-9575e102-7c59-484a-b228-3ff0c218e96e-c000.csv
part-00001-5d0e1e96-96dd-4ba5-987f-360613bc7cb6-c000.csv
part-00002-5d0e1e96-96dd-4ba5-987f-360613bc7cb6-c000.csv
part-00002-9575e102-7c59-484a-b228-3ff0c218e96e-c000.csv


In [12]:
# Re-read the CSV directory after the append and inspect schema and rows again.
csv_reader = spark.read.format("csv")
read_csv = csv_reader.load("output/csv")

read_csv.printSchema()
read_csv.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+-----+----------+----+
|_c0|  _c1|       _c2| _c3|
+---+-----+----------+----+
|  2| alex|2019-02-01|1500|
|  3| alex|2019-03-01|1700|
|  1| otto|2019-10-01|4444|
|  4|maria|2020-01-01|5000|
|  1| alex|2019-01-01|1000|
+---+-----+----------+----+



* kein Schema (Typen)
* kein anfügen neuer Spalten

## JSON

In [13]:
# Report df1's partition count, then write df1 to output/json as JSON.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# mode="overwrite" replaces existing data at the path; use "append" to add to it instead.
write_json = df1.write.save("output/json", format="json", mode="overwrite")


Number of Partitions: 3


In [14]:
# List the contents of the JSON output directory.
!ls output/json/

_SUCCESS
part-00000-99215b3a-5830-466d-b6d4-2bb6067e9b13-c000.json
part-00001-99215b3a-5830-466d-b6d4-2bb6067e9b13-c000.json
part-00002-99215b3a-5830-466d-b6d4-2bb6067e9b13-c000.json


In [15]:
! cat output/json/part-00000-9ccd28b9-1b94-4edb-bb20-2b600c5e56d2-c000.json

cat: output/json/part-00000-9ccd28b9-1b94-4edb-bb20-2b600c5e56d2-c000.json: No such file or directory


In [16]:
# Append df3 (the frame with the additional "new" column) to the JSON output.
# The result is bound to a JSON-specific name, matching the JSON write cell.
write_json = (
    df3.write
    .format("json")
    .mode("append")
    .save("output/json")
)

In [17]:
# Read the whole output/json directory back as JSON and inspect schema and rows.
read_json = spark.read.load("output/json", format="json")

read_json.printSchema()
read_json.show()

root
 |-- account: string (nullable = true)
 |-- balance: long (nullable = true)
 |-- dt_transaction: string (nullable = true)
 |-- id: long (nullable = true)
 |-- new: string (nullable = true)

+-------+-------+--------------+---+-------------+
|account|balance|dt_transaction| id|          new|
+-------+-------+--------------+---+-------------+
|   alex|   1500|    2019-02-01|  2|         null|
|   alex|   1700|    2019-03-01|  3|         null|
|   otto|   4444|    2019-10-01|  1|neue Spalte 1|
|  maria|   5000|    2020-01-01|  4|         null|
|   alex|   1000|    2019-01-01|  1|         null|
+-------+-------+--------------+---+-------------+



* Kein Schema
* Neue Spalten werden als neues Attribut hinzugefügt

## Avro

In [30]:
# Report df1's partition count, then write df1 to output/avro as Avro.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# mode="overwrite" replaces existing data at the path; use "append" to add to it instead.
write_avro = df1.write.save("output/avro", format="avro", mode="overwrite")

Number of Partitions: 3


In [31]:
# List the contents of the Avro output directory.
!ls output/avro/

_SUCCESS
part-00000-e96ab2a8-02b2-4c6e-9060-cf1a60854dcd-c000.avro
part-00001-e96ab2a8-02b2-4c6e-9060-cf1a60854dcd-c000.avro
part-00002-e96ab2a8-02b2-4c6e-9060-cf1a60854dcd-c000.avro


In [32]:
! cat output/avro/part-00000-b6598991-fdff-423a-a94e-bd13548803e3-c000.avro

cat: output/avro/part-00000-b6598991-fdff-423a-a94e-bd13548803e3-c000.avro: No such file or directory


In [33]:
# Read the Avro output back and print its schema.
# The frame gets an Avro-specific name so it does not shadow the JSON DataFrame.
read_avro = spark.read.format("avro").load("output/avro")
read_avro.printSchema()

root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)



In [34]:
# Append df3 (the frame with the additional "new" column) to the existing Avro output.
avro_appender = df3.write.format("avro").mode("append")
write_avro = avro_appender.save("output/avro")

In [35]:
# Re-read the Avro output after the append and inspect schema and rows.
# The frame gets an Avro-specific name so it does not shadow the JSON DataFrame.
read_avro = spark.read.format("avro").load("output/avro")
read_avro.printSchema()
read_avro.show()

root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)
 |-- new: string (nullable = true)

+---+-------+--------------+-------+-------------+
| id|account|dt_transaction|balance|          new|
+---+-------+--------------+-------+-------------+
|  1|   otto|    2019-10-01|   4444|neue Spalte 1|
|  2|   alex|    2019-02-01|   1500|         null|
|  3|   alex|    2019-03-01|   1700|         null|
|  4|  maria|    2020-01-01|   5000|         null|
|  1|   alex|    2019-01-01|   1000|         null|
+---+-------+--------------+-------+-------------+



* Schema erhalten
* Schema evolution 

## Parquet

In [36]:
# Report df1's partition count, then write df1 to output/parquet as Parquet,
# partitioned on disk by the "account" column.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# mode="overwrite" replaces existing data at the path; use "append" to add to it instead.
write_parquet = df1.write.save(
    "output/parquet",
    format="parquet",
    mode="overwrite",
    partitionBy="account",
)


Number of Partitions: 3


In [37]:
# List the top level of the Parquet output, print a separator line,
# then list the contents of the account=alex subdirectory.
!ls  output/parquet/
!echo "----------------------------"
!ls output/parquet/account=alex

_SUCCESS      [34maccount=alex[m[m  [34maccount=maria[m[m
----------------------------
part-00000-a1225adc-0d32-4d93-af7c-26ae1bbd0f85.c000.snappy.parquet
part-00001-a1225adc-0d32-4d93-af7c-26ae1bbd0f85.c000.snappy.parquet


In [39]:
! cat output/parquet/account=alex/part-00001-a1225adc-0d32-4d93-af7c-26ae1bbd0f85.c000.snappy.parquet

PAR1  ,                (                 4           , �E  �E   (�E  �E     
$   �E    , �      �       (�      �         4   �      LHspark_schema %id %dt_transaction% %balance <&5 id��&<               (                    &�5 dt_transactionnr&�<�E  �E   (�E  �E        &�5 balance��&�<�      �       (�      �            � ,org.apache.spark.version3.1.2 )org.apache.spark.sql.parquet.row.metadata�{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"dt_transaction","type":"date","nullable":true,"metadata":{}},{"name":"balance","type":"long","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)<       �  PAR1

## Delta

In [56]:
# Write df1 to output/delta in Delta format.
# mode="overwrite" replaces existing data at the path; use "append" to add to it instead.
write_delta = df1.write.save("output/delta", format="delta", mode="overwrite")

In [48]:
# List the Delta table directory, print a separator, list the _delta_log
# directory, and print the log file numbered 00000000000000000000.
! ls output/delta
! echo "-------------------------------------------------------------------"
! ls output/delta/_delta_log
! cat output/delta/_delta_log/00000000000000000000.json

[34m_delta_log[m[m
part-00000-77b4d015-7026-45bf-9dcd-b32568c95027-c000.snappy.parquet
part-00001-9179f41e-35b6-4c46-8ea7-e0c799253d81-c000.snappy.parquet
part-00002-c9ae5670-0f60-4884-9a9c-042ff1b850c5-c000.snappy.parquet
-------------------------------------------------------------------
00000000000000000000.json
{"commitInfo":{"timestamp":1670096258965,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"isBlindAppend":false,"operationMetrics":{"numFiles":"3","numOutputBytes":"3527","numOutputRows":"4"}}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"27f09a67-4291-473a-ab45-c0e74212b17e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"account\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dt_transaction\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"bala

In [65]:
# Step 1: append df2 to the existing Delta table.
delta_appender = df2.write.format("delta").mode("append")
write_delta = delta_appender.save("output/delta")

# Step 2: write df3, which has the additional "new" column. mergeSchema is enabled
# for this write; the mode is "overwrite" (switch to "append" to add rows instead).
delta_schema_writer = (
    df3.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
write_delta = delta_schema_writer.save("output/delta")

In [62]:
# List the Delta table directory, print a separator, then list the _delta_log directory.
! ls output/delta
! echo "-------------------------------------------------------------------"
! ls output/delta/_delta_log
# Disabled: would print the log file numbered 00000000000000000005.
#! cat output/delta/_delta_log/00000000000000000005.json

[34m_delta_log[m[m
part-00000-0bcd01b3-9af1-486f-b943-df7dc734dee8-c000.snappy.parquet
part-00000-7753aa63-d823-4a40-a695-6ba62b86f124-c000.snappy.parquet
part-00000-77b4d015-7026-45bf-9dcd-b32568c95027-c000.snappy.parquet
part-00000-843ff904-14e8-4e05-9e03-eabdd6bf5263-c000.snappy.parquet
part-00000-ee0edf8a-0ca4-4953-8988-bd48d65a5ef3-c000.snappy.parquet
part-00000-f7177a2d-53b2-456a-97da-e2d3093f01e8-c000.snappy.parquet
part-00001-6bce0058-02b2-4a78-a34f-68a24ce5c0b2-c000.snappy.parquet
part-00001-9179f41e-35b6-4c46-8ea7-e0c799253d81-c000.snappy.parquet
part-00002-181cb71b-f5de-46eb-ac57-4838617ad4b4-c000.snappy.parquet
part-00002-19d3d12a-be62-45aa-a46d-0752c585f3da-c000.snappy.parquet
part-00002-2e758d82-3005-46f2-8004-f5d34e52f70f-c000.snappy.parquet
part-00002-66584e15-fff8-4ce7-98f0-c0c2f0d31de5-c000.snappy.parquet
part-00002-8ba77af9-f44a-42b2-8ab1-0f0c3047957c-c000.snappy.parquet
part-00002-c9ae5670-0f60-4884-9a9c-042ff1b850c5-c000.snappy.parquet
--------------------------

In [66]:
# Open the Delta table stored at output/delta.
delta_table_path = "output/delta"
deltaTable = DeltaTable.forPath(spark, delta_table_path)

# Full history of the table as a DataFrame.
fullHistoryDF = deltaTable.history()

In [67]:
# Show a subset of the history columns.
history_columns = [
    "version",
    "readVersion",
    "timestamp",
    "userId",
    "operation",
    "operationParameters",
    "operationMetrics",
    "userMetadata",
]
fullHistoryDF.select(*history_columns).show(truncate=True)

+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+
|version|readVersion|          timestamp|userId|operation| operationParameters|    operationMetrics|userMetadata|
+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+
|      7|          6|2022-12-03 20:47:09|  null|    WRITE|{mode -> Overwrit...|{numFiles -> 2, n...|        null|
|      6|          5|2022-12-03 20:47:06|  null|    WRITE|{mode -> Append, ...|{numFiles -> 2, n...|        null|
|      5|          4|2022-12-03 20:45:58|  null|    WRITE|{mode -> Append, ...|{numFiles -> 2, n...|        null|
|      4|          3|2022-12-03 20:45:56|  null|    WRITE|{mode -> Append, ...|{numFiles -> 2, n...|        null|
|      3|          2|2022-12-03 20:45:10|  null|    WRITE|{mode -> Append, ...|{numFiles -> 2, n...|        null|
|      2|          1|2022-12-03 20:44:59|  null|    WRITE|{mode -> Overwrit...|{numFiles

In [54]:
# Time travel: load the Delta table as of version 0 and show its rows.
spark.read.load("output/delta", format="delta", versionAsOf=0).show()


+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  2|   alex|    2019-02-01|   1500|
|  3|   alex|    2019-03-01|   1700|
|  4|  maria|    2020-01-01|   5000|
|  1|   alex|    2019-01-01|   1000|
+---+-------+--------------+-------+



In [69]:
# Time travel: load the Delta table as of version 6 and show its rows.
delta_reader_v6 = spark.read.format("delta").option("versionAsOf", 6)
delta_reader_v6.load("output/delta").show()


+---+-------+--------------+-------+-------------+
| id|account|dt_transaction|balance|          new|
+---+-------+--------------+-------+-------------+
|  1|   otto|    2019-10-01|   4444|neue Spalte 1|
|  2|   alex|    2019-02-01|   1500|         null|
|  3|   alex|    2019-03-01|   1700|         null|
|  2|  peter|    2021-01-01|    100|         null|
|  2|  peter|    2021-01-01|    100|         null|
|  4|  maria|    2020-01-01|   5000|         null|
|  2|  peter|    2021-01-01|    100|         null|
|  1|   alex|    2019-01-01|   1000|         null|
|  1|   alex|    2019-03-01|   3300|         null|
|  1|   alex|    2019-03-01|   3300|         null|
|  1|   alex|    2019-03-01|   3300|         null|
+---+-------+--------------+-------+-------------+



* Schema
* Schema evolution
* Transaction Log
* Time Travel

In [None]:
# Stop the Spark session at the end of the demo.
spark.stop()