# Demo and Comparission of Big Data File Formats
This Notebook runs only on a local Spark Environment, not on Kubernetes

## 1. CSV and JSON
Old dat formats that are not designed for big data and scaling  
**Typical feature:** humand readable

## 2. Avro, OCR, Parquet
First generation of special big data formats that allow fast writes, fast reads or both  
**Typical features:** splittable, compressible, data skipping and predicat pushdown, data schema inclueded



## 3. Delta, Iceberg, Hudi
Latest generation of big data format that support ACID transaction, audit save transaction logs and time travel  
**Typical features:** enhancing first generation format with additonal meta data and read/write procedures.

In [1]:
#################################################################################
# Laden aller relevate Module
#################################################################################
import pyspark
from pyspark.sql.functions import *
import json
import csv
from datetime import datetime
from delta import *
import delta

# use 95% of the screen for jupyter cell
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important; }<style>"))

In [2]:
# first for local usage pip install delta-spark

builder = pyspark.sql.SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars", "/Users/alor/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /Users/alor/opt/spark/jars/kafka-clients-3.3.1.jar, /Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar") \
    .config("spark.driver.extraClassPath","/Users/alor/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /Users/alor/opt/spark/jars/kafka-clients-3.3.1.jar, /Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar") \
    .config("spark.executor.extraClassPath","/Users/alor/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /Users/alor/opt/spark/jars/kafka-clients-3.3.1.jar, /Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar")


spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
spark

## Create sample data

In [3]:
# initial daten
account_data1 = [
    (1,"alex","2019-01-01",1000),
    (2,"alex","2019-02-01",1500),
    (3,"alex","2019-03-01",1700),
    (4,"maria","2020-01-01",5000)
    ]

# update mit Änderung und neuem Datensat
account_data2 = [
    (1,"alex","2019-03-01",3300),
    (2,"peter","2021-01-01",100)
    ]

# Update mit neuer Spalte
account_data3 = [
    (1,"otto","2019-10-01",4444,"neue Spalte 1")
]

schema = ["id","account","dt_transaction","balance"]
schema3 = ["id","account","dt_transaction","balance","new"]

df1 = spark.createDataFrame(data=account_data1, schema = schema).withColumn("dt_transaction",col("dt_transaction").cast("date")).repartition(3)
df2 = spark.createDataFrame(data=account_data2, schema = schema).withColumn("dt_transaction",col("dt_transaction").cast("date")).repartition(3)
df3 = spark.createDataFrame(data=account_data3, schema = schema3).withColumn("dt_transaction",col("dt_transaction").cast("date")).repartition(3)

print("++ create new dataframe and show schema and data")
print("################################################")

df1.printSchema()
df1.show(truncate=False)
df3.show(truncate=False)

++ create new dataframe and show schema and data
################################################
root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)

+---+-------+--------------+-------+
|id |account|dt_transaction|balance|
+---+-------+--------------+-------+
|2  |alex   |2019-02-01    |1500   |
|3  |alex   |2019-03-01    |1700   |
|1  |alex   |2019-01-01    |1000   |
|4  |maria  |2020-01-01    |5000   |
+---+-------+--------------+-------+

+---+-------+--------------+-------+-------------+
|id |account|dt_transaction|balance|new          |
+---+-------+--------------+-------+-------------+
|1  |otto   |2019-10-01    |4444   |neue Spalte 1|
+---+-------+--------------+-------+-------------+



## CSV

In [4]:
print("Number of Partitions:", df1.rdd.getNumPartitions())

write_csv=(df1
           .write
           .format("csv")
           .mode("overwrite") # append
           .save("output/csv")
          )


Number of Partitions: 3


In [5]:
!ls output/csv/

_SUCCESS
part-00000-4f004dc2-afa5-4612-b20a-e077538cd467-c000.csv
part-00001-4f004dc2-afa5-4612-b20a-e077538cd467-c000.csv
part-00002-4f004dc2-afa5-4612-b20a-e077538cd467-c000.csv


In [6]:
! cat output/csv/part-00000-4f004dc2-afa5-4612-b20a-e077538cd467-c000.csv

2,alex,2019-02-01,1500
3,alex,2019-03-01,1700


In [7]:
read_csv=spark.read.format("csv").load("output/csv")

read_csv.printSchema()
read_csv.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+-----+----------+----+
|_c0|  _c1|       _c2| _c3|
+---+-----+----------+----+
|  2| alex|2019-02-01|1500|
|  3| alex|2019-03-01|1700|
|  4|maria|2020-01-01|5000|
|  1| alex|2019-01-01|1000|
+---+-----+----------+----+



In [8]:
write_csv=(df3
           .write
           .format("csv")
           .mode("append") # append
           .save("output/csv")
          )

In [9]:
!ls output/csv/

_SUCCESS
part-00000-4f004dc2-afa5-4612-b20a-e077538cd467-c000.csv
part-00000-55792bd8-13d5-468e-93ce-4b9dc132e3c3-c000.csv
part-00001-4f004dc2-afa5-4612-b20a-e077538cd467-c000.csv
part-00002-4f004dc2-afa5-4612-b20a-e077538cd467-c000.csv
part-00002-55792bd8-13d5-468e-93ce-4b9dc132e3c3-c000.csv


In [11]:
!cat output/csv/part-00002-55792bd8-13d5-468e-93ce-4b9dc132e3c3-c000.csv

1,otto,2019-10-01,4444,neue Spalte 1


In [12]:
read_csv=spark.read.format("csv").load("output/csv")

read_csv.printSchema()
read_csv.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+-----+----------+----+
|_c0|  _c1|       _c2| _c3|
+---+-----+----------+----+
|  2| alex|2019-02-01|1500|
|  3| alex|2019-03-01|1700|
|  1| otto|2019-10-01|4444|
|  4|maria|2020-01-01|5000|
|  1| alex|2019-01-01|1000|
+---+-----+----------+----+



* kein Schema (Typen)
* kein anfügen neuer Spalten

## JSON

In [13]:
print("Number of Partitions:", df1.rdd.getNumPartitions())

write_json=(df1
           .write
           .format("json")
           .mode("overwrite") # append
           .save("output/json")
          )


Number of Partitions: 3


In [14]:
!ls output/json/

_SUCCESS
part-00000-ff1a03b2-b55f-4e7c-940f-c664cdfcd644-c000.json
part-00001-ff1a03b2-b55f-4e7c-940f-c664cdfcd644-c000.json
part-00002-ff1a03b2-b55f-4e7c-940f-c664cdfcd644-c000.json


In [15]:
! cat output/json/part-00000-ff1a03b2-b55f-4e7c-940f-c664cdfcd644-c000.json

{"id":2,"account":"alex","dt_transaction":"2019-02-01","balance":1500}
{"id":3,"account":"alex","dt_transaction":"2019-03-01","balance":1700}


In [17]:
write_csv=(df3
           .write
           .format("json")
           .mode("append") # append
           .save("output/json")
          )

In [18]:
read_json=spark.read.format("json").load("output/json")

read_json.printSchema()
read_json.show()

root
 |-- account: string (nullable = true)
 |-- balance: long (nullable = true)
 |-- dt_transaction: string (nullable = true)
 |-- id: long (nullable = true)
 |-- new: string (nullable = true)

+-------+-------+--------------+---+-------------+
|account|balance|dt_transaction| id|          new|
+-------+-------+--------------+---+-------------+
|   alex|   1500|    2019-02-01|  2|         null|
|   alex|   1700|    2019-03-01|  3|         null|
|   otto|   4444|    2019-10-01|  1|neue Spalte 1|
|  maria|   5000|    2020-01-01|  4|         null|
|   alex|   1000|    2019-01-01|  1|         null|
+-------+-------+--------------+---+-------------+



* Kein Schema
* Neue Spalten werden als neues Attribut hinzugefügt

## Avro

In [19]:
print("Number of Partitions:", df1.rdd.getNumPartitions())

write_avro=(df1
           .write
           .format("avro")
           .mode("overwrite") # append
           .save("output/avro")
          )

Number of Partitions: 3


In [20]:
!ls output/avro/

_SUCCESS
part-00000-3b3b12df-ccb3-42fc-8157-ac9b5431fb2b-c000.avro
part-00001-3b3b12df-ccb3-42fc-8157-ac9b5431fb2b-c000.avro
part-00002-3b3b12df-ccb3-42fc-8157-ac9b5431fb2b-c000.avro


In [21]:
! cat output/avro/part-00000-3b3b12df-ccb3-42fc-8157-ac9b5431fb2b-c000.avro

Objavro.schema�{"type":"record","name":"topLevelRecord","fields":[{"name":"id","type":["long","null"]},{"name":"account","type":["string","null"]},{"name":"dt_transaction","type":[{"type":"int","logicalType":"date"},"null"]},{"name":"balance","type":["long","null"]}]}0org.apache.spark.version
3.1.2avro.codec
snappy D0
�
.\@6-@!�&%H
t  alex �� �  alex Ș �3���D0
�
.\@6-@!�&%

In [22]:
read_json=spark.read.format("avro").load("output/avro")
read_json.printSchema()

root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)



In [23]:
write_avro=(df3
           .write
           .format("avro")
           .mode("append") # append
           .save("output/avro")
          )

In [24]:
read_json=spark.read.format("avro").load("output/avro")
read_json.printSchema()
read_json.show()

root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)
 |-- new: string (nullable = true)

+---+-------+--------------+-------+-------------+
| id|account|dt_transaction|balance|          new|
+---+-------+--------------+-------+-------------+
|  1|   otto|    2019-10-01|   4444|neue Spalte 1|
|  2|   alex|    2019-02-01|   1500|         null|
|  3|   alex|    2019-03-01|   1700|         null|
|  4|  maria|    2020-01-01|   5000|         null|
|  1|   alex|    2019-01-01|   1000|         null|
+---+-------+--------------+-------+-------------+



* Schema erhalten
* Schema evolution 

## Parquet

In [25]:
print("Number of Partitions:", df1.rdd.getNumPartitions())

write_parquet=(df1
           .write
           .partitionBy("account")
           .format("parquet")
           .mode("overwrite") # append
           .save("output/parquet")
          )


Number of Partitions: 3


In [26]:
df1.show()

+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  2|   alex|    2019-02-01|   1500|
|  3|   alex|    2019-03-01|   1700|
|  1|   alex|    2019-01-01|   1000|
|  4|  maria|    2020-01-01|   5000|
+---+-------+--------------+-------+



In [29]:
!ls  output/parquet/
!echo "----------------------------"
!ls output/parquet/account=alex

_SUCCESS      [34maccount=alex[m[m  [34maccount=maria[m[m
----------------------------
part-00000-f4019e07-a8c8-499e-a98d-eb75706f3b0f.c000.snappy.parquet
part-00001-f4019e07-a8c8-499e-a98d-eb75706f3b0f.c000.snappy.parquet


In [30]:
! cat output/parquet/account=alex/part-00000-f4019e07-a8c8-499e-a98d-eb75706f3b0f.c000.snappy.parquet

PAR1 ,., 
               (                    ,            
 , 
$F  F   ($F  F     4   F  $F   ,0, 
�      �       (�      �         T   �      �      LH
spark_schema %id %dt_transaction%
 %balance 
<&
5 id��&<               (               
     &�
5 dt_transactionvz&�<$F  F   ($F  F   
     &�
5 balance��&�<�      �       (�      �       
     � ,org.apache.spark.version3.1.2 )org.apache.spark.sql.parquet.row.metadata�{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"dt_transaction","type":"date","nullable":true,"metadata":{}},{"name":"balance","type":"long","nullable":true,"metadata":{}}]} Jparquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)<
  
  
   �  PAR1

In [33]:
read_parquet=spark.read.format("parquet").load("output/parquet/").filter(col("account")=="alex").show()

+---+--------------+-------+-------+
| id|dt_transaction|balance|account|
+---+--------------+-------+-------+
|  2|    2019-02-01|   1500|   alex|
|  3|    2019-03-01|   1700|   alex|
|  1|    2019-01-01|   1000|   alex|
+---+--------------+-------+-------+



## Delta

In [65]:
write_delta=(df1
           .write
           .format("delta")
           .option("mergeSchema", "true")
           .mode("overwrite") # append
           .save("output/delta")
          )

In [48]:
! ls output/delta
! echo "-------------------------------------------------------------------"
! ls output/delta/_delta_log
! cat output/delta/_delta_log/00000000000000000002.json

[34m_delta_log[m[m
part-00000-63ddc6e6-20a0-4bce-948f-33896396b51f-c000.snappy.parquet
part-00000-8587accb-32ea-493b-9407-4e9cbc1ca156-c000.snappy.parquet
part-00000-8888c4f1-214a-4356-a253-57cfc4886df8-c000.snappy.parquet
part-00001-4996de49-d113-4690-a72c-ad42da85ebda-c000.snappy.parquet
part-00002-0e0a1bc0-c7cd-48e0-811d-b2724c02f387-c000.snappy.parquet
part-00002-3e99d576-e3a1-4627-bdcd-3fa0dc6b425b-c000.snappy.parquet
part-00002-78a5675b-06af-41b0-a31c-2380898cdad4-c000.snappy.parquet
-------------------------------------------------------------------
00000000000000000000.json 00000000000000000001.json 00000000000000000002.json
{"commitInfo":{"timestamp":1670243456445,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":1,"isBlindAppend":false,"operationMetrics":{"numFiles":"2","numOutputBytes":"2077","numOutputRows":"1"}}}
{"metaData":{"id":"e4afc96f-1279-482c-906d-ad93c40b2296","format":{"provider":"parquet","options":{}},"schemaStri

In [40]:
write_delta=(df2
           .write
           .format("delta")
           .mode("append") # append
           .save("output/delta")
          )



In [44]:
write_delta=(df3
           .write
           .format("delta")
           .option("mergeSchema", "true")
           .mode("overwrite") # append
           .save("output/delta")
          )

In [None]:
! ls output/delta
! echo "-------------------------------------------------------------------"
! ls output/delta/_delta_log
#! cat output/delta/_delta_log/00000000000000000005.json

In [49]:
deltaTable = DeltaTable.forPath(spark, "output/delta")

fullHistoryDF = deltaTable.history()    # get the full history of the table

In [50]:
fullHistoryDF.select("version","readVersion","timestamp","userId","operation","operationParameters","operationMetrics","userMetadata").show(truncate=True)

+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+
|version|readVersion|          timestamp|userId|operation| operationParameters|    operationMetrics|userMetadata|
+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+
|      2|          1|2022-12-05 13:30:56|  null|    WRITE|{mode -> Overwrit...|{numFiles -> 2, n...|        null|
|      1|          0|2022-12-05 13:29:29|  null|    WRITE|{mode -> Append, ...|{numFiles -> 2, n...|        null|
|      0|       null|2022-12-05 13:27:41|  null|    WRITE|{mode -> Overwrit...|{numFiles -> 3, n...|        null|
+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+



In [58]:
read_parquet=spark.read.format("delta").load("output/delta/").show()

+---+-------+--------------+-------+----+
| id|account|dt_transaction|balance| new|
+---+-------+--------------+-------+----+
|  2|   alex|    2019-02-01|   1500|null|
|  3|   alex|    2019-03-01|   1700|null|
|  4|  maria|    2020-01-01|   5000|null|
|  1|   alex|    2019-01-01|   1000|null|
+---+-------+--------------+-------+----+



## Time travel

In [55]:
spark.read.format("delta").option("versuinAsOf", "0").load("output/delta").show()


+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  2|   alex|    2019-02-01|   1500|
|  3|   alex|    2019-03-01|   1700|
|  4|  maria|    2020-01-01|   5000|
|  2|  peter|    2021-01-01|    100|
|  1|   alex|    2019-01-01|   1000|
|  1|   alex|    2019-03-01|   3300|
+---+-------+--------------+-------+



## Merge

In [66]:
deltaTable2 = DeltaTable.forPath(spark, "output/delta")

df2.show()
deltaTable2.toDF().show()

+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  1|   alex|    2019-03-01|   3300|
|  2|  peter|    2021-01-01|    100|
+---+-------+--------------+-------+

+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  2|   alex|    2019-02-01|   1500|
|  3|   alex|    2019-03-01|   1700|
|  4|  maria|    2020-01-01|   5000|
|  1|   alex|    2019-01-01|   1000|
+---+-------+--------------+-------+



In [67]:
dt3=(deltaTable2.alias("oldData")
      .merge(df2.alias("newData"),
            "oldData.account = newData.account AND oldData.dt_transaction = newData.dt_transaction")
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
      .execute()
    )

deltaTable2.toDF().show()

+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  2|  peter|    2021-01-01|    100|
|  4|  maria|    2020-01-01|   5000|
|  1|   alex|    2019-01-01|   1000|
|  1|   alex|    2019-03-01|   3300|
|  2|   alex|    2019-02-01|   1500|
+---+-------+--------------+-------+



In [73]:
result=(deltaTable2
        .toDF()
        .withColumn("month",month(col("dt_transaction")))
        .groupBy("account","month").agg(sum("balance"))
        .sort("account","month")
       )
result.show()

+-------+-----+------------+
|account|month|sum(balance)|
+-------+-----+------------+
|   alex|    1|        1000|
|   alex|    2|        1500|
|   alex|    3|        3300|
|  maria|    1|        5000|
|  peter|    1|         100|
+-------+-----+------------+



In [75]:
result=(spark.read
        .format("delta")
        .option("versionAsOf", "1")
        .load("output/delta")
        .withColumn("month",month(col("dt_transaction")))
        .groupBy("account","month").agg(sum("balance"))
        .sort("account","month")
       )
result.show()

+-------+-----+------------+
|account|month|sum(balance)|
+-------+-----+------------+
|   alex|    1|        1000|
|   alex|    2|        1500|
|   alex|    3|        3300|
|  maria|    1|        5000|
|  peter|    1|         100|
+-------+-----+------------+



* Schema
* Schema evolution
* Transaction Log
* Time Travel

In [None]:
spark.stop()