# Demo and Comparison of Big Data File Formats
This Notebook runs only on a local Spark Environment, not on Kubernetes

## 1. CSV and JSON
Old data formats that were not designed for big data and scaling  
**Typical feature:** human-readable

## 2. Avro, ORC, Parquet
First generation of dedicated big data formats that allow fast writes, fast reads, or both  
**Typical features:** splittable, compressible, data skipping and predicate pushdown, data schema included



## 3. Delta, Iceberg, Hudi
Latest generation of big data formats that support ACID transactions, audit-safe transaction logs, and time travel  
**Typical features:** enhance the first-generation formats with additional metadata and read/write procedures.

In [1]:
#################################################################################
# Load all required modules
#################################################################################
import pyspark
# NOTE: later cells use col, lit, month and sum unqualified; they rely on this
# star import (so `sum` in those cells is the Spark aggregate, not the built-in).
from pyspark.sql.functions import *
import json
import csv
from datetime import datetime
# Later cells use DeltaTable and configure_spark_with_delta_pip unqualified.
from delta import *
import delta

# Use the full browser width for the notebook cells.
# `display` and `HTML` are imported from IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# The style element must be terminated with </style>, otherwise the browser
# treats the rest of the output as CSS.
display(HTML("<style>.container {width:100% !important; }</style>"))

In [2]:
# Prerequisite for local usage: pip install delta-spark

# Location of the spark-avro jar. It is needed in spark.jars and on both the
# driver and executor class paths, so it is defined once here.
AVRO_JAR = "/Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar"

# Every config key must be set only once: setting the same key again replaces
# the earlier value. Previously the Iceberg extension/catalog entries overwrote
# the Delta ones, although this notebook only demonstrates Delta and no Iceberg
# jar is configured. To add Iceberg later, append its extension class to
# spark.sql.extensions (comma-separated), register its catalog under a separate
# catalog name and put the Iceberg runtime jar on the class path.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars", AVRO_JAR)
    .config("spark.driver.extraClassPath", AVRO_JAR)
    .config("spark.executor.extraClassPath", AVRO_JAR)
)

# configure_spark_with_delta_pip comes from the delta-spark pip package and
# returns a builder from which the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Display the SparkSession object created in the previous cell.
spark

## Create sample data

In [4]:
# Initial data set
account_data1 = [
    (1, "alex", "2019-01-01", 1000),
    (2, "alex", "2019-02-01", 1500),
    (3, "alex", "2019-03-01", 1700),
    (4, "maria", "2020-01-01", 5000),
]

# Update containing one changed record and one new record
account_data2 = [
    (1, "alex", "2019-03-01", 3300),
    (2, "peter", "2021-01-01", 100),
]

# Update that carries an additional column
account_data3 = [
    (1, "otto", "2019-10-01", 4444, "neue Spalte 1"),
]

# Column names for the four-column and the five-column variants
schema = ["id", "account", "dt_transaction", "balance"]
schema3 = ["id", "account", "dt_transaction", "balance", "new"]


def _accounts_frame(rows, columns):
    """Build a DataFrame from `rows`, cast dt_transaction to date and spread it over 3 partitions."""
    frame = spark.createDataFrame(data=rows, schema=columns)
    frame = frame.withColumn("dt_transaction", col("dt_transaction").cast("date"))
    return frame.repartition(3)


df1 = _accounts_frame(account_data1, schema)
df2 = _accounts_frame(account_data2, schema)
df3 = _accounts_frame(account_data3, schema3)

print("++ create new dataframe and show schema and data")
print("################################################")

# Show the schema and rows of the initial data, then the rows with the extra column.
df1.printSchema()
df1.show(truncate=False)
df3.show(truncate=False)

++ create new dataframe and show schema and data
################################################
root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)

+---+-------+--------------+-------+
|id |account|dt_transaction|balance|
+---+-------+--------------+-------+
|2  |alex   |2019-02-01    |1500   |
|3  |alex   |2019-03-01    |1700   |
|1  |alex   |2019-01-01    |1000   |
|4  |maria  |2020-01-01    |5000   |
+---+-------+--------------+-------+

+---+-------+--------------+-------+-------------+
|id |account|dt_transaction|balance|new          |
+---+-------+--------------+-------+-------------+
|1  |otto   |2019-10-01    |4444   |neue Spalte 1|
+---+-------+--------------+-------+-------------+



## CSV

In [5]:
# df1 was repartitioned into 3 partitions when it was created.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# Write df1 as CSV, replacing any earlier output
# (mode "append" would add files to the directory instead).
write_csv = df1.write.save("output/csv", format="csv", mode="overwrite")


Number of Partitions: 3


In [6]:
# List the files written to the CSV output directory.
!ls output/csv/

_SUCCESS
part-00000-a4b2c68c-b0ea-462c-b90b-f1df7ae7432f-c000.csv
part-00001-a4b2c68c-b0ea-462c-b90b-f1df7ae7432f-c000.csv
part-00002-a4b2c68c-b0ea-462c-b90b-f1df7ae7432f-c000.csv


In [None]:
! cat output/csv/part-00002-ec2256a8-ce7d-4629-b968-78fd32cc337b-c000.csv

In [7]:
# Read the whole CSV directory back with default options and inspect what
# schema and rows come back.
read_csv = spark.read.load("output/csv", format="csv")

read_csv.printSchema()
read_csv.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+-----+----------+----+
|_c0|  _c1|       _c2| _c3|
+---+-----+----------+----+
|  2| alex|2019-02-01|1500|
|  3| alex|2019-03-01|1700|
|  4|maria|2020-01-01|5000|
|  1| alex|2019-01-01|1000|
+---+-----+----------+----+



In [8]:
# Append df3 (which has the additional column "new") to the existing CSV output.
write_csv = df3.write.save("output/csv", format="csv", mode="append")

In [9]:
# List the CSV output directory again after the append.
!ls output/csv/

_SUCCESS
part-00000-1a7710fb-4aad-466f-824f-e01e557c45a1-c000.csv
part-00000-a4b2c68c-b0ea-462c-b90b-f1df7ae7432f-c000.csv
part-00001-a4b2c68c-b0ea-462c-b90b-f1df7ae7432f-c000.csv
part-00002-1a7710fb-4aad-466f-824f-e01e557c45a1-c000.csv
part-00002-a4b2c68c-b0ea-462c-b90b-f1df7ae7432f-c000.csv


In [None]:
!cat output/csv/part-00002-ec2256a8-ce7d-4629-b968-78fd32cc337b-c000.csv

In [10]:
# Read the CSV directory again, now containing the files of both writes.
csv_reader = spark.read.format("csv")
read_csv = csv_reader.load("output/csv")

read_csv.printSchema()
read_csv.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)

+---+-----+----------+----+
|_c0|  _c1|       _c2| _c3|
+---+-----+----------+----+
|  2| alex|2019-02-01|1500|
|  3| alex|2019-03-01|1700|
|  1| otto|2019-10-01|4444|
|  4|maria|2020-01-01|5000|
|  1| alex|2019-01-01|1000|
+---+-----+----------+----+



* No schema (column types are lost; every column is read back as a string)
* New columns cannot be appended

## JSON

In [11]:
# df1 was repartitioned into 3 partitions when it was created.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# Write df1 as JSON, replacing any earlier output
# (mode "append" would add files to the directory instead).
write_json = df1.write.save("output/json", format="json", mode="overwrite")


Number of Partitions: 3


In [12]:
# List the files written to the JSON output directory.
!ls output/json/

_SUCCESS
part-00000-1fbeb738-5e9f-4030-83fd-7b23803cb6fc-c000.json
part-00001-1fbeb738-5e9f-4030-83fd-7b23803cb6fc-c000.json
part-00002-1fbeb738-5e9f-4030-83fd-7b23803cb6fc-c000.json


In [None]:
! cat output/json/part-00000-980c6e26-7124-40ba-816d-f8a76e82a796-c000.json

In [13]:
# Append df3 (which has the additional column "new") to the existing JSON output.
# The result is bound to write_json (not write_csv, a copy-paste leftover) so the
# name matches the format written here.
write_json = (
    df3.write
    .format("json")
    .mode("append")
    .save("output/json")
)

In [14]:
# Read the JSON directory (files of both writes) and inspect schema and rows.
read_json = spark.read.load("output/json", format="json")

read_json.printSchema()
read_json.show()

root
 |-- account: string (nullable = true)
 |-- balance: long (nullable = true)
 |-- dt_transaction: string (nullable = true)
 |-- id: long (nullable = true)
 |-- new: string (nullable = true)

+-------+-------+--------------+---+-------------+
|account|balance|dt_transaction| id|          new|
+-------+-------+--------------+---+-------------+
|   alex|   1500|    2019-02-01|  2|         null|
|   alex|   1700|    2019-03-01|  3|         null|
|   otto|   4444|    2019-10-01|  1|neue Spalte 1|
|  maria|   5000|    2020-01-01|  4|         null|
|   alex|   1000|    2019-01-01|  1|         null|
+-------+-------+--------------+---+-------------+



* No schema (types are inferred on read; `dt_transaction` comes back as a string)
* New columns are added as a new attribute

## Avro

In [15]:
# df1 was repartitioned into 3 partitions when it was created.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# Write df1 as Avro, replacing any earlier output
# (mode "append" would add files to the directory instead).
write_avro = df1.write.save("output/avro", format="avro", mode="overwrite")

Number of Partitions: 3


In [16]:
# List the files written to the Avro output directory.
!ls output/avro/

_SUCCESS
part-00000-ef4ebba4-291b-47ad-a18b-b9bfe58a3356-c000.avro
part-00001-ef4ebba4-291b-47ad-a18b-b9bfe58a3356-c000.avro
part-00002-ef4ebba4-291b-47ad-a18b-b9bfe58a3356-c000.avro


In [None]:
! cat output/avro/part-00000-5d8db30e-9134-4166-a9d3-f562c0ee0a46-c000.avro

In [18]:
# Read the Avro directory back and inspect schema and rows. The result gets its
# own name (read_avro) so it no longer overwrites the JSON DataFrame read_json.
read_avro = spark.read.format("avro").load("output/avro")
read_avro.printSchema()
read_avro.show()

root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)

+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  2|   alex|    2019-02-01|   1500|
|  3|   alex|    2019-03-01|   1700|
|  4|  maria|    2020-01-01|   5000|
|  1|   alex|    2019-01-01|   1000|
+---+-------+--------------+-------+



In [19]:
# Append df3 (which has the additional column "new") to the existing Avro output.
write_avro = df3.write.save("output/avro", format="avro", mode="append")

In [20]:
# Read the Avro directory again after the append. The result gets its own name
# (read_avro) so it no longer overwrites the JSON DataFrame read_json.
read_avro = spark.read.format("avro").load("output/avro")
read_avro.printSchema()
read_avro.show()

root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)

+---+-------+--------------+-------+
| id|account|dt_transaction|balance|
+---+-------+--------------+-------+
|  1|   otto|    2019-10-01|   4444|
|  2|   alex|    2019-02-01|   1500|
|  3|   alex|    2019-03-01|   1700|
|  4|  maria|    2020-01-01|   5000|
|  1|   alex|    2019-01-01|   1000|
+---+-------+--------------+-------+



* Schema is preserved
* Schema evolution (note: the appended `new` column is not part of the schema read above)

## Parquet

In [21]:
# df1 was repartitioned into 3 partitions when it was created.
print("Number of Partitions:", df1.rdd.getNumPartitions())

# Write df1 as Parquet, partitioned on disk by the account column and replacing
# any earlier output (mode "append" would add files instead).
write_parquet = df1.write.save(
    "output/parquet",
    format="parquet",
    mode="overwrite",
    partitionBy="account",
)


Number of Partitions: 3


In [22]:
# Top level of the Parquet output: one account=<value> directory per account.
!ls  output/parquet/
!echo "----------------------------"
# Data files inside the account=alex partition directory.
!ls output/parquet/account=alex

_SUCCESS      [34maccount=alex[m[m  [34maccount=maria[m[m
----------------------------
part-00000-885ea86e-bdd8-41fc-9498-1cf9099c438c.c000.snappy.parquet
part-00001-885ea86e-bdd8-41fc-9498-1cf9099c438c.c000.snappy.parquet


In [None]:
! cat output/parquet/account=alex/part-00000-9ce7ae8e-705a-4f93-acc8-d8cca35d5945.c000.snappy.parquet

In [23]:
# Read the partitioned Parquet output and keep only the rows of account "alex".
parquet_all = spark.read.load("output/parquet/", format="parquet")
read_parquet = parquet_all.where(col("account") == "alex")

read_parquet.printSchema()
read_parquet.show()

root
 |-- id: long (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)
 |-- account: string (nullable = true)

+---+--------------+-------+-------+
| id|dt_transaction|balance|account|
+---+--------------+-------+-------+
|  2|    2019-02-01|   1500|   alex|
|  3|    2019-03-01|   1700|   alex|
|  1|    2019-01-01|   1000|   alex|
+---+--------------+-------+-------+



## Delta

In [28]:
# clean old stuff
! rm -r output/delta
! ls -l output

rm: output/delta: No such file or directory
total 0
drwxr-xr-x  14 alor  staff  448 Mar 13 13:09 [34mavro[m[m
drwxr-xr-x  14 alor  staff  448 Mar 13 13:09 [34mcsv[m[m
drwxr-xr-x  14 alor  staff  448 Mar 13 13:09 [34mjson[m[m
drwxr-xr-x   6 alor  staff  192 Mar 13 13:10 [34mparquet[m[m


In [None]:
# Write df1 as a Delta table, replacing earlier content; mergeSchema is
# explicitly switched off for this first write.
write_delta = df1.write.save(
    "output/delta",
    format="delta",
    mode="overwrite",
    mergeSchema="false",
)

In [None]:
# Show the Delta table directory, then the content of its _delta_log folder.
! ls output/delta
! echo "-------------------------------------------------------------------"
! ls output/delta/_delta_log
# Print the first log entry (file 00000000000000000000.json).
! cat output/delta/_delta_log/00000000000000000000.json

In [None]:
# Append df2 (same four columns as df1) to the Delta table.
write_delta = df2.write.save("output/delta", format="delta", mode="append")



In [None]:
# Write df3, which carries the additional column "new". The mode is "overwrite"
# (not append), and mergeSchema is switched on for this write -- presumably so
# that the extra column is accepted into the table schema.
write_delta = df3.write.save(
    "output/delta",
    format="delta",
    mode="overwrite",
    mergeSchema="true",
)

In [None]:
# Show the Delta table directory, then the content of its _delta_log folder.
! ls output/delta
! echo "-------------------------------------------------------------------"
! ls output/delta/_delta_log
# Print the log entry with number 2 (file 00000000000000000002.json).
! cat output/delta/_delta_log/00000000000000000002.json

In [None]:
# Open the Delta table at output/delta through the DeltaTable API.
deltaTable = DeltaTable.forPath(spark, "output/delta")

fullHistoryDF = deltaTable.history()    # get the full history of the table

In [None]:
# Columns of the table history that are shown in the demo.
history_columns = [
    "version",
    "readVersion",
    "timestamp",
    "userId",
    "operation",
    "operationParameters",
    "operationMetrics",
    "userMetadata",
]
fullHistoryDF.select(*history_columns).show(truncate=True)

In [None]:
# Read the current state of the Delta table and display it.
# The DataFrame is kept under its own name: previously the return value of
# .show() (None) was assigned to read_parquet, which both discarded the Delta
# DataFrame and overwrote the Parquet DataFrame from the Parquet section.
read_delta = spark.read.format("delta").load("output/delta/")
read_delta.show()

## Time travel

In [None]:
# Time travel: read the table as of version 1 instead of the latest version.
version_1 = spark.read.load("output/delta", format="delta", versionAsOf="1")
version_1.show()


## Merge

In [None]:
# Handle on the Delta table that the merge below operates on.
deltaTable2 = DeltaTable.forPath(spark, "output/delta")

# df2 extended by a constant column "new" so that it has the same columns as df3.
df2a = df2.select("*", lit("test").alias("new"))

# Show the merge input first, then the current table content.
df2a.show()
deltaTable2.toDF().show()

In [None]:
# Rows are matched on account and transaction date.
merge_condition = "oldData.account = newData.account AND oldData.dt_transaction = newData.dt_transaction"

# Upsert: update every column of matching rows, insert rows without a match.
merge_builder = deltaTable2.alias("oldData").merge(df2a.alias("newData"), merge_condition)
merge_builder = merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll()
dt3 = merge_builder.execute()

# Table content after the merge.
deltaTable2.toDF().show()

In [None]:
# Sum of balance per account and month on the current table version.
current_rows = deltaTable2.toDF()
current_with_month = current_rows.withColumn("month", month(col("dt_transaction")))
result = (
    current_with_month
    .groupBy("account", "month")
    .agg(sum("balance"))
    .orderBy("account", "month")
)
result.show()

In [None]:
# Same aggregation (sum of balance per account and month), but on table version 1.
version_1_rows = spark.read.load("output/delta", format="delta", versionAsOf="1")
version_1_with_month = version_1_rows.withColumn("month", month(col("dt_transaction")))
result = (
    version_1_with_month
    .groupBy("account", "month")
    .agg(sum("balance"))
    .orderBy("account", "month")
)
result.show()

* Schema
* Schema evolution
* Transaction Log
* Time Travel

In [None]:
# Stop the Spark session at the end of the demo.
spark.stop()