# Demo and Comparission of Big Data File Formats
This Notebook runs only on a local Spark Environment, not on Kubernetes

## 1. CSV and JSON
Old dat formats that are not designed for big data and scaling  
**Typical feature:** humand readable

## 2. Avro, OCR, Parquet
First generation of special big data formats that allow fast writes, fast reads or both  
**Typical features:** splittable, compressible, data skipping and predicat pushdown, data schema inclueded



## 3. Delta, Iceberg, Hudi
Latest generation of big data format that support ACID transaction, audit save transaction logs and time travel  
**Typical features:** enhancing first generation format with additonal meta data and read/write procedures.

In [21]:
#################################################################################
# Load all relevant Python Modules
#################################################################################
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession

import pyspark.sql.functions as f
from pyspark.sql.functions import *
import json
import csv
from datetime import datetime
from delta import *
#import delta

# use 95% of the screen for jupyter cell
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important; }<style>"))

In [22]:
conf = SparkConf()

# configure Avro Connector
conf.set("spark.jars", "/Users/alor/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /Users/alor/opt/spark/jars/kafka-clients-3.3.1.jar, /Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar")
conf.set("spark.driver.extraClassPath","/Users/alor/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /Users/alor/opt/spark/jars/kafka-clients-3.3.1.jar, /Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar")
conf.set("spark.executor.extraClassPath","/Users/alor/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar, /Users/alor/opt/spark/jars/kafka-clients-3.3.1.jar, /Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar")

# configure Delta Connector
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
conf.set("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")

# configure Iceberg Connector
conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
conf.set("spark.sql.catalog.spark_catalog","org.apache.iceberg.spark.SparkSessionCatalog")
conf.set("spark.jars.packages", "org.apache.iceberg:iceberg-core:1.2.0")


spark = SparkSession.builder.appName("FileFormatDemo").config(conf=conf).getOrCreate()

sqlContext = SparkSession(spark)
#Dont Show warning only error
spark.sparkContext.setLogLevel("ERROR")

In [23]:
spark

In [10]:
# initial daten
account_data1 = [
    (1,"alex","2019-01-01",1000),
    (2,"alex","2019-02-01",1500),
    (3,"alex","2019-03-01",1700),
    (4,"maria","2020-01-01",5000)
    ]

# update mit Änderung und neuem Datensat
account_data2 = [
    (1,"alex","2019-03-01",3300),
    (2,"peter","2021-01-01",100)
    ]

# Update mit neuer Spalte
account_data3 = [
    (1,"otto","2019-10-01",4444,"neue Spalte 1")
]

schema = ["id","account","dt_transaction","balance"]
schema3 = ["id","account","dt_transaction","balance","new"]

df1 = spark.createDataFrame(data=account_data1, schema = schema).withColumn("dt_transaction",col("dt_transaction").cast("date")).repartition(3)
df2 = spark.createDataFrame(data=account_data2, schema = schema).withColumn("dt_transaction",col("dt_transaction").cast("date")).repartition(3)
df3 = spark.createDataFrame(data=account_data3, schema = schema3).withColumn("dt_transaction",col("dt_transaction").cast("date")).repartition(3)

print("++ create new dataframe and show schema and data")
print("################################################")

df1.printSchema()
df1.show(truncate=False)
df3.show(truncate=False)

++ create new dataframe and show schema and data
################################################
root
 |-- id: long (nullable = true)
 |-- account: string (nullable = true)
 |-- dt_transaction: date (nullable = true)
 |-- balance: long (nullable = true)

+---+-------+--------------+-------+
|id |account|dt_transaction|balance|
+---+-------+--------------+-------+
|2  |alex   |2019-02-01    |1500   |
|3  |alex   |2019-03-01    |1700   |
|1  |alex   |2019-01-01    |1000   |
|4  |maria  |2020-01-01    |5000   |
+---+-------+--------------+-------+

+---+-------+--------------+-------+-------------+
|id |account|dt_transaction|balance|new          |
+---+-------+--------------+-------+-------------+
|1  |otto   |2019-10-01    |4444   |neue Spalte 1|
+---+-------+--------------+-------+-------------+



In [11]:
print("Number of Partitions:", df1.rdd.getNumPartitions())

write_avro=(df1
           .write
           .format("avro")
           .mode("overwrite") # append
           .save("output/avro")
          )

Number of Partitions: 3


In [12]:
!ls output/avro/

_SUCCESS
part-00000-3da3e1e6-e463-4236-b8a9-c7b54cd2bd3b-c000.avro
part-00001-3da3e1e6-e463-4236-b8a9-c7b54cd2bd3b-c000.avro
part-00002-3da3e1e6-e463-4236-b8a9-c7b54cd2bd3b-c000.avro


In [13]:
write_delta=(df1
           .write
           .format("delta")
           .option("mergeSchema", "true")
           .mode("overwrite") # append
           .save("output/delta")
          )

In [14]:
!ls output/delta/

[34m_delta_log[m[m
part-00000-2017faee-4e09-4c52-b5eb-7a268090cf48-c000.snappy.parquet
part-00001-1c10ce50-4c61-481f-a468-29e9d6c1edc4-c000.snappy.parquet
part-00002-5693aaf1-8d73-47d4-9c2b-3430cd7336a0-c000.snappy.parquet


In [17]:
deltaTable = DeltaTable.forPath(spark, "output/delta")

fullHistoryDF = deltaTable.history()  

In [18]:
fullHistoryDF.select("version","readVersion","timestamp","userId","operation","operationParameters","operationMetrics","userMetadata").show(truncate=True)

+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+
|version|readVersion|          timestamp|userId|operation| operationParameters|    operationMetrics|userMetadata|
+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+
|      0|       null|2023-04-12 09:35:45|  null|    WRITE|{mode -> Overwrit...|{numFiles -> 3, n...|        null|
+-------+-----------+-------------------+------+---------+--------------------+--------------------+------------+



In [27]:
# write via hive to iceberg
write=(df1
      .write
      .format("iceberg")
      .mode("overwrite")
      .saveTo("output/iceberg")
      )

AttributeError: 'DataFrameWriter' object has no attribute 'saveTo'

In [20]:
spark.stop()