https://github.com/delta-io/delta/blob/master/examples/cheat_sheet/delta_lake_cheat_sheet.pdf

# Batch

## Read
* By File path
  * df = spark.read.format("parquet"|"csv"|"json"|etc.).load('path to delta table')
* By Table
  * df = spark.table('delta table name')

## Write
* By File path
  * df.write.format("delta").mode("overwrite"|"append").partitionBy('field').save('path to delta table')
* By Table
  * df.write.format("delta").option('mergeSchema', "true").saveAsTable('delta table name')

# Streaming

## Read
* By File path
  * df = spark.readStream.format("parquet"|"csv"|"json"|etc.).schema(schema).load('path to delta table')
* By Table
  * df = spark.readStream.format("delta").table('delta table name')

## Write

* By File path
  * df.writeStream.format("delta").outputMode("append"|"update"|"complete").option("checkpointLocation", "path to checkpoint").trigger(once=True|processingTime="x minute").start('path to delta table')
* By Table
  * df.writeStream.format("delta").outputMode("append"|"update"|"complete").option("checkpointLocation", "path to checkpoint").trigger(once=True|processingTime="x minute").table('delta table name')

# Utility Functions

In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

def upsertToDelta(microBatchOutputDF, batchId):
    """Upsert one streaming micro-batch into the Delta table bound to ``deltaTable``.

    Written to be handed to ``DataStreamWriter.foreachBatch``. The micro-batch
    (aliased ``s``) is merged into the target table (aliased ``t``) on
    ``s.id = t.id``: matched rows have all columns updated, unmatched rows
    are inserted.

    Args:
        microBatchOutputDF: DataFrame holding the rows of the current micro-batch.
        batchId: Identifier of the micro-batch; part of the callback signature,
            not used here.

    Note:
        Reads the module-level name ``deltaTable``, which must be assigned
        before the streaming query starts.
    """
    # The result of execute() is not needed, so it is not bound to a name.
    (
        deltaTable.alias("t")
        .merge(microBatchOutputDF.alias("s"), "s.id = t.id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

# Streaming Types
* File Based
  * File lands on disk & is streamed from storage
* Event Based
  * More real-time and leverages a streaming service such as Kafka, Kinesis, or Event Hubs

In [0]:
# Start from a clean slate: remove anything left under /tmp/ch-4/ by a previous run
import shutil
# Best-effort removal through shutil; any error is ignored
shutil.rmtree("/tmp/ch-4/", ignore_errors=True)
# Remove the same path through dbutils.fs as well
# (the second argument is presumably the recursive flag - confirm against the dbutils docs)
dbutils.fs.rm("/tmp/ch-4/", True)

## File Based

In [0]:
import random
# Create a table (id, value) of some data
# spark.range(8) supplies the "id" column
data = spark.range(8)
# random.randint is called once here, so the same offset is added to every id
data = data.withColumn("value", data.id + random.randint(0, 5000))
# Save the result in Delta format at the path below
data.write.format("delta").save("/tmp/ch-4/delta-table")

In [0]:
%sql
-- Preview up to five rows of the Delta table stored at /tmp/ch-4/delta-table
SELECT * from delta.`/tmp/ch-4/delta-table` LIMIT 5

id,value
0,1601
2,1603
3,1604
4,1605
1,1602


In [0]:
# Build a streaming DataFrame from the "rate" source; it is used below as test input
streamingDf = spark.readStream.format("rate").load()
# Uncomment to inspect the stream interactively
#display(streamingDf)

In [0]:
# Stream the source's "value" column (renamed to "id") into a second Delta table
stream = (
    streamingDf.selectExpr("value as id")
    .writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/ch-4/checkpoint")
    .start("/tmp/ch-4/delta-table2")
)
# Wait on the query with a timeout of 10, then stop it
stream.awaitTermination(10)
stream.stop()

In [0]:
%sql
-- Preview up to five rows of the Delta table stored at /tmp/ch-4/delta-table2
SELECT * from delta.`/tmp/ch-4/delta-table2` LIMIT 5

id
1
3
4
0
2


In [0]:
# Use the Delta table as a streaming source and echo its rows to the console sink
stream2 = (
    spark.readStream.format("delta").load("/tmp/ch-4/delta-table2")
    .writeStream
    .format("console")
    .start()
)
# Wait on the query with a timeout of 10, then stop it
stream2.awaitTermination(10)
stream2.stop()

### In-Stream Transformations

In [0]:
# In-stream transformation: add an "id" column (value modulo 10) and drop "timestamp"
streamingAggregatesDF = (
    spark.readStream.format("rate").load()
    .withColumn("id", col("value") % 10)
    .drop("timestamp")
)

In [0]:
# Upsert the streaming output into the Delta table, showing the table before and after
deltaTable = DeltaTable.forPath(spark, "/tmp/ch-4/delta-table")

print("Before")
deltaTable.toDF().show()

# upsertToDelta is registered as the foreachBatch callback; it reads the
# deltaTable assigned above
stream3 = (
    streamingAggregatesDF.writeStream
    .format("delta")
    .foreachBatch(upsertToDelta)
    .outputMode("update")
    .start()
)
# Wait on the query with a timeout of 10, then stop it
stream3.awaitTermination(10)
stream3.stop()

print("After")
deltaTable.toDF().show()

### Delta table as both a streaming source & sink

In [0]:
# Locations of the two Delta tables used in this example
from_tbl = "/tmp/ch-4/from_delta"
to_tbl = "/tmp/ch-4/to_delta"
numRows = 10

# Seed from_tbl with range(numRows) and display it
(
    spark.range(numRows)
    .write
    .mode("overwrite")
    .format("delta")
    .save(from_tbl)
)
spark.read.format("delta").load(from_tbl).show()

# Seed to_tbl with range(numRows, numRows * 10)
(
    spark.range(numRows, numRows * 10)
    .write
    .mode("overwrite")
    .format("delta")
    .save(to_tbl)
)

# NOTE: despite the variable names, this stream READS from to_tbl and
# appends INTO from_tbl.
stream4 = (
    spark.readStream.format("delta").load(to_tbl)
    .writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/ch-4/checkpoint/tbl1")
    .outputMode("append")
    .start(from_tbl)
)

In [0]:
# Rewrite to_tbl into 10 partitions while the streaming job (stream4) is running.
# The overwrite sets dataChange=false (presumably so the rewrite is not treated
# as new data by the running stream - confirm against the Delta Lake docs).
(
    spark.read.format("delta").load(to_tbl)
    .repartition(10)
    .write
    .format("delta")
    .mode("overwrite")
    .option("dataChange", "false")
    .save(to_tbl)
)

# Wait on the query with a timeout of 10, then stop it
stream4.awaitTermination(10)
stream4.stop()

# Show the first rows of from_tbl after the streaming write
spark.read.format("delta").load(from_tbl).show(5)

## Event Based

### Kafka
* Replace host/port & topic name

In [0]:
# Example of reading from Kafka.
# The snippet is kept inside a string literal so running this cell does not
# execute it; replace the <host:port> placeholder and the topic name before use.
'''
from pyspark.sql.functions import col 
kafkaServer = "<host:port>"                        # Specify Host & Port & the name of the topic 
topicName = 'iot-topic' 

iotData = (spark.readStream                         # Get the DataStreamReader 
  .format("kafka")                                  # Specify the source format as "kafka" 
  .option("kafka.bootstrap.servers", kafkaServer)   # Configure the Kafka server name and port 
  .option("subscribe", topicName)                   # Subscribe to the Kafka topic 
  .option("startingOffsets", "latest")              # stream to latest when we restart notebook 
  .option("maxOffsetsPerTrigger", 1000)             # Throttle Kafka's processing of the streams 
  .load() 
  .repartition(8) 
  .select(col("value").cast("STRING")) 
) 
'''

### Kinesis
* Replace Kinesis instance details - streamName, region
* The following example assumes incoming data to have following schema
  * id, user_id, device_id, num_steps, miles_walked, calories_burnt, timestamp

In [0]:
# Example of reading from Kinesis to pick up IoT device data reporting on a user's fitness metrics.
# The snippet is kept inside a string literal so running this cell does not
# execute it; replace the stream name, region and checkpoint location before use.
# The chains are parenthesised (no backslash continuations) so the snippet stays
# valid when pasted, even if trailing whitespace is added.
'''
from pyspark.sql.functions import col

kinesisDF = (spark.readStream
  .format("kinesis")
  .option("streamName", "kinesis-stream")
  .option("region", "us-east-2")
  .option("initialPosition", "trim_horizon")
  .load()
)

# Cast the "data" column to a string so its fields can be addressed as data:<field> below
dataDF = kinesisDF.select(col("data").cast('string').alias("data"))
dataDF.createOrReplaceTempView("stream_data")

stream_df = spark.sql("""
    SELECT data:id,
    data:user_id,
    data:device_id,
    cast(data:num_steps as int) as num_steps,
    cast(data:miles_walked as double) as miles_walked,
    cast(data:calories_burnt as double) as calories_burnt,
    cast(data:timestamp as timestamp) as timestamp
    FROM stream_data""")

(stream_df.writeStream.format("delta")
   .trigger(processingTime='30 seconds')
   .option("checkpointLocation", "<checkpoint location in storage>")
   .outputMode("append")
   .table("device_data_streaming"))
'''

In [0]:
# Example of joining a stream with a Delta lookup table, then de-duplicating events.
# The snippet is kept inside a string literal so running this cell does not
# execute it; checkpoint_path and the streaming source must be supplied before use.
'''
# hold the lookup table in a dataframe
devices_df = spark.table("devices_lookup_tbl")

# read the incoming iot data (replace the placeholder with your streaming source)
iot_df = spark.readStream.format("<source format>").load()

# join the 2 dataframes on device identifier
join_df = iot_df.join(devices_df, ['device_id'])

# persist to disk
(join_df.writeStream
  .format('delta')
  .outputMode('append')
  .option('checkpointLocation', checkpoint_path)
  .toTable("devices_iot_tbl"))

# At this point if new devices get registered, the devices_df will get the updates
# Pre-delta, this would be stale data and would require the user to re-read the table each time prior to a join

# drop repeated (event_time, uid) rows, using a 10-minute watermark on event_time
uniqueVisitors = (iot_df
     .withWatermark("event_time", "10 minutes")
     .dropDuplicates(["event_time", "uid"]))
'''