# Chapter 9. Building Reliable Data Lakes with Apache Spark

## Building Lakehouses with Apache Spark and Delta Lake

### Configuring Apache Spark with Delta Lake

In [None]:
from uuid import uuid1
from time import sleep
import random

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Build (or reuse) the SparkSession used throughout this notebook,
# with the Delta Lake package and SQL integration switched on.
spark = (
    SparkSession.builder
    .appName("DeltaLakes")
    .master("local[4]")
    # Maven coordinates of the Delta Lake jars, as described in
    # https://docs.delta.io/latest/quick-start.html#maven
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    # Delta Lake catalog and SQL extension
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .getOrCreate()
)
spark

### Loading Data into a Delta Lake Table

NOTE: Make sure that there is no Delta table in the directory specified as `deltaPath`. If there is one, delete the directory first by executing `rm -rf ../data_output/loans_delta` from the notebook's working directory.

In [None]:
# Where the source parquet data lives and where the Delta Lake table goes
sourcePath = "../data/loans/loan-risks.snappy.parquet"
deltaPath = "../data_output/loans_delta"

spark.sql("set spark.sql.shuffle.partitions = 1")

# Write the loans data out again, this time in Delta format
spark.read.parquet(sourcePath).write.save(deltaPath, format="delta")
# Register the Delta table as a temporary view named loans_delta
spark.read.load(deltaPath, format="delta").createOrReplaceTempView("loans_delta")

In [None]:
# Read and explore the data as easily as any other table:
# first the total row count, then a five-row sample of the view.
spark.sql("select count(*) from loans_delta").show()
spark.sql("select * from loans_delta limit 5").show()

### Loading Data Streams into a Delta Lake Table

In [None]:
def _pick_random_state():
    """Return one of a fixed set of state codes, chosen at random."""
    return random.choice(("CA", "TX", "NY", "WA"))


# The function yields a different value on every call, so the UDF is marked
# non-deterministic. Spark treats UDFs as deterministic by default and may
# then eliminate or duplicate invocations while optimizing the query.
# Calling `random_state()` still returns a string Column, as before.
random_state = F.udf(_pick_random_state, returnType="string").asNondeterministic()

# Streaming DataFrame of generated loans: the "rate" source supplies the
# `value` column, from which the four loan columns are derived.
newLoanStreamDF = (
    spark.readStream
    .load(format="rate", rowsPerSecond=5)
    .withColumn("loan_id", 10000 + F.col("value"))
    .withColumn("funded_amnt", (F.rand() * 5000 + 5000).cast("integer"))
    # paid_amnt is derived from the funded_amnt column created just above
    .withColumn("paid_amnt", F.col("funded_amnt") - (F.rand() * 2000))
    .withColumn("addr_state", random_state())
    .select("loan_id", "funded_amnt", "paid_amnt", "addr_state")
)

In [None]:
trigger_processing_time = 2
# A fresh checkpoint directory per run, made unique with uuid1()
checkpointDir = f"/tmp/spark-streaming-checkpoints-{uuid1()}"

# Start writing the generated loan stream into the Delta table at deltaPath
streamingQuery = (
    newLoanStreamDF.writeStream
    .trigger(processingTime=f"{trigger_processing_time} seconds")
    .start(deltaPath, format="delta", checkpointLocation=checkpointDir)
)

In [None]:
# Run this cell several times with a pause in between to see that the row count
# changes, because the streaming query is writing to the Delta table.
# The sleep waits one trigger interval before counting.
sleep(trigger_processing_time)
spark.table("loans_delta").count()

In [None]:
# Stop the streaming write, then print the query's status and the session's
# list of active streams to check what is still running.
streamingQuery.stop()
print("Status of streamingQuery:", streamingQuery.status)
print("Active streams:", spark.streams.active)

### Enforcing Schema on Write to Prevent Data Corruption

Try to write some data whose schema is inconsistent with that of the Delta table. The new data has an additional column, `closed`.

In [None]:
# Build two loan records by hand that carry an extra `closed` column
# in addition to the four loan columns.
cols = ['loan_id', 'funded_amnt', 'paid_amnt', 'addr_state', 'closed']
items = [
    (1111111, 1000, 1000.0, 'TX', True), 
    (2222222, 2000, 0.0, 'CA', False)
]

# NOTE(review): funded_amnt is cast to int presumably to match the column
# type of the existing table — confirm against the source data schema.
loanUpdates = (spark.createDataFrame(items, cols)
               .withColumn("funded_amnt", F.col("funded_amnt").cast("int")))
loanUpdates.show()

In [None]:
# Append the two records, extra `closed` column included, to the Delta table.
# This write is expected to fail with an `AnalysisException` caused by the
# schema mismatch.
loanUpdates.write.save(deltaPath, format="delta", mode="append")

### Evolving Schemas to Accommodate Changing Data

In [None]:
# With the `mergeSchema` option the append is accepted: the records are added
# and the table schema is updated to include the extra column.
loanUpdates.write.save(deltaPath, format="delta", mode="append", mergeSchema=True)

In [None]:
# The new `closed` column only becomes visible after the Delta table is
# loaded again and the temporary view is re-registered.
loans_delta = spark.read.load(deltaPath, format="delta")
loans_delta.createOrReplaceTempView("loans_delta")
spark.sql("select * from loans_delta").show()

### Transforming Existing Data

#### Updating data

In [None]:
from delta.tables import DeltaTable

In [None]:
# Show up to two rows whose addr_state is 'OR'.
loans_delta.filter("addr_state = 'OR'").show(2)

In [None]:
# Open the table through the DeltaTable API and rewrite addr_state
# from 'OR' to 'WA' on every matching row.
deltaTable = DeltaTable.forPath(spark, deltaPath)
deltaTable.update(condition="addr_state = 'OR'", set={"addr_state": "'WA'"})

In [None]:
# Run the same addr_state = 'OR' filter again to see what still matches.
loans_delta.filter("addr_state = 'OR'").show(2)

#### Deleting data

In [None]:
# Show up to two rows where the funded amount does not exceed the paid amount.
loans_delta.filter("funded_amnt <= paid_amnt").show(2)

In [None]:
# Remove every row where the funded amount does not exceed the paid amount.
deltaTable.delete(condition="funded_amnt <= paid_amnt")

In [None]:
# Run the same funded_amnt <= paid_amnt filter again to see what still matches.
loans_delta.filter("funded_amnt <= paid_amnt").show(2)

#### Upserting data using `merge()`

In [None]:
# Loan ids of the hand-built records: the first field of each tuple in
# `items`, converted to strings. Indexing each row directly avoids
# transposing the whole list and also works when `items` is empty.
new_items_id = [str(item[0]) for item in items]
print(new_items_id)

In [None]:
# Show the table rows whose loan_id is one of the ids in new_items_id.
loans_delta.filter(loans_delta["loan_id"].isin(new_items_id)).show()

In [None]:
# Upsert: rows of loanUpdates ("s") that match a table row ("t") on loan_id
# overwrite it; rows without a match are inserted.
(
    deltaTable.alias("t")
    .merge(source=loanUpdates.alias("s"), condition="t.loan_id = s.loan_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

In [None]:
# Run the loan_id lookup for the ids in new_items_id again.
loans_delta.filter(loans_delta["loan_id"].isin(new_items_id)).show()

#### Deduplicating data while inserting using insert-only merge

In [None]:
# Row count of the table, for comparison with the count taken after the
# insert-only merge.
loans_delta.count()

In [None]:
# Insert-only merge: rows of loanUpdates ("s") are inserted only when no table
# row ("t") has the same loan_id; matching rows are left untouched.
(
    deltaTable.alias("t")
    .merge(source=loanUpdates.alias("s"), condition="t.loan_id = s.loan_id")
    .whenNotMatchedInsertAll()
    .execute()
)

In [None]:
# The count is the same as before the merge, because the duplicate records
# were not inserted.
loans_delta.count()

### Auditing Data Changes with Operation History

In [None]:
# Query the table's operation history and display it.
deltaTable.history().show()

In [None]:
# Limit the history to three entries and keep only the key columns.
# `operation` and `operationParameters` are the ones useful for auditing changes.
deltaTable.history(limit=3).select(
    "version", "timestamp", "operation", "operationParameters"
).show(truncate=False)

### Querying Previous Snapshots of a Table with Time Travel

In [None]:
# Get a previous version of the Delta table by a commit timestamp.
# `history(2)` limits the history to two entries; sorted by ascending version,
# the first row is the older of the two, i.e. the second-to-last commit.
# `orderBy` only understands the `ascending` keyword; any other keyword
# (such as `descending`) is silently ignored, so the order is spelled out here.
second_last_commit_timestamp = (
    deltaTable.history(2)
    .orderBy("version", ascending=True)
    .first()
    .timestamp.isoformat()
)
(spark.read
  .format("delta")
  .option("timestampAsOf", second_last_commit_timestamp)
  .load(deltaPath)).show(4)

In [None]:
# Time travel by version number: load version 0 of the Delta table
# and show its first four rows.
spark.read.load(deltaPath, format="delta", versionAsOf="0").show(4)