In [1]:
from pyspark.sql.session import SparkSession
from delta import *

In [2]:
# Spark settings that switch on Delta Lake's SQL extension and catalog.
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("Delta Lake Loans")
for setting_name, setting_value in delta_settings.items():
    builder = builder.config(setting_name, setting_value)

# Let the delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

23/06/10 15:13:44 WARN Utils: Your hostname, wedivv-H110M-S2V resolves to a loopback address: 127.0.1.1; using 192.168.1.44 instead (on interface wlp5s0)
23/06/10 15:13:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/wedivv/spark/spark-3.4.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/wedivv/.ivy2/cache
The jars for the packages stored in: /home/wedivv/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-98e6c6e7-9ac5-4742-86d1-e211b9e75f09;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in local-m2-cache
:: resolution report :: resolve 173ms :: artifacts dl 8ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from local-m2-cache in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |  

In [3]:
# Source loans data: a snappy-compressed Parquet file, read once to seed the Delta table.
sourcePath = "./data/13-loan-risks.snappy.parquet"

# Delta Lake path: directory where the Delta table is written and later read back from.
deltaPath = "./data/tmp/loans_delta"

In [4]:
# Create the Delta table with the same loans data.
#
# mode("overwrite") makes this cell re-runnable: with the default save mode the
# write fails as soon as deltaPath already holds a table from a previous run.
# overwriteSchema resets the table to the source file's schema, so a column
# added by a later mergeSchema append does not leak into a fresh run.
# Note: the Delta transaction log is kept, so on a re-run the version numbers
# continue from the previous run instead of restarting at 0.
(
    spark
    .read
    .format("parquet")
    .load(sourcePath)
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(deltaPath)
)

23/06/10 15:13:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
# Expose the Delta table to Spark SQL as a temporary view named loans_delta.
loans_delta_df = spark.read.format("delta").load(deltaPath)
loans_delta_df.createOrReplaceTempView("loans_delta")

In [6]:
# Count the rows visible through the loans_delta view.
loan_count_df = spark.sql("SELECT count(*) FROM loans_delta")
loan_count_df.show()



+--------+
|count(1)|
+--------+
|   14705|
+--------+



                                                                                

In [7]:
# Preview the loans through the view (show() prints only the first rows).
loans_preview_df = spark.sql("SELECT * FROM loans_delta")
loans_preview_df.show()

                                                                                

+-------+-----------+---------+----------+
|loan_id|funded_amnt|paid_amnt|addr_state|
+-------+-----------+---------+----------+
|      0|       1000|   182.22|        CA|
|      1|       1000|   361.19|        WA|
|      2|       1000|   176.26|        TX|
|      3|       1000|   1000.0|        OK|
|      4|       1000|   249.98|        PA|
|      5|       1000|    408.6|        CA|
|      6|       1000|   1000.0|        MD|
|      7|       1000|   168.81|        OH|
|      8|       1000|   193.64|        TX|
|      9|       1000|   218.83|        CT|
|     10|       1000|   322.37|        NJ|
|     11|       1000|   400.61|        NY|
|     12|       1000|   1000.0|        FL|
|     13|       1000|   165.88|        NJ|
|     14|       1000|    190.6|        TX|
|     15|       1000|   1000.0|        OH|
|     16|       1000|   213.72|        MI|
|     17|       1000|   188.89|        MI|
|     18|       1000|   237.41|        CA|
|     19|       1000|   203.85|        CA|
+-------+--

### Enforcing Schema on Write to Prevent Data Corruption

In [8]:
from pyspark.sql.functions import *

# Column names for the update rows. `closed` is the extra column:
# it does not exist in the loans_delta table.
cols = [
    "loan_id",
    "funded_amnt",
    "paid_amnt",
    "addr_state",
    "closed",
]

# Two sample rows, one value per column in `cols`.
items = [
    (1111111, 1000, 1000.0, "TX", True),
    (2222222, 2000, 0.0, "CA", False),
]

In [9]:
# Build the updates DataFrame from the sample rows, then cast funded_amnt to
# int so it lines up with the table's integer funded_amnt column.
loan_updates_raw = spark.createDataFrame(items, cols)
loanUpdates = loan_updates_raw.withColumn(
    "funded_amnt",
    col("funded_amnt").cast("int"),
)

In [10]:
# Schema enforcement demo: appending loanUpdates (which carries the extra
# `closed` column) without mergeSchema is expected to be rejected by Delta.
# The exception is caught and printed so the notebook still runs top to
# bottom (Restart & Run All) instead of stopping at this cell.
from pyspark.sql.utils import AnalysisException

try:
    (
        loanUpdates
        .write
        .format("delta")
        .mode("append")
        .save(deltaPath)
    )
except AnalysisException as schema_error:
    # Expected message (abridged):
    #   A schema mismatch detected when writing to the Delta table ...
    #   To enable schema migration using DataFrameWriter or DataStreamWriter,
    #   please set: '.option("mergeSchema", "true")'.
    #   For other operations, set the session configuration
    #   spark.databricks.delta.schema.autoMerge.enabled to "true".
    print(f"AnalysisException: {schema_error}")

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: 6afec9b8-64ed-4ee2-af87-5eca91c24434).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- loan_id: long (nullable = true)
-- funded_amnt: integer (nullable = true)
-- paid_amnt: double (nullable = true)
-- addr_state: string (nullable = true)


Data schema:
root
-- loan_id: long (nullable = true)
-- funded_amnt: integer (nullable = true)
-- paid_amnt: double (nullable = true)
-- addr_state: string (nullable = true)
-- closed: boolean (nullable = true)

         

### Evolving Schemas to Accommodate Changing Data

In [11]:
# Same append as before, but with mergeSchema enabled so the write is allowed
# to extend the table schema with the new column.
merge_schema_writer = (
    loanUpdates.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
)
merge_schema_writer.save(deltaPath)

                                                                                

In [12]:
# Read the table back to inspect it after the schema-merging append.
df = (
    spark
    .read
    .format("delta")
    .load(deltaPath)
)
df.show()

+-------+-----------+---------+----------+------+
|loan_id|funded_amnt|paid_amnt|addr_state|closed|
+-------+-----------+---------+----------+------+
|      0|       1000|   182.22|        CA|  null|
|      1|       1000|   361.19|        WA|  null|
|      2|       1000|   176.26|        TX|  null|
|      3|       1000|   1000.0|        OK|  null|
|      4|       1000|   249.98|        PA|  null|
|      5|       1000|    408.6|        CA|  null|
|      6|       1000|   1000.0|        MD|  null|
|      7|       1000|   168.81|        OH|  null|
|      8|       1000|   193.64|        TX|  null|
|      9|       1000|   218.83|        CT|  null|
|     10|       1000|   322.37|        NJ|  null|
|     11|       1000|   400.61|        NY|  null|
|     12|       1000|   1000.0|        FL|  null|
|     13|       1000|   165.88|        NJ|  null|
|     14|       1000|    190.6|        TX|  null|
|     15|       1000|   1000.0|        OH|  null|
|     16|       1000|   213.72|        MI|  null|


### Transforming Existing Data

In [13]:
from delta.tables import *

In [14]:
# Get a DeltaTable handle for the table stored at deltaPath.
deltaTable = DeltaTable.forPath(spark, deltaPath)

# Rewrite addr_state from 'OR' to 'WA'. The new value is a SQL expression
# string, which is why the literal keeps its own single quotes.
deltaTable.update(
    condition="addr_state = 'OR'",
    set={"addr_state": "'WA'"},
)

                                                                                

In [15]:
# Re-acquire the DeltaTable handle so this cell can run on its own.
deltaTable = DeltaTable.forPath(spark, deltaPath)

# Delete every row whose funded amount is at least the paid amount.
# NOTE(review): the later snapshot of the table contains only the merged
# update rows, so this predicate appears to match all rows - confirm that
# "funded_amnt >= paid_amnt" (rather than the reverse) is intended.
deltaTable.delete(condition="funded_amnt >= paid_amnt")

                                                                                

##### Upserting change data to a table using merge()

In [None]:
# Say we have another table of new loan information: some rows are
# new loans and others are updates to existing loans.
# (Assume it has the same schema as the target table.)

In [16]:
# Upsert loanUpdates into the table with DeltaTable.merge(), which is based on
# the MERGE SQL command: rows are matched on loan_id, matched rows are
# updated, and unmatched source rows are inserted.
loan_upsert = deltaTable.alias("t").merge(
    source=loanUpdates.alias("s"),
    condition="t.loan_id = s.loan_id",
)
loan_upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

                                                                                

### Auditing Data Changes with Operation History

In [17]:
# Show the full operation history recorded for the table.
table_history_df = deltaTable.history()
table_history_df.show()

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      4|2023-06-10 15:29:...|  null|    null|    MERGE|{predicate -> ["(...|null|    null|     null|          3|  Serializable|        false|{numTargetRowsCop...|        null|Apache-Spark/3.4....|
|      3|2023-06-10 15:26:...|  null|    null|   DELETE|{predicate -> ["(...|null|    null|     null|          2|  Serializable|        false|{numRemovedFiles ...|        null|Apache-Spark/3.4....|
|      2|2

In [18]:
# Limit the history to 3 entries and keep only the key columns;
# truncate=False prints the full operationParameters text.
history_columns = ["version", "timestamp", "operation", "operationParameters"]
recent_history_df = deltaTable.history(3).select(*history_columns)
recent_history_df.show(truncate=False)

+-------+-----------------------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|version|timestamp              |operation|operationParameters                                                                                                                                                                     |
+-------+-----------------------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|4      |2023-06-10 15:29:14.007|MERGE    |{predicate -> ["(loan_id#2409L = loan_id#811L)"], matchedPredicates -> [{"actionType":"update"}], notMatchedPredicates -> [{"actionType":"insert"}], notMatchedBySourcePredicates -> []}|
|3      |2023-06-10 15:26:12.7  |DELETE   |{predicate -> ["(cast(funded_amnt#2410 as

### Querying Previous Snapshots of a Table with Time Travel

In [21]:
# Read the table without a version option, i.e. its current snapshot.
current_snapshot_df = spark.read.format("delta").load(deltaPath)
current_snapshot_df.show()

+-------+-----------+---------+----------+------+
|loan_id|funded_amnt|paid_amnt|addr_state|closed|
+-------+-----------+---------+----------+------+
|1111111|       1000|   1000.0|        TX|  true|
|2222222|       2000|      0.0|        CA| false|
+-------+-----------+---------+----------+------+



In [20]:
# Time travel: read the table as it was at version 2 of its history.
version_2_reader = spark.read.format("delta").option("versionAsOf", "2")
version_2_df = version_2_reader.load(deltaPath)
version_2_df.show()

                                                                                

+-------+-----------+---------+----------+------+
|loan_id|funded_amnt|paid_amnt|addr_state|closed|
+-------+-----------+---------+----------+------+
|      0|       1000|   182.22|        CA|  null|
|      1|       1000|   361.19|        WA|  null|
|      2|       1000|   176.26|        TX|  null|
|      3|       1000|   1000.0|        OK|  null|
|      4|       1000|   249.98|        PA|  null|
|      5|       1000|    408.6|        CA|  null|
|      6|       1000|   1000.0|        MD|  null|
|      7|       1000|   168.81|        OH|  null|
|      8|       1000|   193.64|        TX|  null|
|      9|       1000|   218.83|        CT|  null|
|     10|       1000|   322.37|        NJ|  null|
|     11|       1000|   400.61|        NY|  null|
|     12|       1000|   1000.0|        FL|  null|
|     13|       1000|   165.88|        NJ|  null|
|     14|       1000|    190.6|        TX|  null|
|     15|       1000|   1000.0|        OH|  null|
|     16|       1000|   213.72|        MI|  null|


                                                                                