# Configure SparkSession with Delta Lake configuration

In [5]:
%session_id_prefix native-delta-sql-
%glue_version 4.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog",
  "--datalake-formats": "delta"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Setting session ID prefix to native-delta-sql-
Setting Glue version to: 4.0
Current idle_timeout is None minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog', '--datalake-formats': 'delta'}


# Create a DataBase in Glue Data Catalog for experiment

In [1]:
%%sql
CREATE DATABASE IF NOT EXISTS deltalake_db

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: d69eb451-cebd-44c9-bc52-eabe66487201
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session d69eb451-cebd-44c9-bc52-eabe66487201 to get into ready status...
Session d69eb451-cebd-44c9-bc52-eabe66487201 has been created.
++
||
++
++


# Create a CSV Table pointing to popular Iris Dataset

Dataset Download link: https://www.kaggle.com/datasets/saurabh00007/iriscsv

In [2]:
spark.sql(f"""
CREATE TABLE deltalake_db.csv_table (
  id INT,
  SEPAL_LENGTH STRING,
  SEPAL_WIDTH STRING,
  PETAL_LENGTH STRING,
  PETAL_WIDTH STRING,
  CLASS_NAME STRING
)
USING csv
OPTIONS (
  path 's3://deltasparktesting/iris_data/',
  header 'true',
  inferSchema 'true',
  delimiter ','
)
""")


DataFrame[]


# Inspect the CSV Table

In [3]:
%%sql
select * from deltalake_db.csv_table limit 10;

+---+------------+-----------+------------+-----------+-----------+
| id|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH| CLASS_NAME|
+---+------------+-----------+------------+-----------+-----------+
|100|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|101|         4.9|          3|         1.4|        0.2|Iris-setosa|
|102|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|103|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|104|           5|        3.6|         1.4|        0.2|Iris-setosa|
|105|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|106|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|107|           5|        3.4|         1.5|        0.2|Iris-setosa|
|108|         4.4|        2.9|         1.4|      0.285|Iris-setosa|
|109|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
+---+------------+-----------+------------+-----------+-----------+


# Distinct classes available in the CSV Table

In [4]:
%%sql
select distinct CLASS_NAME from deltalake_db.csv_table limit 10;

+----------------+
|      CLASS_NAME|
+----------------+
|  Iris-virginica|
|     Iris-setosa|
| Iris-versicolor|
|another-category|
+----------------+


# Create the Delta Table

In [5]:
# create table in metastore
query = f"""
CREATE TABLE IF NOT EXISTS deltalake_db.delta_table (
  ID INT,
  SEPAL_LENGTH STRING,
  SEPAL_WIDTH STRING,
  PETAL_LENGTH STRING,
  PETAL_WIDTH STRING,
  CLASS_NAME STRING
)
USING delta
LOCATION 's3://deltasparktesting/iris_data_delta/'
"""

spark.sql(query)

DataFrame[]


# Check the created tables

In [6]:
%%sql
use deltalake_db

++
||
++
++


In [7]:
%%sql
SHOW TABLES

+------------+-----------+-----------+
|   namespace|  tableName|isTemporary|
+------------+-----------+-----------+
|deltalake_db|  csv_table|      false|
|deltalake_db|delta_table|      false|
+------------+-----------+-----------+


# Inspect the DeltaLake Table

In [8]:
%%sql # Read table from metastore
SELECT * FROM deltalake_db.delta_table;

+---+------------+-----------+------------+-----------+----------+
| ID|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|CLASS_NAME|
+---+------------+-----------+------------+-----------+----------+
+---+------------+-----------+------------+-----------+----------+


# Ingest the data in DeltaLake

In [9]:
%%sql
insert into deltalake_db.delta_table
select * from deltalake_db.csv_table;

++
||
++
++


# The concept used to transform CSV Files to Delta is explained in below video in-depth

https://youtu.be/mPkXQtE0fkQ?si=JZJc1_5jaVNMu-ru

# Inspect the Delta Lake Table

In [10]:
%%sql # Read table from metastore
SELECT * FROM deltalake_db.delta_table limit 10;

+---+------------+-----------+------------+-----------+--------------+
| ID|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|    CLASS_NAME|
+---+------------+-----------+------------+-----------+--------------+
|  1|         6.3|        3.3|           6|        2.5|Iris-virginica|
|  2|         5.8|        2.7|         5.1|        1.9|Iris-virginica|
|  3|         7.1|          3|         5.9|        2.1|Iris-virginica|
|  4|         6.3|        2.9|         5.6|        1.8|Iris-virginica|
|  5|         6.5|          3|         5.8|        2.2|Iris-virginica|
|  6|         7.6|          3|         6.6|        2.1|Iris-virginica|
|  7|         4.9|        2.5|         4.5|        1.7|Iris-virginica|
|  8|         7.3|        2.9|         6.3|        1.8|Iris-virginica|
|  9|         6.7|        2.5|         5.8|        1.8|Iris-virginica|
| 10|         7.2|        3.6|         6.1|        2.5|Iris-virginica|
+---+------------+-----------+------------+-----------+--------------+


# Distinct classes available in the Delta Table

In [11]:
%%sql
SELECT distinct CLASS_NAME FROM deltalake_db.delta_table;

+----------------+
|      CLASS_NAME|
+----------------+
|     Iris-setosa|
|  Iris-virginica|
| Iris-versicolor|
|another-category|
+----------------+


# Perform Delete operation & inspect the changes in DeltaLake

In [12]:
%%sql # Read table from metastore
Delete FROM deltalake_db.delta_table where CLASS_NAME='Iris-versicolor';

+-----------------+
|num_affected_rows|
+-----------------+
|               51|
+-----------------+


In [13]:
%%sql # Read table from metastore
select * FROM deltalake_db.delta_table where CLASS_NAME='Iris-versicolor';

+---+------------+-----------+------------+-----------+----------+
| ID|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|CLASS_NAME|
+---+------------+-----------+------------+-----------+----------+
+---+------------+-----------+------------+-----------+----------+


In [15]:
%%sql # Read table from metastore
select distinct class_name FROM deltalake_db.delta_table;

+----------------+
|      class_name|
+----------------+
|     Iris-setosa|
|  Iris-virginica|
|another-category|
+----------------+


# Perform Update operation & inspect the changes in DeltaLake

In [16]:
%%sql # Read table from metastore
Update  deltalake_db.delta_table set Class_Name='Testing update' where  
((ID>20 and ID<30) or (ID>104 and ID<110)) and CLASS_NAME <>'another-category';

+-----------------+
|num_affected_rows|
+-----------------+
|               14|
+-----------------+


# Read from Delta Lake table in spark sql

In [17]:
query = f"""SELECT distinct class_name FROM deltalake_db.delta_table""" # Using a version number
spark.sql(query).show(truncate=False)

+----------------+
|class_name      |
+----------------+
|another-category|
|Iris-virginica  |
|Testing update  |
|Iris-setosa     |
+----------------+


# Read from Delta Lake table in using `delta` format

In [18]:
df = spark.read.format("delta").load('s3://deltasparktesting/iris_data_delta/')
df.select('class_name').distinct().show()

+----------------+
|      class_name|
+----------------+
|     Iris-setosa|
|  Testing update|
|  Iris-virginica|
|another-category|
+----------------+


# Currently contributing files in Delta Table

In [4]:
from pyspark.sql.functions import *




In [20]:
df = spark.read.format("delta").load('s3://deltasparktesting/iris_data_delta/')
df.select('CLASS_NAME', input_file_name()).distinct().show(truncate=False)

+----------------+----------------------------------------------------------------------------------------------------------+
|CLASS_NAME      |input_file_name()                                                                                         |
+----------------+----------------------------------------------------------------------------------------------------------+
|Iris-setosa     |s3://deltasparktesting/iris_data_delta/part-00001-63dfe2ff-af0f-43b8-8aaf-d267b1edace4-c000.snappy.parquet|
|Testing update  |s3://deltasparktesting/iris_data_delta/part-00001-63dfe2ff-af0f-43b8-8aaf-d267b1edace4-c000.snappy.parquet|
|another-category|s3://deltasparktesting/iris_data_delta/part-00002-cc1ae321-6d7b-466a-9454-a612f22874aa-c000.snappy.parquet|
|Iris-virginica  |s3://deltasparktesting/iris_data_delta/part-00000-b1db4674-b819-4ee2-afa3-5fe7022375e0-c000.snappy.parquet|
|Testing update  |s3://deltasparktesting/iris_data_delta/part-00000-b1db4674-b819-4ee2-afa3-5fe7022375e0-c000.snappy.p

In [21]:
df.columns #you can perform all generic spark transformation

['ID', 'SEPAL_LENGTH', 'SEPAL_WIDTH', 'PETAL_LENGTH', 'PETAL_WIDTH', 'CLASS_NAME']


# View History 

In [22]:
query = f"""DESCRIBE HISTORY deltalake_db.delta_table """
spark.sql(query).show(truncate=False)

+-------+-------------------+------+--------+------------+--------------------------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp          |userId|userName|operation   |operationParameters                                                                                                                         |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                                  |userMetadata|engineInfo                                |


In [28]:
query = f"""SELECT distinct class_name FROM deltalake_db.delta_table"""
spark.sql(query).show(truncate=False)

+----------------+
|class_name      |
+----------------+
|Iris-setosa     |
|Testing update  |
|Iris-virginica  |
|another-category|
+----------------+


# Perform Time Travel

In [29]:
df_previous = spark.read.format("delta").option("versionAsof", 1).load('s3://deltasparktesting/iris_data_delta/')




In [30]:
df_previous.select('class_name').distinct().show()

+----------------+
|      class_name|
+----------------+
|     Iris-setosa|
|another-category|
| Iris-versicolor|
|  Iris-virginica|
+----------------+


In [32]:
df_previous_2 = spark.read.format("delta").option("versionAsof", 0).load('s3://deltasparktesting/iris_data_delta/')
df_previous_2.select('class_name').distinct().show()

+----------+
|class_name|
+----------+
+----------+


In [None]:
%stop_session

# Another approach to read Delta Table from Catalog

In [1]:
df_products_read = spark.table("deltalake_db.delta_table")
df_products_read.show()

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: 0b90dd84-fb05-47e6-849b-a921fe1a78e9
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session 0b90dd84-fb05-47e6-849b-a921fe1a78e9 to get into ready status...
Session 0b90dd84-fb05-47e6-849b-a921fe1a78e9 has been created.
+---+------------+-----------+------------+-----------+--------------+
| ID|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|    CLASS_NAME|
+---+------------+-----------+------------+-----------+--------------+
|  1|         6.3|        3.3|           6|        2.5|Iris-virginica|
|  2|         5.8|        2.7|         5.1|        1.9|Iris-virginica|
|  3|         7.1|          3|         5.9|        2.1|Iris-virginica|
|  4|    

In [3]:
from pyspark.sql.functions import *




In [4]:
df_products_read.select('CLASS_NAME', input_file_name()).distinct().show(truncate=False)

+----------------+----------------------------------------------------------------------------------------------------------+
|CLASS_NAME      |input_file_name()                                                                                         |
+----------------+----------------------------------------------------------------------------------------------------------+
|Iris-virginica  |s3://deltasparktesting/iris_data_delta/part-00000-b1db4674-b819-4ee2-afa3-5fe7022375e0-c000.snappy.parquet|
|Testing update  |s3://deltasparktesting/iris_data_delta/part-00000-b1db4674-b819-4ee2-afa3-5fe7022375e0-c000.snappy.parquet|
|another-category|s3://deltasparktesting/iris_data_delta/part-00002-cc1ae321-6d7b-466a-9454-a612f22874aa-c000.snappy.parquet|
|Iris-setosa     |s3://deltasparktesting/iris_data_delta/part-00001-63dfe2ff-af0f-43b8-8aaf-d267b1edace4-c000.snappy.parquet|
|Testing update  |s3://deltasparktesting/iris_data_delta/part-00001-63dfe2ff-af0f-43b8-8aaf-d267b1edace4-c000.snappy.p

# Use the Delta table path instead of the table name in the query. This directly reads the data from the path without checking the table statistics.

In [6]:
query = f"""SELECT * FROM delta.`s3://deltasparktesting/iris_data_delta/`"""
df_path=spark.sql(query)
df_path.show()

+---+------------+-----------+------------+-----------+--------------+
| ID|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|    CLASS_NAME|
+---+------------+-----------+------------+-----------+--------------+
|  1|         6.3|        3.3|           6|        2.5|Iris-virginica|
|  2|         5.8|        2.7|         5.1|        1.9|Iris-virginica|
|  3|         7.1|          3|         5.9|        2.1|Iris-virginica|
|  4|         6.3|        2.9|         5.6|        1.8|Iris-virginica|
|  5|         6.5|          3|         5.8|        2.2|Iris-virginica|
|  6|         7.6|          3|         6.6|        2.1|Iris-virginica|
|  7|         4.9|        2.5|         4.5|        1.7|Iris-virginica|
|  8|         7.3|        2.9|         6.3|        1.8|Iris-virginica|
|  9|         6.7|        2.5|         5.8|        1.8|Iris-virginica|
| 10|         7.2|        3.6|         6.1|        2.5|Iris-virginica|
| 11|         6.5|        3.2|         5.1|          2|Iris-virginica|
| 12| 

In [7]:
df2=df_path.select('CLASS_NAME', input_file_name().alias("input_file_name")).distinct()
df2.show(truncate=False)

+----------------+----------------------------------------------------------------------------------------------------------+
|CLASS_NAME      |input_file_name                                                                                           |
+----------------+----------------------------------------------------------------------------------------------------------+
|Iris-setosa     |s3://deltasparktesting/iris_data_delta/part-00001-63dfe2ff-af0f-43b8-8aaf-d267b1edace4-c000.snappy.parquet|
|Testing update  |s3://deltasparktesting/iris_data_delta/part-00001-63dfe2ff-af0f-43b8-8aaf-d267b1edace4-c000.snappy.parquet|
|Iris-virginica  |s3://deltasparktesting/iris_data_delta/part-00000-b1db4674-b819-4ee2-afa3-5fe7022375e0-c000.snappy.parquet|
|Testing update  |s3://deltasparktesting/iris_data_delta/part-00000-b1db4674-b819-4ee2-afa3-5fe7022375e0-c000.snappy.parquet|
|another-category|s3://deltasparktesting/iris_data_delta/part-00002-cc1ae321-6d7b-466a-9454-a612f22874aa-c000.snappy.p

# Writing Data in DeltaLake

In [8]:
# Create table in the metastore using DataFrame's schema and write data to it
df2.write.format("delta").mode("overwrite").option("path",'s3://deltasparktesting/iris_data_delta_2/') \
.saveAsTable("deltalake_db.delta_table_2")




In [9]:
df3=df2.filter(df2["CLASS_NAME"].isin("Iris-setosa","Iris-virginica"))
df3.show(truncate=False)

+--------------+----------------------------------------------------------------------------------------------------------+
|CLASS_NAME    |input_file_name                                                                                           |
+--------------+----------------------------------------------------------------------------------------------------------+
|Iris-setosa   |s3://deltasparktesting/iris_data_delta/part-00001-63dfe2ff-af0f-43b8-8aaf-d267b1edace4-c000.snappy.parquet|
|Iris-virginica|s3://deltasparktesting/iris_data_delta/part-00000-b1db4674-b819-4ee2-afa3-5fe7022375e0-c000.snappy.parquet|
+--------------+----------------------------------------------------------------------------------------------------------+


In [10]:
df3.write.format("delta").mode("overwrite").save('s3://deltasparktesting/iris_data_delta_2/')




# Read from Delta table via DeltaLake library

Learn more about this module here: https://docs.delta.io/latest/quick-start.html

In [3]:
from delta.tables import *




# Query table from metastore

In [2]:
deltaTable = DeltaTable.forName(spark, f"deltalake_db.delta_table")




In [3]:
deltaTable.show()

AttributeError: 'DeltaTable' object has no attribute 'show'


In [4]:
deltaTable.toDF().show()

+---+------------+-----------+------------+-----------+--------------+
| ID|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|    CLASS_NAME|
+---+------------+-----------+------------+-----------+--------------+
|  1|         6.3|        3.3|           6|        2.5|Iris-virginica|
|  2|         5.8|        2.7|         5.1|        1.9|Iris-virginica|
|  3|         7.1|          3|         5.9|        2.1|Iris-virginica|
|  4|         6.3|        2.9|         5.6|        1.8|Iris-virginica|
|  5|         6.5|          3|         5.8|        2.2|Iris-virginica|
|  6|         7.6|          3|         6.6|        2.1|Iris-virginica|
|  7|         4.9|        2.5|         4.5|        1.7|Iris-virginica|
|  8|         7.3|        2.9|         6.3|        1.8|Iris-virginica|
|  9|         6.7|        2.5|         5.8|        1.8|Iris-virginica|
| 10|         7.2|        3.6|         6.1|        2.5|Iris-virginica|
| 11|         6.5|        3.2|         5.1|          2|Iris-virginica|
| 12| 

In [5]:
df=deltaTable.toDF()
df.select("class_name").distinct().show()

+----------------+
|      class_name|
+----------------+
|     Iris-setosa|
|  Testing update|
|  Iris-virginica|
|another-category|
+----------------+


# Query table from s3 path

In [6]:
deltaTable_2 = DeltaTable.forPath(spark, 's3://deltasparktesting/iris_data_delta/') #query table from path
deltaTable_2.toDF().select("class_name").distinct().show()

+----------------+
|      class_name|
+----------------+
|     Iris-setosa|
|  Testing update|
|  Iris-virginica|
|another-category|
+----------------+


# Delete operation with Delta-Spark

In [7]:
# Declare the predicate by using a SQL-formatted string.
deltaTable_2.delete("id = '100'")




# Update operation with Delta-Spark

In [10]:
df5=deltaTable_2.toDF()
df5.filter(df5["id"]=='119').show(truncate=False)

+---+------------+-----------+------------+-----------+-----------+
|ID |SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|CLASS_NAME |
+---+------------+-----------+------------+-----------+-----------+
|119|5.1         |3.8        |1.5         |0.3        |Iris-setosa|
+---+------------+-----------+------------+-----------+-----------+


In [11]:
deltaTable_2.update(
    condition = "id = '119'",
    set = { "PETAL_LENGTH": "'1000'" }
)




In [12]:
df5=deltaTable_2.toDF()
df5.filter(df5["id"]=='119').show(truncate=False)

+---+------------+-----------+------------+-----------+-----------+
|ID |SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|CLASS_NAME |
+---+------------+-----------+------------+-----------+-----------+
|119|5.1         |3.8        |1000        |0.3        |Iris-setosa|
+---+------------+-----------+------------+-----------+-----------+


# Merge operation with Delta-Spark

In [1]:
from pyspark.sql import Row
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)
df_products.show()
df_products.write.format("delta").mode("overwrite").option("path","s3://deltasparktesting/iris_data_delta_3/").saveAsTable(f"deltalake_db.delta_table_3")

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: 61d2521e-3acc-477f-9005-0cf18f05d456
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session 61d2521e-3acc-477f-9005-0cf18f05d456 to get into ready status...
Session 61d2521e-3acc-477f-9005-0cf18f05d456 has been created.
+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  250|Electronics|1.7384802013919556E9|
|     00002|  Thermostat|  400|Electronics|1.7384802013919556E9|
|     00003|  Television|  600|Electronics|1.7384802013919556E9|
|     00004|     Blender|  100|Electronics|1.

In [5]:
ut = time.time()

product_updates = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 400, 'category': 'Electronics', 'updated_at': ut}, # Update
    {'product_id': '00007', 'product_name': 'Chair', 'price': 50, 'category': 'Furniture','updated_at': ut} # Insert
]
df_product_updates = spark.createDataFrame(Row(**x) for x in product_updates)
df_product_updates.show()

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  400|Electronics|1.7384806456987453E9|
|     00007|       Chair|   50|  Furniture|1.7384806456987453E9|
+----------+------------+-----+-----------+--------------------+


In [4]:
deltaTable = DeltaTable.forName(spark, "deltalake_db.delta_table_3")
deltaTable.toDF().show()

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00005| USB charger|   50|Electronics|1.7384802013919556E9|
|     00002|  Thermostat|  400|Electronics|1.7384802013919556E9|
|     00003|  Television|  600|Electronics|1.7384802013919556E9|
|     00004|     Blender|  100|Electronics|1.7384802013919556E9|
|     00001|      Heater|  250|Electronics|1.7384802013919556E9|
+----------+------------+-----+-----------+--------------------+


# Sample SQL Merge statement
MERGE INTO target_table
USING source_table
ON merge_condition
WHEN MATCHED THEN
   UPDATE SET column1 = value1 [, column2 = value2 …]
WHEN NOT MATCHED THEN
   INSERT (column1 [, column2 …])
   VALUES (value1 [, value2 …]);
 

In [6]:
deltaTable.alias("products").merge(
    df_product_updates.alias("updates"),
    "products.product_id = updates.product_id") \
    .whenMatchedUpdate(set = {
        "product_name": "updates.product_name",
        "price": "updates.price",
        "category": "updates.category",
        "updated_at": "updates.updated_at"    } ) \
    .whenNotMatchedInsert(values = {
        "product_id": "updates.product_id",
        "product_name": "updates.product_name",
        "price": "updates.price",
        "category": "updates.category",
        "updated_at": "updates.updated_at"}
) \
.execute()




In [7]:
deltaTable.toDF().show()

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00005| USB charger|   50|Electronics|1.7384802013919556E9|
|     00002|  Thermostat|  400|Electronics|1.7384802013919556E9|
|     00003|  Television|  600|Electronics|1.7384802013919556E9|
|     00001|      Heater|  400|Electronics|1.7384806456987453E9|
|     00007|       Chair|   50|  Furniture|1.7384806456987453E9|
|     00004|     Blender|  100|Electronics|1.7384802013919556E9|
+----------+------------+-----+-----------+--------------------+


# Optimizing Delta Lake tables

In [1]:
from pyspark.sql import Row
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)
df_products.show()
df_products.write.format("delta").mode("overwrite").option("path","s3://deltasparktesting/iris_data_delta_4/").saveAsTable(f"deltalake_db.delta_table_4")

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: 136215b1-bd14-4f18-8eeb-a90dcdddf5e4
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session 136215b1-bd14-4f18-8eeb-a90dcdddf5e4 to get into ready status...
Session 136215b1-bd14-4f18-8eeb-a90dcdddf5e4 has been created.
+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  250|Electronics|1.7389311233048344E9|
|     00002|  Thermostat|  400|Electronics|1.7389311233048344E9|
|     00003|  Television|  600|Electronics|1.7389311233048344E9|
|     00004|     Blender|  100|Electronics|1.

In [2]:
from delta.tables import *
deltaTable = DeltaTable.forPath(spark, "s3://deltasparktesting/iris_data_delta_4/")




In [3]:
deltaTable.optimize().executeCompaction()

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>>]


In [4]:
%%sql
OPTIMIZE delta.`s3://deltasparktesting/iris_data_delta_4/`

+--------------------+--------------------+
|                path|             metrics|
+--------------------+--------------------+
|s3://deltasparkte...|{0, 0, {null, nul...|
+--------------------+--------------------+


If you run OPTIMIZE twice on the same dataset, the second run will have no effect. This is also known as an idempotent operation.

In [5]:
%%sql
OPTIMIZE delta.`s3://deltasparktesting/iris_data_delta_4/`

+--------------------+--------------------+
|                path|             metrics|
+--------------------+--------------------+
|s3://deltasparkte...|{0, 0, {null, nul...|
+--------------------+--------------------+


It merges smaller files into bigger ones to enhance storage efficiency and performance. This means that if you read the data before and after running OPTIMIZE, you will get the same results.

In [6]:
df=spark.read.format('delta').load('s3://deltasparktesting/iris_data_delta_4/')
df.show()

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00005| USB charger|   50|Electronics|1.7389311233048344E9|
|     00002|  Thermostat|  400|Electronics|1.7389311233048344E9|
|     00003|  Television|  600|Electronics|1.7389311233048344E9|
|     00004|     Blender|  100|Electronics|1.7389311233048344E9|
|     00001|      Heater|  250|Electronics|1.7389311233048344E9|
+----------+------------+-----+-----------+--------------------+


OPTIMIZE is a Delta utility function that comes in two variants: Z-Order and bin-packing. The default is bin-packing.

What exactly is bin-packing? At a high level, this is a technique that is used to coalesce many small files into fewer large files across an arbitrary number of bins. A bin is defined as a file of a maximum file size (the default for Spark Delta Lake is 1 GB).

In [7]:
query = f"""DESCRIBE HISTORY deltalake_db.delta_table_4 """
spark.sql(query).show(truncate=False)

+-------+-------------------+------+--------+---------------------------------+------------------------------------------------------------------------------+----+--------+---------+-----------+-----------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp          |userId|userName|operation                        |operationParameters                                                           |job |notebook|clusterId|readVersion|isolationLevel   |isBlindAppend|operationMetrics                                                                                                                                                                                   |userMetadata|engineInfo                                |
+-------+-------------------+------+------

In [8]:
from pyspark.sql.functions import col
(DeltaTable.forName(spark, "deltalake_db.delta_table_4")
    .history(10)
    .where(col("operation") == "OPTIMIZE")
    .select("version", "timestamp", "operation", "operationMetrics.numRemovedFiles", "operationMetrics.numAddedFiles")
    .show(truncate=False))

+-------+-------------------+---------+---------------+-------------+
|version|timestamp          |operation|numRemovedFiles|numAddedFiles|
+-------+-------------------+---------+---------------+-------------+
|1      |2025-02-07 12:31:51|OPTIMIZE |6              |1            |
+-------+-------------------+---------+---------------+-------------+


# Schema Evolution

Delta Lake utilizes the technique from transactional Data Warehouses called schema-on-write. This simply means that, there is a process in place to check the schema of the write against the table prior to a write operation being executed. 

In [7]:
%stop_session

Stopping session: 0c7ad097-0a9b-4890-bd20-9ef0a296d7bc
Stopped session.


In [1]:
from pyspark.sql import Row
from pyspark.sql.functions import *
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)
df_products.show()
df_products.write.format("delta").mode("overwrite").option("path","s3://deltasparktesting/iris_data_delta_5/").saveAsTable(f"deltalake_db.delta_table_5")

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: native-delta-sql--dde9c6d8-d7f9-46ea-ab95-3c97b064f3d9
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session native-delta-sql--dde9c6d8-d7f9-46ea-ab95-3c97b064f3d9 to get into ready status...
Session native-delta-sql--dde9c6d8-d7f9-46ea-ab95-3c97b064f3d9 has been created.
+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  250|Electronics|1.7390275267700424E9|
|     00002|  Thermostat|  400|Electronics|1.7390275267700424E9|
|     00003|  Television|  600|Electronics|1.73902752677

In [2]:
df=spark.read.format('delta').load('s3://deltasparktesting/iris_data_delta_5/')
df.show()

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00005| USB charger|   50|Electronics|1.7390275267700424E9|
|     00002|  Thermostat|  400|Electronics|1.7390275267700424E9|
|     00003|  Television|  600|Electronics|1.7390275267700424E9|
|     00004|     Blender|  100|Electronics|1.7390275267700424E9|
|     00001|      Heater|  250|Electronics|1.7390275267700424E9|
+----------+------------+-----+-----------+--------------------+


In [3]:
df2=df.withColumn('one_dummy_column',lit(5))
df2.show()

+----------+------------+-----+-----------+--------------------+----------------+
|product_id|product_name|price|   category|          updated_at|one_dummy_column|
+----------+------------+-----+-----------+--------------------+----------------+
|     00005| USB charger|   50|Electronics|1.7390275267700424E9|               5|
|     00002|  Thermostat|  400|Electronics|1.7390275267700424E9|               5|
|     00003|  Television|  600|Electronics|1.7390275267700424E9|               5|
|     00004|     Blender|  100|Electronics|1.7390275267700424E9|               5|
|     00001|      Heater|  250|Electronics|1.7390275267700424E9|               5|
+----------+------------+-----+-----------+--------------------+----------------+


In [4]:
df2.write.format("delta").mode("append").option("path","s3://deltasparktesting/iris_data_delta_5/").saveAsTable(f"deltalake_db.delta_table_5")

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: ec4bbf00-e9bc-43b4-b0d0-31865c183802).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- product_id: string (nullable = true)
-- product_name: string (nullable = true)
-- price: long (nullable = true)
-- category: string (nullable = true)
-- updated_at: double (nullable = true)


Data schema:
root
-- product_id: string (nullable = true)
-- product_name: string (nullable = true)
-- price: long (nullable = true)
-- category: string (nullable = true)
-- updated_at: double (nullable = true)
-- one_dummy_column: integer (nullable = true)

         


In [5]:
df2.write.format("delta").mode("append").option('mergeSchema',True).option("path","s3://deltasparktesting/iris_data_delta_5/").saveAsTable(f"deltalake_db.delta_table_5")




# Alternative to automatic Schema Evolution

When we use .option('mergeSchema',True) to modify the behavior of the Delta Lake write,it comes at the price of our not being fully aware of the changes to our table schema.

In [6]:
df=spark.read.format('delta').load('s3://deltasparktesting/iris_data_delta_5/')
df.show()

+----------+------------+-----+-----------+--------------------+----------------+
|product_id|product_name|price|   category|          updated_at|one_dummy_column|
+----------+------------+-----+-----------+--------------------+----------------+
|     00005| USB charger|   50|Electronics|1.7390275267700424E9|               5|
|     00002|  Thermostat|  400|Electronics|1.7390275267700424E9|               5|
|     00003|  Television|  600|Electronics|1.7390275267700424E9|               5|
|     00004|     Blender|  100|Electronics|1.7390275267700424E9|               5|
|     00001|      Heater|  250|Electronics|1.7390275267700424E9|               5|
|     00005| USB charger|   50|Electronics|1.7390275267700424E9|            null|
|     00002|  Thermostat|  400|Electronics|1.7390275267700424E9|            null|
|     00003|  Television|  600|Electronics|1.7390275267700424E9|            null|
|     00004|     Blender|  100|Electronics|1.7390275267700424E9|            null|
|     00001|    

In [7]:
df2=df.withColumn('another_dummy_column',lit(10))
df2.show()

+----------+------------+-----+-----------+--------------------+----------------+--------------------+
|product_id|product_name|price|   category|          updated_at|one_dummy_column|another_dummy_column|
+----------+------------+-----+-----------+--------------------+----------------+--------------------+
|     00005| USB charger|   50|Electronics|1.7390275267700424E9|               5|                  10|
|     00002|  Thermostat|  400|Electronics|1.7390275267700424E9|               5|                  10|
|     00003|  Television|  600|Electronics|1.7390275267700424E9|               5|                  10|
|     00004|     Blender|  100|Electronics|1.7390275267700424E9|               5|                  10|
|     00001|      Heater|  250|Electronics|1.7390275267700424E9|               5|                  10|
|     00005| USB charger|   50|Electronics|1.7390275267700424E9|            null|                  10|
|     00002|  Thermostat|  400|Electronics|1.7390275267700424E9|         

In [8]:
df2.write.format("delta").mode("append").option("path","s3://deltasparktesting/iris_data_delta_5/").saveAsTable(f"deltalake_db.delta_table_5")

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: ec4bbf00-e9bc-43b4-b0d0-31865c183802).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- product_id: string (nullable = true)
-- product_name: string (nullable = true)
-- price: long (nullable = true)
-- category: string (nullable = true)
-- updated_at: double (nullable = true)
-- one_dummy_column: integer (nullable = true)


Data schema:
root
-- product_id: string (nullable = true)
-- product_name: string (nullable = true)
-- price: long (nullable = true)
-- category: string (nullable = true)
-- updated_at: double (nullable = true)
-- one_dummy_column: integer (nullable = true)
-- another_dummy_column: integer (nullable = true)

         


In [9]:
spark.sql('Alter table deltalake_db.delta_table_5 add columns (another_dummy_column Int);')

DataFrame[]


In [10]:
df2.write.format("delta").mode("append").option("path","s3://deltasparktesting/iris_data_delta_5/").saveAsTable(f"deltalake_db.delta_table_5")




# Restoring Your Table

In [1]:
from delta.tables import *
dt = DeltaTable.forPath(spark, "s3://deltasparktesting/iris_data_delta_5/")

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: a11cda89-2578-42f8-b492-a199d04ad8fd
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session a11cda89-2578-42f8-b492-a199d04ad8fd to get into ready status...
Session a11cda89-2578-42f8-b492-a199d04ad8fd has been created.



In [2]:
dt.history().show()

+-------+-------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|userId|userName|           operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+------+--------+--------------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      3|2025-02-08 15:22:23|  null|    null|               WRITE|{mode -> Append, ...|null|    null|     null|          2|  Serializable|        false|{numFiles -> 10, ...|        null|Apache-Spark/3.3....|
|      2|2025-02-08 15:21:54|  null|    null|         ADD COLUMNS|{columns -> [{"co...|null|    null|     null|          1|  Serializable|         true|                

In [4]:
dt.restoreToVersion(0)

DataFrame[table_size_after_restore: bigint, num_of_files_after_restore: bigint, num_removed_files: bigint, num_restored_files: bigint, removed_files_size: bigint, restored_files_size: bigint]


# Parquet to Delta Conversion

In [1]:
from pyspark.sql import Row
from pyspark.sql.functions import *
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)
df_products.show()
df_products.write.format("parquet").mode("overwrite").option("path","s3://deltasparktesting/iris_data_delta_7/").saveAsTable(f"deltalake_db.delta_table_7")

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Session ID: 4de62d3a-542f-4a2f-b348-d802d5d8a2da
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 4de62d3a-542f-4a2f-b348-d802d5d8a2da to get into ready status...
Session 4de62d3a-542f-4a2f-b348-d802d5d8a2da has been created.
+----------+------------+-----+-----------+-------------------+
|product_id|product_name|price|   category|         updated_at|
+----------+------------+-----+-----------+-------------------+
|     00001|      Heater|  250|Electronics|1.739265821850448E9|
|     00002|  Thermostat|

# Since a Delta Lake table composed of Prquet file internally, the transaction log is the biggest difference when converting a Parquet table to a Delta Lake.

In [1]:
%%sql
convert to delta deltalake_db.delta_table_7;

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: 54d65640-6ad3-4b22-b337-78a7a2adf5dc
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session 54d65640-6ad3-4b22-b337-78a7a2adf5dc to get into ready status...
Session 54d65640-6ad3-4b22-b337-78a7a2adf5dc has been created.
++
||
++
++


# Convert from Parquet to Delta using s3 path

In [2]:
from pyspark.sql import Row
from pyspark.sql.functions import *
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)
df_products.show()
df_products.write.format("parquet").mode("overwrite").option("path","s3://deltasparktesting/iris_data_delta_8/").saveAsTable(f"deltalake_db.delta_table_8")

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  250|Electronics|1.7392662504966307E9|
|     00002|  Thermostat|  400|Electronics|1.7392662504966307E9|
|     00003|  Television|  600|Electronics|1.7392662504966307E9|
|     00004|     Blender|  100|Electronics|1.7392662504966307E9|
|     00005| USB charger|   50|Electronics|1.7392662504966307E9|
+----------+------------+-----+-----------+--------------------+


In [3]:
%%sql
CONVERT TO DELTA parquet.`s3://deltasparktesting/iris_data_delta_8/`

++
||
++
++


# Converting partitioned parquet table to Delta

In [4]:
from pyspark.sql import Row
from pyspark.sql.functions import *
import time

df=spark.read.format('csv').option('header',True).option('inferSchema',True).load('s3://deltasparktesting/iris_data/');
df.show()
df.write.format("parquet").mode("overwrite").partitionBy('CLASS_NAME').option("path","s3://deltasparktesting/iris_data_delta_9/").saveAsTable(f"deltalake_db.delta_table_9")

+---+------------+-----------+------------+-----------+---------------+
| Id|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|     CLASS_NAME|
+---+------------+-----------+------------+-----------+---------------+
|  1|         7.0|        3.2|         4.7|        1.4|Iris-versicolor|
|  2|         6.4|        3.2|         4.5|        1.5|Iris-versicolor|
|  3|         6.9|        3.1|         4.9|        1.5|Iris-versicolor|
|  4|         5.5|        2.3|         4.0|        1.3|Iris-versicolor|
|  5|         6.5|        2.8|         4.6|        1.5|Iris-versicolor|
|  6|         5.7|        2.8|         4.5|        1.3|Iris-versicolor|
|  7|         6.3|        3.3|         4.7|        1.6|Iris-versicolor|
|  8|         4.9|        2.4|         3.3|        1.0|Iris-versicolor|
|  9|         6.6|        2.9|         4.6|        1.3|Iris-versicolor|
| 10|         5.2|        2.7|         3.9|        1.4|Iris-versicolor|
| 11|         5.0|        2.0|         3.5|        1.0|Iris-vers

In [5]:
%%sql
convert to delta deltalake_db.delta_table_9;

++
||
++
++


In [6]:
from pyspark.sql import Row
from pyspark.sql.functions import *
import time

df=spark.read.format('csv').option('header',True).option('inferSchema',True).load('s3://deltasparktesting/iris_data/');
df.show()
df.write.format("parquet").mode("overwrite").partitionBy('CLASS_NAME').option("path","s3://deltasparktesting/iris_data_delta_10/").saveAsTable(f"deltalake_db.delta_table_10")

+---+------------+-----------+------------+-----------+---------------+
| Id|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|     CLASS_NAME|
+---+------------+-----------+------------+-----------+---------------+
|  1|         7.0|        3.2|         4.7|        1.4|Iris-versicolor|
|  2|         6.4|        3.2|         4.5|        1.5|Iris-versicolor|
|  3|         6.9|        3.1|         4.9|        1.5|Iris-versicolor|
|  4|         5.5|        2.3|         4.0|        1.3|Iris-versicolor|
|  5|         6.5|        2.8|         4.6|        1.5|Iris-versicolor|
|  6|         5.7|        2.8|         4.5|        1.3|Iris-versicolor|
|  7|         6.3|        3.3|         4.7|        1.6|Iris-versicolor|
|  8|         4.9|        2.4|         3.3|        1.0|Iris-versicolor|
|  9|         6.6|        2.9|         4.6|        1.3|Iris-versicolor|
| 10|         5.2|        2.7|         3.9|        1.4|Iris-versicolor|
| 11|         5.0|        2.0|         3.5|        1.0|Iris-vers

In [8]:
%%sql
CONVERT TO DELTA parquet.`s3://deltasparktesting/iris_data_delta_10/`  PARTITIONED BY (CLASS_NAME String);

++
||
++
++


# Vacuum Command

Delta Lake doesn't physically remove files from storage for operations that logically delete the files (not contributing to the latest version of Delta Lake). You need to use the vacuum command to physically remove files from storage that have been marked for deletion and are older than the retention period.

The main benefit of vacuuming is to save on storage costs. Vacuuming does not make your queries run any faster and can limit your ability to time travel to earlier Delta table versions. You need to weigh the costs/benefits for each of your tables to develop an optimal vacuum strategy.

In [1]:
from pyspark.sql import Row
from pyspark.sql.functions import *
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)
df_products.show()
df_products.write.format("delta").mode("overwrite").option("path","s3://deltasparktesting/iris_data_delta_6/").saveAsTable(f"deltalake_db.delta_table_6")

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: 44b7bfce-3ece-4143-af7f-6a70069eb207
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session 44b7bfce-3ece-4143-af7f-6a70069eb207 to get into ready status...
Session 44b7bfce-3ece-4143-af7f-6a70069eb207 has been created.
+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  250|Electronics|1.7393566207408683E9|
|     00002|  Thermostat|  400|Electronics|1.7393566207408683E9|
|     00003|  Television|  600|Electronics|1.7393566207408683E9|
|     00004|     Blender|  100|Electronics|1.

In [3]:
df=spark.read.format('delta').load('s3://deltasparktesting/iris_data_delta_6/')
df.filter(df['product_id']=='00005').select(input_file_name()).show(truncate=False)

+------------------------------------------------------------------------------------------------------------+
|input_file_name()                                                                                           |
+------------------------------------------------------------------------------------------------------------+
|s3://deltasparktesting/iris_data_delta_6/part-00015-443ddce7-3d14-43d2-bc19-d84d39287d15-c000.snappy.parquet|
+------------------------------------------------------------------------------------------------------------+


In [4]:
%%sql
delete from deltalake_db.delta_table_6 where product_id ='00005'

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+


In [5]:
spark.sql("VACUUM deltalake_db.delta_table_6 DRY RUN").show(truncate=False)

+----+
|path|
+----+
+----+


The vacuum command won't actually delete any files because the tombstoned files aren't older than the retention period, which is 7 days by default

In [7]:
spark.sql("VACUUM deltalake_db.delta_table_6 RETAIN 0 HOURS DRY RUN").show(truncate=False)

IllegalArgumentException: requirement failed: Are you sure you would like to vacuum files with such a low retention period? If you have
writers that are currently writing to this table, there is a risk that you may corrupt the
state of your Delta table.

If you are certain that there are no operations being performed on this table, such as
insert/upsert/delete/optimize, then you may turn off this check by setting:
spark.databricks.delta.retentionDurationCheck.enabled = false

If you are not sure, please use a value not less than "168 hours".
       


In [8]:
spark.conf.get("spark.databricks.delta.retentionDurationCheck.enabled")

'true'


In [9]:
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")




In [11]:
spark.sql("VACUUM deltalake_db.delta_table_6 RETAIN 0 HOURS").show(truncate=False)

+----------------------------------------+
|path                                    |
+----------------------------------------+
|s3://deltasparktesting/iris_data_delta_6|
+----------------------------------------+


In [12]:
spark.sql(f"""DESCRIBE HISTORY deltalake_db.delta_table_6""").show(truncate=False)

+-------+-------------------+------+--------+---------------------------------+----------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp          |userId|userName|operation                        |operationParameters                                                               |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                                |userMetadata|engineInfo                                |
+-------+-------------------+------+--------+---------------------------------

In [13]:
df_previous = spark.read.format("delta").option("versionAsof", 0).load('s3://deltasparktesting/iris_data_delta_6/')
df_previous.show()

Py4JJavaError: An error occurred while calling o129.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 119.0 failed 4 times, most recent failure: Lost task 0.3 in stage 119.0 (TID 50775) (172.35.186.38 executor 2): org.apache.spark.sql.execution.datasources.FileDownloadException: Failed to download file path: s3://deltasparktesting/iris_data_delta_6/part-00015-443ddce7-3d14-43d2-bc19-d84d39287d15-c000.snappy.parquet, range: 0-1589, partition values: [empty row], isDataPresent: false, eTag: null
	at org.apache.spark.sql.execution.datasources.AsyncFileDownloader.next(AsyncFileDownloader.scala:142)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.getNextFile(FileScanRDD.scala:423)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:337)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:227)
	at org.apache.spark.sql.execution.FileSourceSca

# V.V.I. Conclusion : If you run VACUUM on a Delta table, you lose the ability to time travel back to a version older than the specified data retention period.

# Remember to never remove Delta Lake table data files outside the context of the Delta Lake operations, as doing so can corrupt your tables

vacuum command will not run itself. To keep your prod Delta Tables tidy, you can setup a corn job to call vacuum command

delta.deletedFileRetentionDuration = "interval ": controls how long ago a file must have been deleted before being a candidate for VACUUM

In [None]:
%stop_session

In [1]:
# create table in metastore
query = f"""
CREATE TABLE IF NOT EXISTS deltalake_db.delta_table_11 (
  ID INT,
  SEPAL_LENGTH double,
  SEPAL_WIDTH double,
  PETAL_LENGTH double,
  PETAL_WIDTH double,
  CLASS_NAME STRING
)
USING delta
LOCATION 's3://deltasparktesting/delta_table_11/'
TBLPROPERTIES (
    'delta.deletedFileRetentionDuration' = 'interval 0 days'
);
"""

spark.sql(query)

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: native-delta-sql--ce361057-65cf-4290-ac87-695e5ee0ea40
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session native-delta-sql--ce361057-65cf-4290-ac87-695e5ee0ea40 to get into ready status...
Session native-delta-sql--ce361057-65cf-4290-ac87-695e5ee0ea40 has been created.
DataFrame[]


In [3]:
%%sql
insert into deltalake_db.delta_table_11
select * from deltalake_db.csv_table;

++
||
++
++


In [4]:
%%sql # Read table from metastore
Delete FROM deltalake_db.delta_table_11 where CLASS_NAME='Iris-versicolor';

+-----------------+
|num_affected_rows|
+-----------------+
|               51|
+-----------------+


In [5]:
%%sql
VACUUM deltalake_db.delta_table_11 DRY RUN

+--------------------+
|                path|
+--------------------+
|s3://deltasparkte...|
+--------------------+


In [6]:
%%sql
VACUUM deltalake_db.delta_table_11

+--------------------+
|                path|
+--------------------+
|s3://deltasparkte...|
+--------------------+


# Scaling Massive Metadata

Now that we have seen how transaction log records each operation, we can have many very large files with thousands of transaction log entries for a single DeltaLake Table. 

How DeltaLake scale it's metadata handling wihtout needing to read thousands of small files, which would negatively impact Spark's reading performace?

To alleviate this issue, Delta Lake creates a checkpoint file in Parquet format after it creates (by default) the 10th commit (i.e. transaction). 

These checkpoint files save the entire state of the table at a point in time – in native Parquet format that is quick and easy for any engine to read. It offers the reader a “shortcut” to fully reproducing a table’s state to avoid reprocessing what could be thousands of tiny, inefficient JSON files.

https://www.databricks.com/wp-content/uploads/2019/08/image6-1.png

https://www.databricks.com/wp-content/uploads/2019/08/image2-3.png

In [12]:
from pyspark.sql import Row
from pyspark.sql.functions import *
import time

ut = time.time()

product = [
    {'product_id': '00001', 'product_name': 'Heater', 'price': 250, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00002', 'product_name': 'Thermostat', 'price': 400, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00003', 'product_name': 'Television', 'price': 600, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00004', 'product_name': 'Blender', 'price': 100, 'category': 'Electronics', 'updated_at': ut},
    {'product_id': '00005', 'product_name': 'USB charger', 'price': 50, 'category': 'Electronics', 'updated_at': ut}
]

df_products = spark.createDataFrame(Row(**x) for x in product)
df_products.show()
df_products.write.format("delta").mode("overwrite").option("path","s3://deltasparktesting/iris_data_delta_13/").saveAsTable(f"deltalake_db.delta_table_13")

+----------+------------+-----+-----------+--------------------+
|product_id|product_name|price|   category|          updated_at|
+----------+------------+-----+-----------+--------------------+
|     00001|      Heater|  250|Electronics|1.7402074193605444E9|
|     00002|  Thermostat|  400|Electronics|1.7402074193605444E9|
|     00003|  Television|  600|Electronics|1.7402074193605444E9|
|     00004|     Blender|  100|Electronics|1.7402074193605444E9|
|     00005| USB charger|   50|Electronics|1.7402074193605444E9|
+----------+------------+-----+-----------+--------------------+


In [13]:
%%sql
UPDATE deltalake_db.delta_table_13 
SET product_name = 'Cooker' 
WHERE product_id = '00004';

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+


In [14]:
%%sql
UPDATE deltalake_db.delta_table_13 
SET price = price * 1.1;

+-----------------+
|num_affected_rows|
+-----------------+
|                5|
+-----------------+


In [15]:
%%sql
UPDATE deltalake_db.delta_table_13 
SET category = 'Accessories' 
WHERE product_id = '00005';

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+


In [16]:
%%sql
INSERT INTO deltalake_db.delta_table_13 
VALUES ('00006', 'Smartphone', 800, 'Electronics', current_timestamp());

++
||
++
++


In [17]:
%%sql
INSERT INTO deltalake_db.delta_table_13 
VALUES ('00007', 'Laptop', 1200, 'Electronics', current_timestamp());

++
||
++
++


In [18]:
%%sql
DELETE FROM deltalake_db.delta_table_13 
WHERE product_id = '00004';

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+


In [19]:
%%sql
UPDATE deltalake_db.delta_table_13 
SET price = price * 0.95 
WHERE category = 'Electronics';

+-----------------+
|num_affected_rows|
+-----------------+
|                5|
+-----------------+


In [20]:
%%sql
ALTER TABLE deltalake_db.delta_table_13 
ADD COLUMNS (discount DOUBLE);

++
||
++
++


In [21]:
%%sql
UPDATE deltalake_db.delta_table_13 SET discount = 0.10;

+-----------------+
|num_affected_rows|
+-----------------+
|                6|
+-----------------+


In [22]:
%%sql
UPDATE deltalake_db.delta_table_13 
SET price = price * (1 - discount);

+-----------------+
|num_affected_rows|
+-----------------+
|                6|
+-----------------+


In [23]:
checkpoint_df=spark.read.format('parquet').load('s3://deltasparktesting/iris_data_delta_13/_delta_log/00000000000000000010.checkpoint.parquet')




In [24]:
checkpoint_df.show(truncate=False)

+----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+--------+--------+
|txn |add                                                                                                                                                                                                                                                                                                                                           

# Controlling the checkpoint interval

In [25]:
spark.sql("""Create Table deltalake_db.delta_table_14 (
  product_id String,
  product_name STRING,
  price Int,
  category STRING,
  updated_at Int
)  using delta LOCATION 's3://deltasparktesting/iris_data_delta_14/' TBLPROPERTIES ('delta.checkpointInterval'=5)""")

DataFrame[]


In [26]:
%%sql
INSERT INTO deltalake_db.delta_table_14 (product_id, product_name, price, category, updated_at)  VALUES ('00001', 'Heater', 250, 'Electronics', 1700000000);

++
||
++
++


In [27]:
%%sql
INSERT INTO deltalake_db.delta_table_14 (product_id, product_name, price, category, updated_at) 
VALUES ('00002', 'Thermostat', 400, 'Electronics', 1700000000);

++
||
++
++


In [28]:
%%sql

INSERT INTO deltalake_db.delta_table_14 (product_id, product_name, price, category, updated_at) 
VALUES ('00003', 'Television', 600, 'Electronics', 1700000000);

++
||
++
++


In [29]:
%%sql

INSERT INTO deltalake_db.delta_table_14 (product_id, product_name, price, category, updated_at) 
VALUES ('00004', 'Blender', 100, 'Electronics', 1700000000);

++
||
++
++


In [30]:
%%sql

INSERT INTO deltalake_db.delta_table_14 (product_id, product_name, price, category, updated_at) 
VALUES ('00005', 'USB charger', 50, 'Electronics', 1700000000);

++
||
++
++


In [None]:
%%sql
INSERT INTO deltalake_db.delta_table_14 (product_id, product_name, price, category, updated_at) 
VALUES ('00005', 'USB charger', 50, 'Electronics', 1700000000);

# Change Data Feed

In [1]:
%%sql
CREATE TABLE sales_data_demo_yt (
    id INT,
    product STRING,
    quantity INT,
    price DOUBLE
) USING DELTA
TBLPROPERTIES (delta.enableChangeDataFeed = true)
Location 's3://deltasparktesting/incremental_test_yt/';

Trying to create a Glue session for the kernel.
Session Type: glueetl
Idle Timeout: 60
Session ID: 47e2e12b-de33-4a54-ae34-e96cda9829fb
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
Waiting for session 47e2e12b-de33-4a54-ae34-e96cda9829fb to get into ready status...
Session 47e2e12b-de33-4a54-ae34-e96cda9829fb has been created.
++
||
++
++


In [2]:
%%sql
select * from sales_data_demo_yt;

+---+-------+--------+-----+
| id|product|quantity|price|
+---+-------+--------+-----+
+---+-------+--------+-----+


In [3]:
%%sql
INSERT INTO sales_data_demo_yt VALUES 
(1, 'Laptop', 5, 1200.00),
(2, 'Phone', 10, 800.00);

++
||
++
++


In [4]:
df = spark.read.format("delta") \
    .option("readChangeFeed", "true") \
    .option("startingTimestamp", "1970-01-01 00:00:00") \
    .load('s3://deltasparktesting/incremental_test_yt/')
df.show()

+---+-------+--------+------+------------+---------------+-------------------+
| id|product|quantity| price|_change_type|_commit_version|  _commit_timestamp|
+---+-------+--------+------+------------+---------------+-------------------+
|  1| Laptop|       5|1200.0|      insert|              1|2025-02-27 04:32:18|
|  2|  Phone|      10| 800.0|      insert|              1|2025-02-27 04:32:18|
+---+-------+--------+------+------------+---------------+-------------------+


In [5]:
%%sql
select * from sales_data_demo_yt;

+---+-------+--------+------+
| id|product|quantity| price|
+---+-------+--------+------+
|  1| Laptop|       5|1200.0|
|  2|  Phone|      10| 800.0|
+---+-------+--------+------+


In [6]:
spark.sql("select current_timestamp()").show(truncate=False)

+-----------------------+
|current_timestamp()    |
+-----------------------+
|2025-02-27 04:34:52.505|
+-----------------------+


In [7]:
%%sql
UPDATE sales_data_demo_yt SET price = 1100.00 WHERE id = 1;

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+


In [8]:
%%sql
select * from sales_data_demo_yt;

+---+-------+--------+------+
| id|product|quantity| price|
+---+-------+--------+------+
|  1| Laptop|       5|1100.0|
|  2|  Phone|      10| 800.0|
+---+-------+--------+------+


In [9]:
df = spark.read.format("delta") \
    .option("readChangeFeed", "true") \
    .option("startingTimestamp", "2025-02-27 04:34:52.505") \
    .load('s3://deltasparktesting/incremental_test_yt/')
df.show()

+---+-------+--------+------+----------------+---------------+-------------------+
| id|product|quantity| price|    _change_type|_commit_version|  _commit_timestamp|
+---+-------+--------+------+----------------+---------------+-------------------+
|  1| Laptop|       5|1200.0| update_preimage|              2|2025-02-27 04:35:23|
|  1| Laptop|       5|1100.0|update_postimage|              2|2025-02-27 04:35:23|
+---+-------+--------+------+----------------+---------------+-------------------+


In [10]:
spark.sql("select current_timestamp()").show(truncate=False)

+-----------------------+
|current_timestamp()    |
+-----------------------+
|2025-02-27 04:38:29.683|
+-----------------------+


In [11]:
%%sql
DELETE FROM sales_data_demo_yt WHERE id = 2;

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+


In [12]:
%%sql
select * from sales_data_demo_yt;

+---+-------+--------+------+
| id|product|quantity| price|
+---+-------+--------+------+
|  1| Laptop|       5|1100.0|
+---+-------+--------+------+


In [13]:
df = spark.read.format("delta") \
    .option("readChangeFeed", "true") \
    .option("startingTimestamp", "2025-02-27 04:38:29.683") \
    .load('s3://deltasparktesting/incremental_test_yt/')
df.show()

+---+-------+--------+-----+------------+---------------+-------------------+
| id|product|quantity|price|_change_type|_commit_version|  _commit_timestamp|
+---+-------+--------+-----+------------+---------------+-------------------+
|  2|  Phone|      10|800.0|      delete|              3|2025-02-27 04:38:39|
+---+-------+--------+-----+------------+---------------+-------------------+


In [14]:
df=spark.read.format('parquet').load('s3://deltasparktesting/incremental_test_yt/_change_data/');
df.show(20,truncate=False)

+---+-------+--------+------+----------------+
|id |product|quantity|price |_change_type    |
+---+-------+--------+------+----------------+
|1  |Laptop |5       |1200.0|update_preimage |
|1  |Laptop |5       |1100.0|update_postimage|
|2  |Phone  |10      |800.0 |delete          |
+---+-------+--------+------+----------------+
