In [1]:
import pandas as pd

In [2]:
from pyspark.sql import SparkSession, functions as F

In [3]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Delta Lake Basics")
    # Delta Lake package to resolve at start-up (see the Ivy log printed below).
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    # Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-85d77be7-3b96-4ae5-8a5b-c6e49749d947;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 519ms :: artifacts dl 26ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |  

In [4]:
# The delta libraries must be imported after the SparkSession has been created.
# Import only the name this notebook uses rather than a wildcard, so the
# origin of DeltaTable stays obvious and the namespace is not polluted.
from delta.tables import DeltaTable

## Create a Spark DataFrame

In [5]:
# Initial data set: five (Id, Name, Branch) rows.
customers = spark.createDataFrame(
    [
        (1416, "Tuncay Avcı", "İnsan Kaynakları"),
        (1417, "İsmail İçtüzer", "Üretim Planlama"),
        (1506, "Mustafa Aslan", "Halkla İlişkiler"),
        (1527, "Buket Durmuş", "Pazarlama"),
        (1525, "Saadet Yılmaz", "Teknik Koordinasyon"),
    ],
    schema=["Id", "Name", "Branch"],
)

In [6]:
# Print the rows of the DataFrame that was just created.
customers.show()

                                                                                

+----+--------------+-------------------+
|  Id|          Name|             Branch|
+----+--------------+-------------------+
|1416|   Tuncay Avcı|   İnsan Kaynakları|
|1417|İsmail İçtüzer|    Üretim Planlama|
|1506| Mustafa Aslan|   Halkla İlişkiler|
|1527|  Buket Durmuş|          Pazarlama|
|1525| Saadet Yılmaz|Teknik Koordinasyon|
+----+--------------+-------------------+



## Write the DataFrame to Delta Lake

In [7]:
# Remove the table directory (if it exists) before the table is written below.
! rm -rf /opt/examples/datasets/delta/ik_delta

In [8]:
# Location of the Delta table as a local-filesystem (file://) URI; used by the write and read cells.
deltaPath = "file:///opt/examples/datasets/delta/ik_delta"

In [9]:
# Save the DataFrame in Delta format at deltaPath, overwriting existing data.
(
    customers.write
    .format("delta")
    .mode("overwrite")
    .save(deltaPath)
)

                                                                                

In [10]:
# List the table directory: the parquet data files plus the _delta_log folder.
! ls -l /opt/examples/datasets/delta/ik_delta

total 8
drwxr-xr-x. 3 root root   93 Nov  5 04:23 _delta_log
-rw-r--r--. 1 root root 1099 Nov  5 04:23 part-00000-46a67d72-745a-423b-b97d-0694c16cd15f-c000.snappy.parquet
-rw-r--r--. 1 root root 1139 Nov  5 04:23 part-00001-41180f96-00c6-40e1-b1dc-8b6bf1a80b8e-c000.snappy.parquet


In [12]:
# NOTE(review): same listing as the previous cell; nothing is written in between, so this cell looks redundant.
! ls -l /opt/examples/datasets/delta/ik_delta

total 8
drwxr-xr-x. 3 root root   93 Nov  5 04:23 _delta_log
-rw-r--r--. 1 root root 1099 Nov  5 04:23 part-00000-46a67d72-745a-423b-b97d-0694c16cd15f-c000.snappy.parquet
-rw-r--r--. 1 root root 1139 Nov  5 04:23 part-00001-41180f96-00c6-40e1-b1dc-8b6bf1a80b8e-c000.snappy.parquet


In [13]:
# Print the first commit file (version 0) of the table's transaction log.
! cat  /opt/examples/datasets/delta/ik_delta/_delta_log/00000000000000000000.json

{"commitInfo":{"timestamp":1730780598222,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"2","numOutputRows":"5","numOutputBytes":"2238"},"engineInfo":"Apache-Spark/3.5.3 Delta-Lake/3.2.0","txnId":"08db501c-aa50-472a-854d-9bd4515fcdc9"}}
{"metaData":{"id":"d0dea724-b437-457a-9296-c14729413873","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"Id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Branch\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1730780590331}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"add":{"path":"part-00000-46a67d72-745a-423b-b97d-0694c16cd15f-c000.snappy.parquet","partitionValues":{},"size":1099,"modificationTime":1730

## Read from Delta Lake as a DeltaTable

In [14]:
# Open the table stored at deltaPath as a DeltaTable; this handle is what the
# merge, update and delete cells below operate on.
customers_delta = DeltaTable.forPath(spark, deltaPath)

In [15]:
# Show that the handle is a DeltaTable, not a DataFrame.
type(customers_delta)

delta.tables.DeltaTable

In [16]:
# Convert the DeltaTable to a DataFrame to display it. No ordering is requested,
# so the rows may come back in a different order than they were created in.
customers_delta.toDF().show()

24/11/05 04:23:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----+--------------+-------------------+
|  Id|          Name|             Branch|
+----+--------------+-------------------+
|1506| Mustafa Aslan|   Halkla İlişkiler|
|1527|  Buket Durmuş|          Pazarlama|
|1525| Saadet Yılmaz|Teknik Koordinasyon|
|1416|   Tuncay Avcı|   İnsan Kaynakları|
|1417|İsmail İçtüzer|    Üretim Planlama|
+----+--------------+-------------------+



In [17]:
# List the transaction log directory; one JSON commit file exists after the initial write.
! ls -l /opt/examples/datasets/delta/ik_delta/_delta_log

total 4
-rw-r--r--. 1 root root 1717 Nov  5 04:23 00000000000000000000.json
drwxr-xr-x. 2 root root    6 Nov  5 04:23 _commits


## New persons

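The rows currently in the table are listed below. Rows marked `u` share an `Id` with a row in the new batch and will therefore be updated by the upsert; the batch's remaining rows are inserted as new.

    [(1416, "Tuncay Avcı", "İnsan Kaynakları"),        # u
     (1417, "İsmail İçtüzer", "Üretim Planlama"),      # u
     (1506, "Mustafa Aslan", "Halkla İlişkiler"),
     (1527, "Buket Durmuş", "Pazarlama"),
     (1525, "Saadet Yılmaz", "Teknik Koordinasyon")],  # u
    ["Id","Name","Branch"]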

In [19]:
# Incoming batch: three rows reuse an existing Id (1416, 1417, 1525) and two
# rows carry Ids that are not in the table yet (1508, 1522).
customers_new = spark.createDataFrame(
    [
        (1416, "Tuncay Avcı", "Genel Koordinasyon"),
        (1417, "İsmail İçtüzer", "Genel Koordinasyon"),
        (1508, "Mustafa Bankur", "Üretim"),
        (1522, "Meliha Kaya", "Pazarlama"),
        (1525, "Saadet Yılmaz Kaya", "Teknik Koordinasyon"),
    ],
    schema=["Id", "Name", "Branch"],
)

In [20]:
# Print the incoming batch before it is merged.
customers_new.show()

                                                                                

+----+------------------+-------------------+
|  Id|              Name|             Branch|
+----+------------------+-------------------+
|1416|       Tuncay Avcı| Genel Koordinasyon|
|1417|    İsmail İçtüzer| Genel Koordinasyon|
|1508|    Mustafa Bankur|             Üretim|
|1522|       Meliha Kaya|          Pazarlama|
|1525|Saadet Yılmaz Kaya|Teknik Koordinasyon|
+----+------------------+-------------------+



## Upsert new customers into the Delta table

In [21]:
# Upsert keyed on Id: a source row whose Id already exists in the table
# overwrites that row; a source row with an unseen Id is inserted.
(
    customers_delta.alias("cust")
    .merge(customers_new.alias("cust_new"), "cust.Id = cust_new.Id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

                                                                                

In [22]:
# Show the table after the upsert, sorted by Id so updated and inserted rows are easy to spot.
customers_delta.toDF().orderBy("Id").show()

                                                                                

+----+------------------+-------------------+
|  Id|              Name|             Branch|
+----+------------------+-------------------+
|1416|       Tuncay Avcı| Genel Koordinasyon|
|1417|    İsmail İçtüzer| Genel Koordinasyon|
|1506|     Mustafa Aslan|   Halkla İlişkiler|
|1508|    Mustafa Bankur|             Üretim|
|1522|       Meliha Kaya|          Pazarlama|
|1525|Saadet Yılmaz Kaya|Teknik Koordinasyon|
|1527|      Buket Durmuş|          Pazarlama|
+----+------------------+-------------------+



In [23]:
# The table lives on the local filesystem (deltaPath is a file:// URI) and the
# `hdfs` CLI is not available in this environment, so list the log with plain `ls`.
! ls -l /opt/examples/datasets/delta/ik_delta/_delta_log


In [24]:
# Print the second commit file (version 1), written by the merge above.
! cat /opt/examples/datasets/delta/ik_delta/_delta_log/00000000000000000001.json

{"commitInfo":{"timestamp":1730780686663,"operation":"MERGE","operationParameters":{"predicate":"[\"(Id#313L = Id#1228L)\"]","matchedPredicates":"[{\"actionType\":\"update\"}]","notMatchedPredicates":"[{\"actionType\":\"insert\"}]","notMatchedBySourcePredicates":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numTargetRowsCopied":"2","numTargetRowsDeleted":"0","numTargetFilesAdded":"1","numTargetBytesAdded":"1288","numTargetBytesRemoved":"2238","numTargetDeletionVectorsAdded":"0","numTargetRowsMatchedUpdated":"3","executionTimeMs":"5820","numTargetRowsInserted":"2","numTargetRowsMatchedDeleted":"0","numTargetDeletionVectorsUpdated":"0","scanTimeMs":"3636","numTargetRowsUpdated":"3","numOutputRows":"7","numTargetDeletionVectorsRemoved":"0","numTargetRowsNotMatchedBySourceUpdated":"0","numTargetChangeFilesAdded":"0","numSourceRows":"5","numTargetFilesRemoved":"2","numTargetRowsNotMatchedBySourceDeleted":"0","rewriteTimeMs":"1357"},"engineI

# Upsert with Multiple Conditions

In [25]:
# Second batch: one unseen Id (1520) and three Ids already in the table;
# only 1522 has a Branch other than 'Genel Koordinasyon'.
customers_new2 = spark.createDataFrame(
    [
        (1520, "Mustafa Gökçe", "Genel Koordinasyon"),
        (1522, "Meliha Kaya", "Satış"),
        (1525, "Saadet Yılmaz", "Genel Koordinasyon"),
        (1416, "Tuncay Döner", "Genel Koordinasyon"),
    ],
    schema=["Id", "Name", "Branch"],
)

In [26]:
# Upsert with a compound match condition: a source row counts as matched only
# when its Id exists in the table AND its Branch is 'Genel Koordinasyon'.
# NOTE(review): a source row that fails either part falls through to the insert
# clause, so Id 1522 (Branch 'Satış') is added as a second row next to the
# existing one -- see the duplicate Id in the next cell's output. Confirm that
# this is the intended demonstration; otherwise move the Branch filter into the
# matched/not-matched clauses.
(
    customers_delta.alias("cust")
    .merge(
        customers_new2.alias("cust_new2"),
        "cust.Id = cust_new2.Id AND cust_new2.Branch = 'Genel Koordinasyon'",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

                                                                                

In [27]:
# Show the table after the second merge. Sort by Id (as the earlier post-merge
# cell does) because row order of an unsorted DataFrame is not guaranteed.
customers_delta.toDF().orderBy("Id").show()



+----+--------------+------------------+
|  Id|          Name|            Branch|
+----+--------------+------------------+
|1416|  Tuncay Döner|Genel Koordinasyon|
|1417|İsmail İçtüzer|Genel Koordinasyon|
|1506| Mustafa Aslan|  Halkla İlişkiler|
|1508|Mustafa Bankur|            Üretim|
|1520| Mustafa Gökçe|Genel Koordinasyon|
|1522|   Meliha Kaya|             Satış|
|1522|   Meliha Kaya|         Pazarlama|
|1525| Saadet Yılmaz|Genel Koordinasyon|
|1527|  Buket Durmuş|         Pazarlama|
+----+--------------+------------------+



                                                                                

In [29]:
# List the transaction log again; each operation so far has added one commit file.
! ls -l /opt/examples/datasets/delta/ik_delta/_delta_log

total 12
-rw-r--r--. 1 root root 1717 Nov  5 04:23 00000000000000000000.json
-rw-r--r--. 1 root root 1977 Nov  5 04:24 00000000000000000001.json
-rw-r--r--. 1 root root 1786 Nov  5 04:25 00000000000000000002.json
drwxr-xr-x. 2 root root    6 Nov  5 04:23 _commits


# Update

In [30]:
# Rewrite Name and Branch of the row whose Id is 1527. The replacement values
# are given as SQL string literals (note the inner single quotes).
customers_delta.update(
    set={
        "Name": F.expr("'Buket Durmuş Çetinkaya'"),
        "Branch": F.expr("'Teknoloji'"),
    },
    condition=F.expr("Id == 1527"),
)

                                                                                

In [31]:
# Show the table after the update without truncating long values. Sort by Id
# because row order of an unsorted DataFrame is not guaranteed.
customers_delta.toDF().orderBy("Id").show(truncate=False)

                                                                                

+----+----------------------+------------------+
|Id  |Name                  |Branch            |
+----+----------------------+------------------+
|1416|Tuncay Döner          |Genel Koordinasyon|
|1417|İsmail İçtüzer        |Genel Koordinasyon|
|1506|Mustafa Aslan         |Halkla İlişkiler  |
|1508|Mustafa Bankur        |Üretim            |
|1520|Mustafa Gökçe         |Genel Koordinasyon|
|1522|Meliha Kaya           |Satış             |
|1522|Meliha Kaya           |Pazarlama         |
|1525|Saadet Yılmaz         |Genel Koordinasyon|
|1527|Buket Durmuş Çetinkaya|Teknoloji         |
+----+----------------------+------------------+



In [32]:
# List the transaction log; the update above added another commit file.
! ls -l /opt/examples/datasets/delta/ik_delta/_delta_log

total 16
-rw-r--r--. 1 root root 1717 Nov  5 04:23 00000000000000000000.json
-rw-r--r--. 1 root root 1977 Nov  5 04:24 00000000000000000001.json
-rw-r--r--. 1 root root 1786 Nov  5 04:25 00000000000000000002.json
-rw-r--r--. 1 root root 1267 Nov  5 04:25 00000000000000000003.json
drwxr-xr-x. 2 root root    6 Nov  5 04:23 _commits


# Delete

In [33]:
# Remove every row whose Id equals 1506 from the table.
customers_delta.delete(condition=(F.col("Id") == 1506))

                                                                                

In [34]:
# Show the table after the delete without truncating long values. Sort by Id
# because row order of an unsorted DataFrame is not guaranteed.
customers_delta.toDF().orderBy("Id").show(truncate=False)



+----+----------------------+------------------+
|Id  |Name                  |Branch            |
+----+----------------------+------------------+
|1416|Tuncay Döner          |Genel Koordinasyon|
|1417|İsmail İçtüzer        |Genel Koordinasyon|
|1508|Mustafa Bankur        |Üretim            |
|1520|Mustafa Gökçe         |Genel Koordinasyon|
|1522|Meliha Kaya           |Satış             |
|1522|Meliha Kaya           |Pazarlama         |
|1525|Saadet Yılmaz         |Genel Koordinasyon|
|1527|Buket Durmuş Çetinkaya|Teknoloji         |
+----+----------------------+------------------+



                                                                                

In [35]:
# List the transaction log one last time; the delete above added another commit file.
! ls -l /opt/examples/datasets/delta/ik_delta/_delta_log

total 20
-rw-r--r--. 1 root root 1717 Nov  5 04:23 00000000000000000000.json
-rw-r--r--. 1 root root 1977 Nov  5 04:24 00000000000000000001.json
-rw-r--r--. 1 root root 1786 Nov  5 04:25 00000000000000000002.json
-rw-r--r--. 1 root root 1267 Nov  5 04:25 00000000000000000003.json
-rw-r--r--. 1 root root 1267 Nov  5 04:26 00000000000000000004.json
drwxr-xr-x. 2 root root    6 Nov  5 04:23 _commits


In [None]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()