# 1. Save the path to the directory where delta files will be stored.

In [1]:
# Location of the delta table used throughout this notebook: the write in step 2
# saves to it and the later read / DeltaTable cells load from it.
# NOTE(review): no scheme or storage account is given, so this presumably resolves
# against the Spark pool's default file system — confirm for your workspace.
path = "/delta/table"

StatementMeta(cassharedspark, 30, 1, Finished, Available)



# 2. Read in data. Write to delta format and save to the directory above. 

In [4]:
# load the source CSV: the first row is a header, column types are inferred,
# and the multiline option is switched on
csv_reader = spark.read.format('csv').options(header='true', inferSchema='true', multiline='true')
data = csv_reader.load('abfss://public-data@statsconviddsinternal.dfs.core.windows.net/incoming/data_duplicate.csv')

# store the DataFrame in delta format at `path`, overwriting what is already there
delta_writer = data.write.format("delta").mode("overwrite")
delta_writer.save(path)

StatementMeta(cassharedspark, 30, 4, Finished, Available)



# 3. Optional: Create an SQL Table Using Delta.

This is necessary only if you want to use SQL to query the delta table. If you are only using Python, Scala, or C#, skip this step.

Note: The current version of Delta Lake included in Synapse does not fully support SQL. **Some SELECT statements work**; however, other statements (notably UPDATE and the Time Travel feature) will result in an error. **The supported languages are PySpark, Scala, and .NET (C#).**

In [6]:
# Register the delta files at `path` as the SQL table delta.example.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises an error
# once the table has been registered. If delta.example already exists, the
# statement is a no-op and the existing definition is left untouched.
spark.sql("CREATE TABLE IF NOT EXISTS delta.example USING DELTA LOCATION '{0}'".format(path))

StatementMeta(cassharedspark, 30, 6, Finished, Available)

DataFrame[]

# 4. Now you can run queries on the delta table.

In [5]:
# load the delta table stored at `path` into a DataFrame
delta_reader = spark.read.format("delta")
df = delta_reader.load(path)

# render the DataFrame in the notebook; df.show() is an alternative
display(df)

StatementMeta(cassharedspark, 30, 5, Finished, Available)

SynapseWidget(Synapse.DataFrame, b12d4b30-4cfe-4da7-8a30-4048e52ba109)

In [7]:
%%sql
-- preview up to 100 rows of the table registered in step 3
SELECT *
FROM delta.example
LIMIT 100

StatementMeta(cassharedspark, 30, 7, Finished, Available)

<Spark SQL result set with 100 rows and 16 fields>

In [9]:
# conditional delete and update
# Import only the class this cell needs. Wildcard imports hide where names come
# from, and `from pyspark.sql.functions import *` in particular shadows Python
# builtins such as sum, min and max for the rest of the notebook.
from delta.tables import DeltaTable

# handle on the delta table stored at `path`; delete/update modify it in place
delta_table = DeltaTable.forPath(spark, path)

# delete all even numbered rows
delta_table.delete("id % 2 == 0")

# change name of people with id < 10 to opal
# (the value is an SQL expression string, hence the inner quotes around 'opal')
delta_table.update("id < 10", { "given_name": "'opal'" } )

StatementMeta(cassharedspark, 30, 9, Finished, Available)



In [11]:
%%sql
-- look for rows with an even id
SELECT *
FROM delta.example
WHERE id % 2 = 0

StatementMeta(cassharedspark, 30, 11, Finished, Available)

<Spark SQL result set with 0 rows and 16 fields>

In [12]:
%%sql
-- id and given_name for the rows with id below 10
SELECT
    id,
    given_name
FROM delta.example
WHERE id < 10

StatementMeta(cassharedspark, 30, 12, Finished, Available)

<Spark SQL result set with 3 rows and 2 fields>

# 5. You can view the update history of a table.

In [13]:
# fetch the history of the delta table as a DataFrame and render it
table_history = delta_table.history()
display(table_history)
# table_history.show() can be used instead of display()

StatementMeta(cassharedspark, 30, 13, Finished, Available)

SynapseWidget(Synapse.DataFrame, 4f5264fc-3bb5-42b7-b941-885c6eaa39d1)

# 6. You Can Use Time Travel to Query Previous Versions of Your Data

(This example is shown in Scala. You can also use Python or C#.)

In [16]:
%%spark
// same table location as the Python cells; the Scala session has its own variables
val path = "/delta/table"
// versionAsOf = 0 asks the delta reader for version 0 of the table
val versionZeroReader = spark.read.format("delta").option("versionAsOf", 0)
val df = versionZeroReader.load(path)
display(df)

StatementMeta(cassharedspark, 30, 17, Finished, Available)

path: String = /delta/table
df: org.apache.spark.sql.DataFrame = [id: int, rec_id: string ... 14 more fields]



SynapseWidget(Synapse.DataFrame, fd135b40-65ba-4b4d-b982-7136bcbb0064)

# 7. Changes to delta tables propagate between Azure services.

In the example below, we use time travel to load a specific version of a delta table that was previously created in Databricks and then modified from the original data.

We can also create an SQL table to run queries on the most recent version.

In [19]:
# Keep the shared table's location in its own variable instead of reassigning
# `path`. `path` keeps pointing at /delta/table, so re-running the earlier cells
# (including the delete/update on DeltaTable.forPath(spark, path)) cannot
# accidentally target this shared table.
databricks_delta_path = "abfss://public-data@statsconviddsinternal.dfs.core.windows.net/delta/"

# time travel: load version 4 of the shared delta table
df = spark.read.format("delta").option("versionAsOf", 4).load(databricks_delta_path)
display(df)

StatementMeta(cassharedspark, 30, 20, Finished, Available)

SynapseWidget(Synapse.DataFrame, da7791b8-381b-4172-a757-9def66332949)

In [2]:
# register the shared delta location as the SQL table delta.example2
# (nothing happens if the table is already registered)
example2_location = "abfss://public-data@statsconviddsinternal.dfs.core.windows.net/delta/"
spark.sql("CREATE TABLE IF NOT EXISTS delta.example2 USING DELTA LOCATION '{0}'".format(example2_location))

StatementMeta(cassharedspark, 27, 2, Finished, Available)

DataFrame[]

In [3]:
%%sql
-- rows of delta.example2 whose data_quality_grade is 'B'
SELECT *
FROM delta.example2
WHERE data_quality_grade = 'B'

StatementMeta(cassharedspark, 27, 3, Finished, Available)

<Spark SQL result set with 1000 rows and 31 fields>