In [1]:
from pyspark.sql import SparkSession
from delta.tables import *
from pyspark.sql.functions import *
from delta import *
import pytz as pytz
import datetime as datetime

In [2]:
# Timezone used to stamp the start/end times printed by the cells below.
tw = pytz.timezone("Asia/Taipei")

In [3]:
# Build (or reuse) a local 4-thread Spark session with the Delta Lake
# package, SQL extension, and catalog implementation configured.
spark = (
    SparkSession.builder
    .appName("test Delta Lake")
    .master("local[4]")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

In [4]:
print("start time: ", datetime.datetime.now(tw))

# Seed the Delta table with ids 0..9.
data = spark.range(0, 10)
# Use an explicit "overwrite" mode: the default save mode raises when the
# target path already exists, which would make this cell fail on every
# re-run of the notebook (e.g. Restart Kernel -> Run All).
data.write.format("delta").mode("overwrite").save("delta-table2")

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-31 00:01:32.211749+08:00
end   time:  2022-10-31 00:02:26.590818+08:00


In [5]:
print("start time: ", datetime.datetime.now(tw))

# Load the Delta table into `df`, then display it as a separate step.
# DataFrame.show() only prints and returns None, so chaining it onto the
# assignment would leave `df` bound to None instead of the DataFrame.
df = spark.read.format("delta").load("delta-table2")
df.show()

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-31 00:02:26.615382+08:00
+---+
| id|
+---+
|  7|
|  8|
|  9|
|  2|
|  3|
|  4|
|  5|
|  6|
|  0|
|  1|
+---+

end   time:  2022-10-31 00:02:32.715010+08:00


In [6]:
print("start time: ", datetime.datetime.now(tw))

# Replace the table contents with ids 10..19 via an overwrite-mode write.
data = spark.range(10, 20)
(
    data.write
    .format("delta")
    .mode("overwrite")
    .save("delta-table2")
)

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-31 00:02:32.789017+08:00
end   time:  2022-10-31 00:02:46.185900+08:00


In [7]:
print("start time: ", datetime.datetime.now(tw))

# Open the existing Delta table at the relative path used by the cells above.
deltaTable = DeltaTable.forPath(spark, "delta-table2")

# Predicate shared by the update and delete steps: rows whose id is even.
even_id = expr("id % 2 == 0")

# Step 1 - update: add 100 to every even id (the result is still even).
deltaTable.update(condition=even_id, set={"id": expr("id + 100")})

# Step 2 - delete: drop every row whose id is even.
deltaTable.delete(condition=even_id)

# Step 3 - upsert: merge ids 0..19 into the table, matching rows on id.
newData = spark.range(0, 20)
(
    deltaTable.alias("oldData")
    .merge(newData.alias("newData"), "oldData.id = newData.id")
    .whenMatchedUpdate(set={"id": col("newData.id")})
    .whenNotMatchedInsert(values={"id": col("newData.id")})
    .execute()
)

# Show the table contents after the three operations.
deltaTable.toDF().show()

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-31 00:02:46.247785+08:00
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+

end   time:  2022-10-31 00:03:35.880620+08:00


In [8]:
print("start time: ", datetime.datetime.now(tw))

# Time travel: read the table as of version 0 instead of the latest version.
version_zero_reader = spark.read.format("delta").option("versionAsOf", 0)
df = version_zero_reader.load("delta-table2")

df.show()

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-31 00:03:35.916154+08:00
+---+
| id|
+---+
|  7|
|  8|
|  9|
|  2|
|  3|
|  4|
|  5|
|  6|
|  0|
|  1|
+---+

end   time:  2022-10-31 00:03:43.703724+08:00
