In [38]:
from pyspark.sql import SparkSession
from delta.tables import *
from pyspark.sql.functions import *
from delta import *
import pytz as pytz
import datetime as datetime

In [39]:
# Time zone used for the start/end timestamps printed around each step.
tw = pytz.timezone("Asia/Taipei")

In [40]:
# Build (or reuse) a single-threaded local Spark session with Delta Lake
# enabled: the delta-core package is requested via spark.jars.packages, and
# the Delta SQL extension and catalog are registered on the session.
spark = (
    SparkSession.builder
    .appName("test Delta Lake")
    .master("local[1]")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.0.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

In [41]:
print("start time: ", datetime.datetime.now(tw))

# Seed the Delta table with ids 0-9.
# The table lives at a path relative to the working directory; point save()
# at an absolute location such as "/tmp/delta-table" to store it elsewhere.
#
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# the write raises as soon as "delta-table1" already exists, so a second
# "Restart & Run All" would fail here.
data = spark.range(0, 10)
data.write.format("delta").mode("overwrite").save("delta-table1")

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-30 23:58:09.061035+08:00
end   time:  2022-10-30 23:58:15.369874+08:00


In [42]:
print("start time: ", datetime.datetime.now(tw))

# Load the Delta table, then display it as a separate step.
# DataFrame.show() only prints and returns None, so chaining it onto the
# assignment would bind `df` to None instead of the loaded DataFrame.
df = spark.read.format("delta").load("delta-table1")
df.show()

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-30 23:58:15.391464+08:00
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+

end   time:  2022-10-30 23:58:16.747805+08:00


In [43]:
print("start time: ", datetime.datetime.now(tw))

# Replace the table contents with ids 10-19 by writing in overwrite mode.
data = spark.range(10, 20)
(
    data.write
    .format("delta")
    .mode("overwrite")
    .save("delta-table1")
)

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-30 23:58:16.765571+08:00
end   time:  2022-10-30 23:58:24.470930+08:00


In [44]:
print("start time: ", datetime.datetime.now(tw))

# Open the existing Delta table by path so it can be modified in place.
deltaTable = DeltaTable.forPath(spark, "delta-table1")

# Step 1: add 100 to every even id (the result is still even).
deltaTable.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)

# Step 2: remove every row whose id is even.
deltaTable.delete(condition=expr("id % 2 == 0"))

# Step 3: upsert ids 0-19. Rows whose id already exists are updated,
# and ids that are missing from the table are inserted.
newData = spark.range(0, 20)

(
    deltaTable.alias("oldData")
    .merge(newData.alias("newData"), "oldData.id = newData.id")
    .whenMatchedUpdate(set={"id": col("newData.id")})
    .whenNotMatchedInsert(values={"id": col("newData.id")})
    .execute()
)

# Show the table contents after the three modifications.
deltaTable.toDF().show()

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-30 23:58:24.534133+08:00
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+

end   time:  2022-10-30 23:58:51.930297+08:00


In [45]:
print("start time: ", datetime.datetime.now(tw))

df = spark.read.format("delta") \
  .option("versionAsOf", 0) \
  .load("delta-table1")

df.show()

print("end   time: ", datetime.datetime.now(tw))

start time:  2022-10-30 23:58:51.949936+08:00
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+

end   time:  2022-10-30 23:58:58.759889+08:00
