In [0]:
# Where the sample file lives on DBFS.
products_csv_path = "dbfs:/tmp/products.csv"

# Inline sample dataset: one header row followed by five product records.
csv_data = """id,name,category,price
1,Amit,Electronics,50000
2,Priya,Furniture,3000
3,Rahul,Stationery,200
4,Neha,Books,800
5,Karthik,Electronics,45000
"""

# Write the sample to DBFS; overwrite=True replaces a file left by an earlier run.
dbutils.fs.put(products_csv_path, csv_data, overwrite=True)

Wrote 139 bytes.


True

In [0]:
# Configure the CSV reader: take column names from the first line and let
# Spark derive column types from the data rather than reading everything as text.
csv_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)

# Load the products file from DBFS and render it.
df = csv_reader.csv("dbfs:/tmp/products.csv")
display(df)

id,name,category,price
1,Amit,Electronics,50000
2,Priya,Furniture,3000
3,Rahul,Stationery,200
4,Neha,Books,800
5,Karthik,Electronics,45000


In [0]:
# Store the products DataFrame in Delta format at a fixed path; "overwrite"
# replaces whatever the path currently holds.
products_writer = df.write.format("delta").mode("overwrite")
products_writer.save("/tmp/delta/products")

In [0]:
# Read the Delta data back from its path and render it.
delta_reader = spark.read.format("delta")
df_delta = delta_reader.load("/tmp/delta/products")
display(df_delta)

id,name,category,price
1,Amit,Electronics,50000
2,Priya,Furniture,3000
3,Rahul,Stationery,200
4,Neha,Books,800
5,Karthik,Electronics,45000


In [0]:
from delta.tables import DeltaTable

# Handle on the Delta table stored at this path; used for the update below.
delta_table = DeltaTable.forPath(spark, "/tmp/delta/products")

# Change the price of the product whose id is 2.
# The row filter and the new value are both passed as strings.
price_change = {"price": "3500"}
delta_table.update(condition="id = 2", set=price_change)

# Confirm the change by rendering the DataFrame loaded from the same path.
display(df_delta)

id,name,category,price
1,Amit,Electronics,50000
3,Rahul,Stationery,200
4,Neha,Books,800
5,Karthik,Electronics,45000
2,Priya,Furniture,3500


In [0]:
# Rows to upsert into the products table, matched on id:
#   id=2 is already in the table  -> the existing row is updated
#   id=6 is not in the table      -> a new row is inserted
new_data = [
    (2, "Priya", "Furniture", 4000),
    (6, "Sneha", "Kitchen", 1200),
]

# Build the source DataFrame with the target table's own schema. Passing only
# column names would make Spark infer the Python ints as bigint, while the
# table's types came from CSV inference (presumably int -- confirm with
# printSchema). Reusing the target schema keeps source and target column types
# identical, so the merge does not depend on implicit casts.
# The tuples above follow the table's column order: id, name, category, price.
target_schema = delta_table.toDF().schema
updates_df = spark.createDataFrame(new_data, schema=target_schema)

# Upsert: update every column of rows whose id matches, insert the rest.
(
    delta_table.alias("target")
    .merge(updates_df.alias("source"), "target.id = source.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Check the table contents again after the merge.
display(df_delta)

id,name,category,price
1,Amit,Electronics,50000
3,Rahul,Stationery,200
4,Neha,Books,800
5,Karthik,Electronics,45000
2,Priya,Furniture,4000
6,Sneha,Kitchen,1200


In [0]:
# Time travel: a Delta table keeps earlier versions of itself, so an older
# state can be read back. Plain CSV or JSON files offer no such history.

# Current state of the table.
current_df = delta_table.toDF()
current_df.show()

# State of the table at version 0. This is the first recorded version, not
# necessarily the one immediately before the current version.
previous_df = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load("/tmp/delta/products")
)
previous_df.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
|  2|  Priya|  Furniture| 4000|
|  6|  Sneha|    Kitchen| 1200|
+---+-------+-----------+-----+

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



In [0]:
# Partitioning
# Segregate the stored data by the values of one or more columns (here:
# category). This can help reads that filter on that column.
products_partitioned_path = "/tmp/delta/products_partitioned"

(
    df.write.format("delta")
    .mode("overwrite")
    .partitionBy("category")
    .save(products_partitioned_path)
)


In [0]:
# Read the partitioned Delta data back and print it to confirm the write.
partitioned_df = spark.read.format("delta").load("/tmp/delta/products_partitioned")
partitioned_df.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  5|Karthik|Electronics|45000|
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
+---+-------+-----------+-----+

