### Delta Lake
- **Understand SPARK Session**
- **Check for Catalog Implementation [Hive]**
- **List the databases in a catalog**
- **List the tables in a databases within a catalog**
- **Understand DBFS**
- **Upload files to DBFS**
- **Browse DBFS using UI and using DBUTILS**
- **Create Parquet format table using PySpark**
- **Create Delta table using PySpark**
- **Understand the difference between Parquet and Delta Table**
- **How Delta Table manages Metadata? What is Delta Log?**
- **How Versioning works in Delta Lake?**
- **DML Operations in Delta Table**
- **Time Travel in Delta Lake**
- **Schema Evolution in Delta Lake**
- **Convert Parquet to Delta Table**
- **Restoring to a version in Delta Lake**
- **Vacuum operations in Delta Lake**

In [0]:
# SPARK Session
spark

In [0]:
# Check for SPARK SQL Catalog Implementation - In-Memory, Hive, Unity Catalog
spark.conf.get("spark.sql.catalogImplementation")


'hive'

In [0]:

%sql
show databases;

databaseName
default


In [0]:
%sql
create database qbex

In [0]:

%sql
show tables in qbex;

database,tableName,isTemporary
,_sqldf,True


In [0]:
# Employee Data and Schema
emp_data = [
    ["E1","Pratap","Hyderabad",1000],
    ["E2","Sruthi","Anantapur",2000],
    ["E3","Kiyanshitha","Vizag",3000]
]

emp_schema = "empno string, empname string, city string, salary integer"

In [0]:
# Create DataFrame from Data and Schema
emp_df = spark.createDataFrame(data=emp_data, schema=emp_schema)
display(emp_df)

empno,empname,city,salary
E1,Pratap,Hyderabad,1000
E2,Sruthi,Anantapur,2000
E3,Kiyanshitha,Vizag,3000


In [0]:
# display(dbutils.fs.ls("/FileStore/data/"))

dbutils.fs.head("/FileStore/data/emp_file.csv")

'empno,empname,city,salary\r\nE1,Pratap,Hyderabad,1000\r\nE2,Sruthi,Anantapur,2000\r\nE3,Kiyanshitha,Vizag,3000\r\n'

In [0]:
emp_fs_df = spark.read.format("csv") \
                       .option("header", True) \
                       .option("inferSchema", True) \
                       .load("/FileStore/data/emp_file.csv")    
display(emp_fs_df)

empno,empname,city,salary
E1,Pratap,Hyderabad,1000
E2,Sruthi,Anantapur,2000
E3,Kiyanshitha,Vizag,3000


In [0]:
# Understand DBFS

In [0]:
# Upload Files to DBFS

In [0]:
# Browse Files in DBFS

In [0]:
# Create Parquet format table using PySpark
emp_fs_df.write.format("parquet").mode("overwrite").option("path","/FileStore/data/sales_parquet_2").saveAsTable("sales_parquet_2")

In [0]:

%sql
describe extended sales_parquet_2

col_name,data_type,comment
empno,string,
empname,string,
city,string,
salary,int,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,default,
Table,sales_parquet_2,
Owner,root,


In [0]:
%sql
select * from sales_parquet_2

empno,empname,city,salary
E1,Pratap,Hyderabad,1000
E2,Sruthi,Anantapur,2000
E3,Kiyanshitha,Vizag,3000


In [0]:
# Understand the File Structure of Parquet Table
display(dbutils.fs.ls("/FileStore/data/sales_parquet_2"))

path,name,size,modificationTime
dbfs:/FileStore/data/sales_parquet_2/_SUCCESS,_SUCCESS,0,1734945510000
dbfs:/FileStore/data/sales_parquet_2/_committed_822393280317452396,_committed_822393280317452396,122,1734945510000
dbfs:/FileStore/data/sales_parquet_2/_started_822393280317452396,_started_822393280317452396,0,1734945507000
dbfs:/FileStore/data/sales_parquet_2/part-00000-tid-822393280317452396-c3b667af-027e-496e-bdca-42f60f791629-25-1-c000.snappy.parquet,part-00000-tid-822393280317452396-c3b667af-027e-496e-bdca-42f60f791629-25-1-c000.snappy.parquet,1255,1734945510000


In [0]:
# Create Delta format table using PySpark
emp_fs_df.write.format("delta").mode("overwrite").option("path", "/FileStore/data/sales_delta_2").saveAsTable("sales_delta_2")

In [0]:

%sql
describe extended sales_delta_2

col_name,data_type,comment
empno,string,
empname,string,
city,string,
salary,int,
,,
# Delta Statistics Columns,,
Column Names,"empno, empname, city, salary",
Column Selection Method,first-32,
,,
# Detailed Table Information,,


In [0]:
%sql
select * from sales_delta_2

empno,empname,city,salary
E1,Pratap,Hyderabad,1000
E2,Sruthi,Anantapur,2000
E3,Kiyanshitha,Vizag,3000


In [0]:
# Understand the File Structure of Delta Table & also Delta Log
display(dbutils.fs.ls("/FileStore/data/sales_delta_2/_delta_log/"))

path,name,size,modificationTime
dbfs:/FileStore/data/sales_delta_2/_delta_log/00000000000000000000.crc,00000000000000000000.crc,2743,1734945703000
dbfs:/FileStore/data/sales_delta_2/_delta_log/00000000000000000000.json,00000000000000000000.json,1869,1734945694000
dbfs:/FileStore/data/sales_delta_2/_delta_log/_commits/,_commits/,0,0


In [0]:
%sql
describe history sales_delta_2

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2024-12-23T09:21:34Z,141785285425900,qbex.azure@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(4005581910905466),1223-070356-nnvl4fpv,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 1255)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
%sql
-- select * from sales_delta_2;
select * from sales_parquet_2;

-- update sales_parquet_2 set salary = 9999 where empno = 'E1';
-- DELETE from sales_parquet_2 where empno = 'E1';
-- insert into sales_parquet_2 values ('E4','nirupama','hyderabad',50000)

empno,empname,city,salary
E1,Pratap,Hyderabad,1000
E2,Sruthi,Anantapur,2000
E3,Kiyanshitha,Vizag,3000
E4,nirupama,hyderabad,50000


In [0]:
%sql
-- select * from sales_delta_2;
describe history sales_delta_2;
-- select * from sales_parquet_2;

-- update sales_delta_2 set salary = 8888 where empno = 'E1';
-- DELETE from sales_delta_2 where empno = 'E1';
-- insert into sales_delta_2 values ('E4','nirupama','hyderabad',50000)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
4,2024-12-23T09:30:47Z,141785285425900,qbex.azure@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(4005581910905466),1223-070356-nnvl4fpv,3.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 1242)",,Databricks-Runtime/15.4.x-scala2.12
3,2024-12-23T09:29:14Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#3836 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1713, numDeletionVectorsUpdated -> 0, scanTimeMs -> 718, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 995)",,Databricks-Runtime/15.4.x-scala2.12
2,2024-12-23T09:28:14Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#2854 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 2189, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1357, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 832)",,Databricks-Runtime/15.4.x-scala2.12
1,2024-12-23T09:28:02Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#2283 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3129, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1751, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 1364)",,Databricks-Runtime/15.4.x-scala2.12
0,2024-12-23T09:21:34Z,141785285425900,qbex.azure@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(4005581910905466),1223-070356-nnvl4fpv,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 1255)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
# Query Delta table of different versions - Time Travel

In [0]:
# Schema Evolution - Change schema for one record
# Employee Data and Schema
emp_data = [
    ["E5","Chand","Hyderabad",77777, "India"]
]

emp_schema = "empno string, empname string, city string, salary integer, countr string"

In [0]:
emp_new_df = spark.createDataFrame(data=emp_data, schema=emp_schema)

In [0]:
emp_new_df.write.format("delta").mode("append").option("mergeSchema",True).option("path", "/FileStore/data/sales_delta_2").saveAsTable("sales_delta_2")

In [0]:
%sql
select * from sales_delta_2

empno,empname,city,salary,countr
E5,Chand,Hyderabad,77777,India
E1,Pratap,Hyderabad,8888,
E2,Sruthi,Anantapur,2000,
E3,Kiyanshitha,Vizag,3000,
E4,nirupama,hyderabad,50000,


In [0]:
# DeltaTable Library
from delta import DeltaTable
dt = DeltaTable.forName(spark,"sales_delta_2")
display(dt.history())

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
5,2024-12-23T09:47:25Z,141785285425900,qbex.azure@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(4005581910905466),1223-070356-nnvl4fpv,4.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 1466)",,Databricks-Runtime/15.4.x-scala2.12
4,2024-12-23T09:30:47Z,141785285425900,qbex.azure@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(4005581910905466),1223-070356-nnvl4fpv,3.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 1242)",,Databricks-Runtime/15.4.x-scala2.12
3,2024-12-23T09:29:14Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#3836 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1713, numDeletionVectorsUpdated -> 0, scanTimeMs -> 718, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 995)",,Databricks-Runtime/15.4.x-scala2.12
2,2024-12-23T09:28:14Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#2854 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 2189, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1357, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 832)",,Databricks-Runtime/15.4.x-scala2.12
1,2024-12-23T09:28:02Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#2283 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3129, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1751, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 1364)",,Databricks-Runtime/15.4.x-scala2.12
0,2024-12-23T09:21:34Z,141785285425900,qbex.azure@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(4005581910905466),1223-070356-nnvl4fpv,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 1255)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
#Convert Parquet to Delta Table
DeltaTable.isDeltaTable(spark,"/FileStore/data/sales_parquet_2")

False

In [0]:
DeltaTable.convertToDelta(spark,"parquet.`/FileStore/data/sales_parquet_2`")

<delta.tables.DeltaTable at 0x7ffa7db53a50>

In [0]:
display(dbutils.fs.ls("/FileStore/data/sales_parquet_2"))

path,name,size,modificationTime
dbfs:/FileStore/data/sales_parquet_2/_SUCCESS,_SUCCESS,0,1734946217000
dbfs:/FileStore/data/sales_parquet_2/_committed_4376580291416143822,_committed_4376580291416143822,124,1734946217000
dbfs:/FileStore/data/sales_parquet_2/_committed_6589554690584742466,_committed_6589554690584742466,123,1734946013000
dbfs:/FileStore/data/sales_parquet_2/_committed_822393280317452396,_committed_822393280317452396,122,1734945510000
dbfs:/FileStore/data/sales_parquet_2/_delta_log/,_delta_log/,0,0
dbfs:/FileStore/data/sales_parquet_2/_started_4376580291416143822,_started_4376580291416143822,0,1734946216000
dbfs:/FileStore/data/sales_parquet_2/_started_6589554690584742466,_started_6589554690584742466,0,1734946012000
dbfs:/FileStore/data/sales_parquet_2/_started_822393280317452396,_started_822393280317452396,0,1734945507000
dbfs:/FileStore/data/sales_parquet_2/part-00000-tid-4376580291416143822-0a9fa6c7-7345-4370-9549-b8701811137e-193-1-c000.snappy.parquet,part-00000-tid-4376580291416143822-0a9fa6c7-7345-4370-9549-b8701811137e-193-1-c000.snappy.parquet,1218,1734946216000
dbfs:/FileStore/data/sales_parquet_2/part-00000-tid-6589554690584742466-751593de-6e17-44bf-b88d-1a91277f76a6-80-1-c000.snappy.parquet,part-00000-tid-6589554690584742466-751593de-6e17-44bf-b88d-1a91277f76a6-80-1-c000.snappy.parquet,1218,1734946013000


In [0]:
%sql
describe extended sales_parquet_2
-- convert to delta sales_parquet_2

col_name,data_type,comment
empno,string,
empname,string,
city,string,
salary,int,
,,
# Delta Statistics Columns,,
Column Names,"empname, city, salary, empno",
Column Selection Method,first-32,
,,
# Detailed Table Information,,


In [0]:

%sql
-- restore sales_delta_2 to version as of 1
-- select * from sales_delta_2
describe history sales_delta_2

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
6,2024-12-23T10:00:34Z,141785285425900,qbex.azure@gmail.com,RESTORE,"Map(version -> 1, timestamp -> null)",,List(4005581910905466),1223-070356-nnvl4fpv,5.0,Serializable,False,"Map(numRestoredFiles -> 1, removedFilesSize -> 3963, numRemovedFiles -> 3, restoredFilesSize -> 1255, numOfFilesAfterRestore -> 1, tableSizeAfterRestore -> 1255)",,Databricks-Runtime/15.4.x-scala2.12
5,2024-12-23T09:47:25Z,141785285425900,qbex.azure@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(4005581910905466),1223-070356-nnvl4fpv,4.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 1466)",,Databricks-Runtime/15.4.x-scala2.12
4,2024-12-23T09:30:47Z,141785285425900,qbex.azure@gmail.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,List(4005581910905466),1223-070356-nnvl4fpv,3.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 1, numOutputBytes -> 1242)",,Databricks-Runtime/15.4.x-scala2.12
3,2024-12-23T09:29:14Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#3836 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1713, numDeletionVectorsUpdated -> 0, scanTimeMs -> 718, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 995)",,Databricks-Runtime/15.4.x-scala2.12
2,2024-12-23T09:28:14Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#2854 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 2189, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1357, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 832)",,Databricks-Runtime/15.4.x-scala2.12
1,2024-12-23T09:28:02Z,141785285425900,qbex.azure@gmail.com,UPDATE,"Map(predicate -> [""(empno#2283 = E1)""])",,List(4005581910905466),1223-070356-nnvl4fpv,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 1, numRemovedBytes -> 1255, numCopiedRows -> 2, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 3129, numDeletionVectorsUpdated -> 0, scanTimeMs -> 1751, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 1255, rewriteTimeMs -> 1364)",,Databricks-Runtime/15.4.x-scala2.12
0,2024-12-23T09:21:34Z,141785285425900,qbex.azure@gmail.com,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> false, properties -> {}, statsOnLoad -> false)",,List(4005581910905466),1223-070356-nnvl4fpv,,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 3, numOutputBytes -> 1255)",,Databricks-Runtime/15.4.x-scala2.12


In [0]:
#Vacuum operations in Delta Lake
from delta import DeltaTable
dt = DeltaTable.forName(spark,"sales_delta_2")

spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled","false")
dt.vacuum(0)
# # Get table properties
# properties = dt.detail().collect()

# # Display properties
# for prop in properties:
#     print(prop)

DataFrame[]

In [0]:
%sql
select * from sales_delta_2@v0

org.apache.spark.SparkException: [FAILED_READ_FILE.DBR_FILE_NOT_EXIST] Error while reading file dbfs:/FileStore/data/sales_delta_2/part-00000-60f5a60d-da17-4821-ab3e-4957102d2ed7-c000.snappy.parquet. [DELTA_FILE_NOT_FOUND_DETAILED] File dbfs:/FileStore/data/sales_delta_2/part-00000-60f5a60d-da17-4821-ab3e-4957102d2ed7-c000.snappy.parquet referenced in the transaction log cannot be found. This occurs when data has been manually deleted from the file system rather than using the table `DELETE` statement. For more information, see https://docs.databricks.com/delta/delta-intro.html#frequently-asked-questions SQLSTATE: KD001
	at org.apache.spark.sql.errors.QueryExecutionErrors$.fileNotExistErrorDBR(QueryExecutionErrors.scala:1069)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.logMissingFileNameAndThrow(FileScanRDD.scala:780)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:676)
	at org.apache.spark.util.NextIte