In [0]:
# Task 1: Creating Delta Table using Three Methods
# 1. Load the given CSV and JSON datasets into Databricks.
# 2. Create a Delta table using the following three methods:
# Create a Delta table from a DataFrame.
# Use SQL to create a Delta table.
# Convert both the CSV and JSON files into Delta format.
dbutils.fs.cp("file:/Workspace/Shared/employee.csv", "dbfs:/FileStore/employee.csv")
# Load the file from DBFS
df_csv = spark.read.format("csv").option("header", "true").load("/FileStore/employee.csv")
df_csv.show()

dbutils.fs.cp("file:/Workspace/Shared/products.json", "dbfs:/FileStore/products.json")

# Load the file from DBFS
df_json = spark.read.option("multiline", "true").json("/FileStore/products.json")
df_json.show()



+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       102|       Alice|    Finance| 2023-02-15| 70000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
+----------+------------+-----------+-----------+------+



In [0]:

df_csv.write.format("delta").mode("overwrite").save("/Workspace/Shared/employees_delta")

df_csv.createOrReplaceTempView("employees_view")

spark.sql("""
CREATE TABLE employees_deltasql
USING DELTA
AS SELECT * FROM employees_view
""")

df_csv.write.format("delta").mode("overwrite").save("/Workspace/Shared/employees_delta_csv")

In [0]:
dbutils.fs.cp("file:/Workspace/Shared/products.json","dbfs:/FileStore/products.json")
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType

#Define schema for JSON file
schema=StructType([
    StructField("ProductID",StringType(),True),
    StructField("ProductName",StringType(),True),
    StructField("Category",StringType(),True),
    StructField("Price",IntegerType(),True),
])

#Load JSON data with schema

df_product=spark.read.format("json").schema(schema).load("/FileStore/products.json")
df_product.show()

+---------+-----------+--------+-----+
|ProductID|ProductName|Category|Price|
+---------+-----------+--------+-----+
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
|     NULL|       NULL|    NULL| NULL|
+---------+-----------+--------+-----+



In [0]:
# Task 2: Merge and Upsert (Slowly Changing Dimension - SCD)
# 1. Load the Delta table for employees created in Task 1.
# 2. Merge the new employee data into the employees Delta table.
# 3. If an employee exists, update their salary. If the employee is new, insert
# their details.
employees_delta = spark.read.format("delta").load("/Workspace/Shared/employees_delta")

# New employee data
new_employee_data = [(102, "Alice", "Finance", "2023-02-15", 75000),  # Updated Salary
                     (106, "Olivia", "HR", "2023-06-10", 65000)]  # New Employee

columns = ["EmployeeID", "EmployeeName", "Department", "JoiningDate", "Salary"]
new_employees_df = spark.createDataFrame(new_employee_data, columns)

# Merge using Delta Lake's merge functionality
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "/Workspace/Shared/employees_delta")

delta_table.alias("tgt").merge(
    new_employees_df.alias("src"),
    "tgt.EmployeeID = src.EmployeeID"
).whenMatchedUpdate(set={"Salary": "src.Salary"}) \
 .whenNotMatchedInsertAll() \
 .execute()



In [0]:
# Task 3: Internals of Delta Table
delta_table.history().show()


+-------+-------------------+---------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|         userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+---------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      3|2024-09-17 05:14:37|538936938566393|azuser2126_mml.lo...|    MERGE|{predicate -> ["(...|NULL|{4178872502868695}|0917-044935-aaw3fra4|          2|WriteSerializable|        false|{numTargetRowsCop...|        NULL|Databricks-Runtim...|
|      2|2024-09-17 05:00:36|538

In [0]:
previous_version = delta_table.history(1).select("version").collect()[0][0] - 1

df_time_travel = spark.read.format("delta").option("versionAsOf", previous_version).load("/Workspace/Shared/employees_delta")
df_time_travel.show()


+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       102|       Alice|    Finance| 2023-02-15| 70000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
+----------+------------+-----------+-----------+------+



In [0]:
# Task 4: Optimize Delta Table
spark.sql("OPTIMIZE delta.`/Workspace/Shared/employees_delta`")


DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
spark.sql("OPTIMIZE delta.`/Workspace/Shared/employees_delta` ZORDER BY (Department)")


DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# Task 5: Time Travel with Delta Table
df_time_travel.show()


+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       102|       Alice|    Finance| 2023-02-15| 70000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
+----------+------------+-----------+-----------+------+



In [0]:
specific_version = 1  
df_specific_version = spark.read.format("delta").option("versionAsOf", specific_version).load("/Workspace/Shared/employees_delta")
df_specific_version.show()


+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       102|       Alice|    Finance| 2023-02-15| 70000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
+----------+------------+-----------+-----------+------+



In [0]:
# Task 6: Vacuum Delta Table
# 1. Use the vacuum operation on the employees Delta table to remove old versions
# and free up disk space.
# 2. Set the retention period to 7 days and ensure that old files are deleted.
spark.sql("VACUUM delta.`/Workspace/Shared/employees_delta` ")
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = false")
spark.sql("VACUUM delta.`/Workspace/Shared/employees_delta` RETAIN 168 HOURS")  


DataFrame[path: string]