In [0]:
# Move the file from Workspace to DBFS
dbutils.fs.cp("file:/Workspace/Shared/employee_data.csv", "dbfs:/FileStore/employee_data.csv")

# Load CSV data into a DataFrame
df_employee = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/employee_data.csv")

# Write DataFrame to Delta format with overwrite option
df_employee.write.format("delta").mode("overwrite").save("/delta/employee data")

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Define schema for JSON file
schema = StructType([
    StructField("ProductID", IntegerType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Price", DoubleType(), True),
    StructField("Stock", IntegerType(), True)
])

# Move the file from Workspace to DBFS 
dbutils.fs.cp("file:/Workspace/Shared/product_data.json", "dbfs:/FileStore/product_data.json")

# Load JSON data with schema
df_product = spark.read.format("json").schema(schema).load("dbfs:/FileStore/product_data.json")
df_product.show()

# Create a temp view for SQL operations
df_product.createOrReplaceTempView("product_view")

# Create a Delta table from the view if it does not already exist
spark.sql("""
    CREATE TABLE IF NOT EXISTS delta_product_table_
    USING DELTA
    AS SELECT * FROM product_view
""")

+---------+-----------+-----------+------+-----+
|ProductID|ProductName|   Category| Price|Stock|
+---------+-----------+-----------+------+-----+
|      101|     Laptop|Electronics|1200.0|   35|
|      102| Smartphone|Electronics| 800.0|   80|
|      103| Desk Chair|  Furniture| 150.0|   60|
|      104|    Monitor|Electronics| 300.0|   45|
|      105|       Desk|  Furniture| 350.0|   25|
+---------+-----------+-----------+------+-----+



DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
spark.sql("SELECT * FROM delta_product_table_")

DataFrame[ProductID: int, ProductName: string, Category: string, Price: double, Stock: int]

employee_updates.csv


In [0]:
# Move the file from Workspace to DBFS
dbutils.fs.cp("file:/Workspace/Shared/employee_updates.csv", "dbfs:/FileStore/employee_updates.csv")

True

In [0]:
# Convert employee CSV data to Delta format
df_employee = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/employee_updates.csv")
df_employee.write.format("delta").mode("overwrite").save("/delta/employee_data")

In [0]:
# Convert employee updates CSV data to Delta format
df_employee_updates = spark.read.format("csv").option("header", "true").load("/FileStore/employee_updates.csv")
df_employee_updates.write.format("delta").mode("overwrite").save("/delta/employee_updates")

In [0]:
# Load Delta tables
df_employee = spark.read.format("delta").load("/delta/employee_data")
df_employee_updates = spark.read.format("delta").load("/delta/employee_updates")

# Create temporary views for SQL operations
df_employee.createOrReplaceTempView("delta_employee")
df_employee_updates.createOrReplaceTempView ("employee_updates")

In [0]:
spark.sql("""
        MERGE INTO delta_employee AS target
        USING employee_updates AS source
        ON target.EmployeeID = source.EmployeeID
        WHEN MATCHED THEN UPDATE SET target.Salary = source.Salary, target.Department = source.Department
        WHEN NOT MATCHED THEN INSERT (EmployeeID, Name, Department, JoiningDate, Salary)
        VALUES (source.EmployeeID, source.Name, source.Department, source.JoiningDate, source.Salary)
""")

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
spark.sql("SELECT * FROM delta_employee").show()

+----------+-----------+----------+-----------+------+
|EmployeeID|       Name|Department|JoiningDate|Salary|
+----------+-----------+----------+-----------+------+
|      1001|   John Doe|        HR| 2021-01-15| 58000|
|      1009|Sarah Adams| Marketing| 2021-09-01| 60000|
|      1010|Robert King|        IT| 2022-01-10| 62000|
+----------+-----------+----------+-----------+------+



In [0]:
# Write the employee DataFrame to a Delta table
df_employee.write.format("delta").mode("overwrite").save("delta/employee_data")

In [0]:
# Register the Delta Table
spark.sql("CREATE TABLE IF NOT EXISTS delta_employee_table USING DELTA LOCATION '/delta/employee_data'")

In [0]:
# Optimize the Delta Table
spark.sql("OPTIMIZE delta_employee_table")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

In [0]:
spark.sql("SELECT * FROM delta_employee_table").show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1002|   Jane Smith|        IT| 2020-03-10| 65100|
|      1005| David Wilson|        IT| 2021-06-25| 60900|
|      1007| James Miller|        IT| 2019-08-14| 68250|
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+



In [0]:
# Describe the history of the Delta Table
spark.sql("DESCRIBE HISTORY delta_employee_table").show(truncate=False)

+-------+-------------------+----------------+----------------------------------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+------------------------------------------+
|version|timestamp          |userId          |userName                          |operation             |operationParameters                                                                                                                    |job |notebook          |clusterId           |readVersion|

In [0]:
spark.sql("OPTIMIZE delta_employee_table ZORDER BY Department")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

In [0]:
spark.sql("VACUUM delta_employee_table RETAIN 168 HOURS")

DataFrame[path: string]