In [0]:
# Read the employee CSV (first row is the header, column types are inferred)
# and cache the resulting DataFrame for the cells that follow.
employee_csv_path = '/FileStore/employee_data.csv'
df_employee = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(employee_csv_path)
    .cache()
)
# Preview the rows, then print the inferred column types.
df_employee.show()
df_employee.printSchema()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoiningDate: date (nullable = true)
 |-- Salary: integer (nullable = true)



In [0]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DoubleType

# Explicit schema for the product data; every column is declared nullable.
schema = (
    StructType()
    .add("ProductID", IntegerType(), True)
    .add("ProductName", StringType(), True)
    .add("Category", StringType(), True)
    .add("Price", DoubleType(), True)
    .add("Stock", IntegerType(), True)
)

In [0]:
# Read the product JSON using the explicit schema rather than inferring one.
product_json_path = "/FileStore/product_data.json"
df_product = spark.read.schema(schema).json(product_json_path)
df_product.show()

+---------+-----------+-----------+------+-----+
|ProductID|ProductName|   Category| Price|Stock|
+---------+-----------+-----------+------+-----+
|      101|     Laptop|Electronics|1200.0|   35|
|      102| Smartphone|Electronics| 800.0|   80|
|      103| Desk Chair|  Furniture| 150.0|   60|
|      104|    Monitor|Electronics| 300.0|   45|
|      105|       Desk|  Furniture| 350.0|   25|
+---------+-----------+-----------+------+-----+



In [0]:
# 2. Convert CSV and JSON Data to Delta Format
# Employee data: write the DataFrame out as Delta files, replacing any previous contents.
# NOTE(review): Spark paths are usually written as `dbfs:/...` or `/FileStore/...`;
# confirm that the `/dbfs/` prefix here is the intended storage location.
employee_delta_writer = df_employee.write.format("delta").mode("overwrite")
employee_delta_writer.save("/dbfs/FileStore/delta/employee_data")

# Product data: expose the DataFrame to SQL through a temporary view ...
df_product.createOrReplaceTempView("product_view")
# ... and materialise that view as a Delta table (skipped when the table already exists).
spark.sql("CREATE TABLE if not EXISTS delta_product_table USING DELTA AS SELECT * FROM product_view")

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# 3. Register Delta Tables as SQL Tables
# Register the employee Delta files written in the previous step as a SQL table.
# The LOCATION clause is required: without it this statement creates a new, empty
# table that is not backed by the files saved at that path.
spark.sql(
    "CREATE TABLE IF NOT EXISTS delta_employee_table "
    "USING DELTA LOCATION '/dbfs/FileStore/delta/employee_data'"
)


DataFrame[]

In [0]:
# 3. Data Modifications with Delta Tables
# NOTE(review): `employee_delta` and `product_delta` are not created in the cells
# shown above (those create `delta_employee_table` / `delta_product_table`);
# confirm both tables exist before running this cell.

# Give every employee in the IT department a 5% raise.
raise_it_salaries_sql = "UPDATE employee_delta SET Salary = Salary * 1.05 WHERE Department = 'IT'"
spark.sql(raise_it_salaries_sql)

# Remove every product whose stock is below 40.
delete_low_stock_sql = "DELETE FROM product_delta WHERE Stock < 40"
spark.sql(delete_low_stock_sql)

DataFrame[num_affected_rows: bigint]

In [0]:
# 4. Time Travel with Delta Tables:
# Read version 0 of the product table, used here as its state before the delete.
product_v0_sql = "SELECT * FROM product_delta VERSION AS OF 0"
df_product_version_before_delete = spark.sql(product_v0_sql)
df_product_version_before_delete.show()

# Read version 0 of the employee table, used here as its state before the salary update.
employee_v0_sql = "SELECT * FROM employee_delta VERSION AS OF 0"
df_employee_version_before_update = spark.sql(employee_v0_sql)
df_employee_version_before_update.show()

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 6500

In [0]:
# 5. Query Delta Tables:
# Employees who belong to the Finance department.
finance_sql = "SELECT * FROM employee_delta WHERE Department = 'Finance'"
df_finance_employees = spark.sql(finance_sql)
df_finance_employees.show()

# Electronics products priced above 500.
expensive_electronics_sql = "SELECT * FROM product_delta WHERE Category = 'Electronics' AND Price > 500"
df_expensive_electronics = spark.sql(expensive_electronics_sql)
df_expensive_electronics.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
+----------+-------------+----------+-----------+------+

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics|  800|      102| Smartphone|   80|
+-----------+-----+---------+-----------+-----+



In [0]:
# Copy the employee updates CSV from the workspace filesystem into DBFS.
updates_source = "file:/Workspace/Shared/employees_updates.csv"
updates_target = "dbfs:/FileStore/employees_updates.csv"
dbutils.fs.cp(updates_source, updates_target)

True

In [0]:
# Convert the employee CSV to Delta format.
# inferSchema keeps the numeric/date columns typed (matching the first load of this
# file) instead of storing every column as a string. overwriteSchema lets the
# overwrite replace a table that was previously written with string-typed columns.
df_employee = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/FileStore/employee_data.csv")
)
(
    df_employee.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/employee_data")
)

# Convert the employee updates CSV to Delta format, read with the same options so
# both tables use comparable column types.
df_employee_updates = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/FileStore/employees_updates.csv")
)
(
    df_employee_updates.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/delta/employee_updates")
)


In [0]:
# Read both Delta tables back from storage.
df_employee = spark.read.load("/delta/employee_data", format="delta")
df_employee_updates = spark.read.load("/delta/employee_updates", format="delta")

# Expose the two DataFrames to SQL: the merge target first, then the merge source.
df_employee.createOrReplaceTempView("delta_employee")
df_employee_updates.createOrReplaceTempView("employee_updates")


In [0]:
# Upsert the updates into the employee data, matching rows on EmployeeID:
# matched rows take the source Salary and Department, unmatched rows are inserted.
merge_sql = """
          MERGE INTO delta_employee AS target
          USING employee_updates AS source
          ON target.EmployeeID = source.EmployeeID
          WHEN MATCHED THEN UPDATE SET target.salary = source.Salary, target.Department=source.Department
          WHEN NOT MATCHED THEN INSERT (EmployeeID, Name, Department, JoiningDate, Salary)
          VALUES (source.EmployeeID, source.Name, source.Department, source.JoiningDate, source.Salary)
"""
spark.sql(merge_sql)


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Query the Delta table to check if the data was updated or inserted correctly
spark.sql("SELECT * FROM delta_employee").show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
|      1001|     John Doe|        HR| 2021-01-15| 58000|
|      1009|  Sarah Adams| Marketing| 2021-09-01| 60000|
|      1010|  Robert King|        IT| 2022-01-10| 62000|
+----------+-------------+----------+-----------+------+



In [0]:
# Persist the employee DataFrame as a Delta table, replacing the current contents.
employee_delta_path = "/delta/employee_data"
df_employee.write.format("delta").mode("overwrite").save(employee_delta_path)

In [0]:
# Register the Delta files at /delta/employee_data as a SQL table.
# NOTE(review): because of IF NOT EXISTS this is a no-op when `delta_employee_table`
# was already created by the earlier registration cell; confirm the table really
# points at this location.
register_employee_sql = "CREATE TABLE IF NOT EXISTS delta_employee_table USING DELTA LOCATION '/delta/employee_data'"
spark.sql(register_employee_sql)

DataFrame[]

In [0]:
# Run OPTIMIZE on the registered employee table.
optimize_sql = "OPTIMIZE delta_employee_table"
spark.sql(optimize_sql)

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>,de

In [0]:
# Show the commit history of the employee table without truncating wide columns.
history_sql = "DESCRIBE HISTORY delta_employee_table"
employee_history = spark.sql(history_sql)
employee_history.show(truncate=False)

+-------+-------------------+----------------+----------------------------------+------------+---------------------------------------------------------------------------------------------------------------------------------------+----+------------------+--------------------+-----------+-----------------+-------------+----------------+------------+------------------------------------------+
|version|timestamp          |userId          |userName                          |operation   |operationParameters                                                                                                                    |job |notebook          |clusterId           |readVersion|isolationLevel   |isBlindAppend|operationMetrics|userMetadata|engineInfo                                |
+-------+-------------------+----------------+----------------------------------+------------+------------------------------------------------------------------------------------------------------------------------