In [0]:
# 1. Convert CSV and JSON Data to Delta Format:
# Convert the employee_data.csv and product_data.json into Delta Tables.Save the Delta tables to a specified location.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

Employee_data_path = "file:/Workspace/Shared/Employee_data.csv"
Product_data_path = "file:/Workspace/Shared/Product_data.json"

schema = StructType([
    StructField("ProductID", IntegerType(), True),StructField("ProductName", StringType(), True),StructField("Category", StringType(), True),StructField("Price", IntegerType(), True),StructField("Stock", IntegerType(), True)])

Employee_df = spark.read.csv(Employee_data_path, header=True, inferSchema=True)
Employee_df.show()
Employee_df.write.saveAsTable("delta_employee_table")
Employee_df.write.format("delta").mode("overwrite").save("/Workspace/Shared/delta_Employee_table")
df_Employee_delta = spark.read.format("delta").load("/Workspace/Shared/delta_Employee_table")
df_Employee_delta.show()


Product_df = spark.read.json(Product_data_path, schema=schema)
Product_df.show()
Product_df.write.saveAsTable("delta_product_table")
Product_df.write.format("delta").mode("overwrite").save("/Workspace/Shared/delta_Product_table")
df_Product_delta = spark.read.format("delta").load("/Workspace/Shared/delta_Product_table")
df_Product_delta.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   F

In [0]:
# 2. Register Delta Tables:
# Register both the employee and product Delta tables as SQL tables.

df_Employee_delta = spark.read.format("delta").load("/Workspace/Shared/delta_Employee_table")
df_Employee_delta.show()

df_Product_delta = spark.read.format("delta").load("/Workspace/Shared/delta_Product_table")
df_Product_delta.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Elec

In [0]:
# 3. Data Modifications with Delta Tables:

# Perform an update operation on the employee Delta table: Increase the salary by 5% for all employees in the IT department.
spark.sql("""
    UPDATE delta.`/Workspace/Shared/delta_Employee_table`
    SET Salary = Salary * 1.05
    WHERE Department = 'IT'
""")

df_Employee_delta = spark.read.format("delta").load("/Workspace/Shared/delta_Employee_table")
df_Employee_delta.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
|      1002|   Jane Smith|        IT| 2020-03-10| 65100|
|      1005| David Wilson|        IT| 2021-06-25| 60900|
|      1007| James Miller|        IT| 2019-08-14| 68250|
+----------+-------------+----------+-----------+------+



In [0]:
# Perform a delete operation on the product Delta table: Delete products where the stock is less than 40.
spark.sql("""
    DELETE FROM delta.`/Workspace/Shared/delta_Product_table`
    WHERE Stock < 40
""")

df_Product_delta = spark.read.format("delta").load("/Workspace/Shared/delta_Product_table")
df_Product_delta.show()

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Electronics|  300|   45|
+---------+-----------+-----------+-----+-----+



In [0]:
# 4. Time Travel with Delta Tables:

# Query the product Delta table to show its state before the delete operation (use time travel).
df_version_before_delete = spark.sql("SELECT * FROM delta_Product_table VERSION AS OF 0")
df_version_before_delete.show()

# Retrieve the version of the employee Delta table before the salary update.
df_version_before_update = spark.sql("SELECT * FROM delta_Employee_table VERSION AS OF 0")
df_version_before_update.show()

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
|      103| Desk Chair|  Furniture|  150|   60|
|      104|    Monitor|Electronics|  300|   45|
|      105|       Desk|  Furniture|  350|   25|
+---------+-----------+-----------+-----+-----+

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 6500

In [0]:
# 5. Query Delta Tables:

# Query the employee Delta table to find the employees in the Finance department.
df_fin_emp = spark.sql("SELECT * FROM delta_Employee_table WHERE Department = 'Finance'")
df_fin_emp.show()

# Query the product Delta table to find all products in the Electronics category with a price greater than 500.
df_elec_products = spark.sql("SELECT * FROM delta_product_table WHERE Category = 'Electronics' AND Price > 500")
df_elec_products.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
+----------+-------------+----------+-----------+------+

+---------+-----------+-----------+-----+-----+
|ProductID|ProductName|   Category|Price|Stock|
+---------+-----------+-----------+-----+-----+
|      101|     Laptop|Electronics| 1200|   35|
|      102| Smartphone|Electronics|  800|   80|
+---------+-----------+-----------+-----+-----+

