CSV

In [0]:
# Copy the employee file from the workspace filesystem into DBFS so Spark can load it.
dbutils.fs.cp("file:/Workspace/Shared/employee_data.csv", "dbfs:/FileStore/employee_data.csv")

# Read the CSV with an explicit schema. With only header=true Spark loads every
# column as a string, so the salary and joining-date filters further down would
# depend on implicit string-to-number / string-to-date casts.
employee_schema = "EmployeeID INT, Name STRING, Department STRING, JoiningDate DATE, Salary INT"
df_csv = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(employee_schema)
    .load("dbfs:/FileStore/employee_data.csv")
)
df_csv.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+



In [0]:
from pyspark.sql.functions import year

# Stage 1: keep only employees whose salary is at least 55,000.
salary_ok = df_csv.where(df_csv.Salary >= 55000)
salary_ok.show()

# Stage 2: of those, keep employees whose joining year is later than 2020.
df_cleaned = salary_ok.where(year(salary_ok.JoiningDate) > 2020)
df_cleaned.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
+----------+-------------+----------+-----------+------+

+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+



In [0]:
# Both summaries share the same per-department grouping of the cleaned employees.
by_department = df_cleaned.groupby('Department')

# Mean salary within each department.
avg_salary_by_dept = by_department.agg({'Salary': 'mean'})
print("Average Salary by Department:")
display(avg_salary_by_dept)

# Number of employee IDs recorded within each department.
employee_count_by_dept = by_department.agg({'EmployeeID': 'count'})
print("Employee Count by Department:")
display(employee_count_by_dept)

Average Salary by Department:


Department,avg(Salary)
HR,55000.0
IT,58000.0


Employee Count by Department:


Department,count(EmployeeID)
HR,1
IT,1


In [0]:
# Persist the cleaned employee rows as CSV (with a header row).
# mode("overwrite") replaces output left by an earlier run; without it the writer's
# default mode raises when the target path already exists, so re-running the
# notebook would fail on this cell.
(
    df_cleaned.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("/FileStore/cleaned_employee_data.csv")
)

In [0]:
# Read the cleaned CSV back from DBFS to confirm the write.
# Note: this rebinds df_csv, so from here on it refers to the cleaned rows
# rather than the original employee file.
cleaned_reader = spark.read.option("header", "true")
df_csv = cleaned_reader.csv("/FileStore/cleaned_employee_data.csv")
df_csv.show()

+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+



JSON

In [0]:
# Copy the product JSON from the workspace filesystem into DBFS.
dbutils.fs.cp(
    "file:/Workspace/Shared/product_data.json",
    "dbfs:/FileStore/product_data.json",
)

True

In [0]:
# Load the product JSON from DBFS; the multiline option lets Spark parse a
# JSON document that spans several lines.
json_reader = spark.read.format("json").option("multiline", "true")
df = json_reader.load("/FileStore/product_data.json")

# Preview up to 10 rows, then print the inferred schema.
df.show(10)
df.printSchema()

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+

root
 |-- Category: string (nullable = true)
 |-- Price: long (nullable = true)
 |-- ProductID: long (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Stock: long (nullable = true)



In [0]:
# Keep products with at least 30 units in stock ...
in_stock = df.where(df.Stock >= 30)
# ... and, of those, only the ones in the "Electronics" category.
df_cleaned_product = in_stock.where(in_stock.Category == 'Electronics')
df_cleaned_product.show()

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|Electronics|  300|      104|    Monitor|   45|
+-----------+-----+---------+-----------+-----+



In [0]:
# Total stock across products in the "Furniture" category.
furniture_only = df.where(df.Category == 'Furniture')
df_total_furniture_stock = (
    furniture_only
    .groupBy('Category')
    .sum('Stock')
    .withColumnRenamed('sum(Stock)', 'TotalStock')
)
df_total_furniture_stock.show()

# Average price per category (one row for each Category value).
# NOTE(review): the original comment asked for the average price of all products
# in the dataset; this groups by Category instead — confirm which is intended.
df_avg_price = (
    df
    .groupBy('Category')
    .avg('Price')
    .withColumnRenamed('avg(Price)', 'AvgPrice')
)
df_avg_price.show()

+---------+----------+
| Category|TotalStock|
+---------+----------+
|Furniture|        85|
+---------+----------+

+-----------+-----------------+
|   Category|         AvgPrice|
+-----------+-----------------+
|Electronics|766.6666666666666|
|  Furniture|            250.0|
+-----------+-----------------+



In [0]:
# Save the cleaned product rows (df_cleaned_product) as JSON, collapsed to a
# single partition first. The aggregate frames are not written here.
# mode('overwrite') replaces output from an earlier run; the writer's default
# mode raises when the path already exists, which breaks re-running this cell.
(
    df_cleaned_product
    .coalesce(1)
    .write
    .mode('overwrite')
    .json('/FileStore/cleaned_product_data.json')
)

DELTA

In [0]:
# Write both DataFrames out in Delta format, replacing any existing data.
# df_csv is the employee frame and df is the product frame.
# NOTE(review): Spark APIs normally take DBFS paths without the /dbfs prefix, so
# these probably land under dbfs:/dbfs/FileStore/... — confirm. The paths are
# kept unchanged because the later cells load from the same locations.
for frame, delta_path in (
    (df_csv, "/dbfs/FileStore/delta/employee_delta"),
    (df, "/dbfs/FileStore/delta/product_delta"),
):
    frame.write.format("delta").mode("overwrite").save(delta_path)

In [0]:
# Load each Delta location and display its contents, employees first and
# products second. df_delta is rebound on each pass, so it ends up holding
# the product table.
for delta_path in (
    "/dbfs/FileStore/delta/employee_delta",
    "/dbfs/FileStore/delta/product_delta",
):
    df_delta = spark.read.load(delta_path, format="delta")
    df_delta.show()

+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+



In [0]:
# Employee table: register the Delta location as a table (if it is not
# registered yet), then raise salaries by 5% for the IT department.
# Caution: the UPDATE multiplies the stored value, so each re-run of this cell
# compounds the raise on top of the previous one.
for statement in (
    "CREATE TABLE IF NOT EXISTS employee_delta USING DELTA LOCATION '/dbfs/FileStore/delta/employee_delta'",
    "UPDATE employee_delta SET Salary = Salary * 1.05 WHERE Department = 'IT'",
):
    spark.sql(statement)
employee_delta = spark.table("employee_delta")
employee_delta.show()

# Product table: register the Delta location the same way, then delete
# products whose stock is below 40.
for statement in (
    "CREATE TABLE IF NOT EXISTS product_delta USING DELTA LOCATION '/dbfs/FileStore/delta/product_delta'",
    "DELETE FROM product_delta WHERE Stock < 40",
):
    spark.sql(statement)
product_delta = spark.table("product_delta")
product_delta.show()

+----------+------------+----------+-----------+----------+
|EmployeeID|        Name|Department|JoiningDate|    Salary|
+----------+------------+----------+-----------+----------+
|      1001|    John Doe|        HR| 2021-01-15|     55000|
|      1005|David Wilson|        IT| 2021-06-25|70499.3625|
+----------+------------+----------+-----------+----------+

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
+-----------+-----+---------+-----------+-----+



In [0]:
# Time travel: read version 0 of each Delta location and display it,
# employees first and products second. df_version is rebound on each pass,
# so it ends up holding the product snapshot.
for delta_path in (
    "/dbfs/FileStore/delta/employee_delta",
    "/dbfs/FileStore/delta/product_delta",
):
    versioned_reader = spark.read.format("delta").option("versionAsOf", 0)
    df_version = versioned_reader.load(delta_path)
    df_version.show()

+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+



In [0]:
# Employees in the Finance department, from the registered employee table.
finance_query = "SELECT * FROM employee_delta WHERE Department = 'Finance'"
df_finance_employees = spark.sql(finance_query)
df_finance_employees.show()

# Electronics products priced above 500, from the registered product table.
electronics_query = "SELECT * FROM product_delta WHERE Category = 'Electronics' AND Price > 500"
df_expensive_electronics = spark.sql(electronics_query)
df_expensive_electronics.show()

+----------+----+----------+-----------+------+
|EmployeeID|Name|Department|JoiningDate|Salary|
+----------+----+----------+-----------+------+
+----------+----+----------+-----------+------+

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics|  800|      102| Smartphone|   80|
+-----------+-----+---------+-----------+-----+

