 Assignment 1: Working with CSV Data (employee_data.csv)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, avg, sum, count
# Create (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("DataProcessing").getOrCreate()


# Step 1 - Load the employee CSV.
# The file has a header row, and column types are inferred from the data.
df_employee = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:/Workspace/Shared/employee_data.csv")
)
print("First 10 rows of employee data:")
df_employee.show(10)
print("Employee data schema:")
df_employee.printSchema()

# Step 2 - Data cleaning.
# Keep only employees earning at least 55,000 (i.e. drop salaries below
# 55,000) whose joining year is later than 2020.
salary_at_least_55k = col("Salary") >= 55000
joined_after_2020 = year(col("JoiningDate")) > 2020

df_employee_cleaned = (
    df_employee
    .where(salary_at_least_55k)
    .where(joined_after_2020)
)

# Step 3 - Data aggregation over the cleaned rows, grouped by department.
employees_by_dept = df_employee_cleaned.groupBy("Department")

# Average salary per department.
avg_salary_by_dept = employees_by_dept.agg(avg("Salary").alias("AvgSalary"))
print("Average salary by department:")
avg_salary_by_dept.show()

# Number of employees per department (counts non-null EmployeeID values).
employee_count_by_dept = employees_by_dept.agg(
    count("EmployeeID").alias("EmployeeCount")
)
print("Employee count by department:")
employee_count_by_dept.show()

# Step 4 - Persist the cleaned rows as CSV with a header row, replacing any
# previous output at the same location.
(
    df_employee_cleaned.write
    .mode("overwrite")
    .option("header", True)
    .csv("/Workspace/Shared/cleaned_employee_data.csv")
)



First 10 rows of employee data:
+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

Employee data schema:
root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoiningDate: date (nullable = true)
 |-- Salary: integer (nullable = true)

Average salary by department:
+----------+---------+
|

Assignment 2: Working with JSON Data (product_data.json)

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Source input in the workspace, and a separate destination for the result.
# The result must NOT be written back to the source path: saving with
# mode("overwrite") there would replace the original product_data.json with
# Spark's output, destroying the input and breaking any re-run of this cell.
source_json_path = "file:/Workspace/Shared/product_data.json"
electronics_output_path = "file:/Workspace/Shared/electronics_product_data.json"

# Stage the source file in DBFS, then read it from there.
dbutils.fs.cp(source_json_path, "dbfs:/FileStore/product_data.json")

# "multiline" lets Spark parse JSON records that span several lines.
product_df = spark.read.option("multiline", "true").json("/FileStore/product_data.json")

# First 10 rows
product_df.show(10)

# Data Cleaning
# Remove rows where stock is less than 30
product_df_cleaned = product_df.filter(col("Stock") >= 30)

# Filter products in Electronics category (from the cleaned rows)
df_electronics = product_df_cleaned.filter(col("Category") == "Electronics")
df_electronics.show()

# Data Aggregation
# Calculate the total stock for products in the "Furniture" category.
# This is computed over the full dataset, not the stock-filtered one.
df_furniture = product_df.filter(col("Category") == "Furniture").agg(F.sum("Stock").alias("TotalFurnitureStock"))
df_furniture.show()

# Find the average price of all products in the dataset.
df_avg = product_df.agg(F.avg("Price").alias("AveragePrice"))
df_avg.show()

# Write the cleaned Electronics rows as JSON to the dedicated output path,
# replacing any previous output there while leaving the source file intact.
df_electronics.write.format("json").mode("overwrite").save(electronics_output_path)


+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+

+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|Electronics|  300|      104|    Monitor|   45|
+-----------+-----+---------+-----------+-----+

+-------------------+
|TotalFurnitureStock|
+-------------------+
|                 85|
+-------------------+

+------------+
|AveragePrice|
+------------+
|       560.0|
+------------+



Assignment 3: Working with Delta Tables

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Stage the employee CSV in DBFS so it can be read from /FileStore.
dbutils.fs.cp("file:/Workspace/Shared/employee_data.csv", "dbfs:/FileStore/employee_data.csv")

product_df = spark.read.option("multiline", "true").json("/FileStore/product_data.json")

# Infer column types while reading. Without inferSchema every CSV column is
# loaded as a string, so the Delta table would store Salary as text and the
# salary update below would do arithmetic on a string column.
employee_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/FileStore/employee_data.csv")
)

# Storage locations of the two Delta tables.
employee_delta_path = "/Workspace/Shared/employee_table"
product_delta_path = "/Workspace/Shared/product_table"

# Converting CSV and JSON to delta format

# overwriteSchema allows replacing a table previously written at this path
# with different column types (e.g. an earlier all-string version).
(
    employee_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(employee_delta_path)
)

product_df.write.format("delta").mode("overwrite").save(product_delta_path)

# Register Delta Tables on top of the files written above.

spark.sql(f"CREATE TABLE IF NOT EXISTS employee_delta USING DELTA LOCATION '{employee_delta_path}'")
spark.sql(f"CREATE TABLE IF NOT EXISTS product_delta USING DELTA LOCATION '{product_delta_path}'")

# Data modification

# Increase the salary by 5% for all employees in the IT department.
# NOTE(review): the result is presumably cast back to Salary's inferred
# column type by Delta - confirm this is acceptable for non-whole results.
spark.sql("update employee_delta set Salary = Salary * 1.05 where Department = 'IT'")

# Delete products where the stock is less than 40.
spark.sql("delete from product_delta where Stock < 40")

# Time Travel
# NOTE(review): version 0 is the first commit of each table. If this cell is
# re-run against already-existing tables, version 0 is not necessarily the
# commit immediately preceding the latest update/delete - confirm.

print("Products before delete operation")
spark.sql("SELECT * FROM product_delta VERSION AS OF 0").show()

print("Employees before update operation")
spark.sql("SELECT * FROM employee_delta VERSION AS OF 0").show()

# Query Delta Tables

# Query employees in finance department
spark.sql("SELECT * FROM employee_delta WHERE Department = 'Finance'").show()

# Query Electronics products with price > 500
spark.sql("SELECT * FROM product_delta WHERE Category = 'Electronics' AND Price > 500").show()

Products before delete operation
+-----------+-----+---------+-----------+-----+
|   Category|Price|ProductID|ProductName|Stock|
+-----------+-----+---------+-----------+-----+
|Electronics| 1200|      101|     Laptop|   35|
|Electronics|  800|      102| Smartphone|   80|
|  Furniture|  150|      103| Desk Chair|   60|
|Electronics|  300|      104|    Monitor|   45|
|  Furniture|  350|      105|       Desk|   25|
+-----------+-----+---------+-----------+-----+

Employees before update operation
+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-1