In [0]:
## 1. Load the CSV data:
# Load the employee_data.csv file into a DataFrame.
df = spark.read.csv("file:/Workspace/Shared/Employee_data.csv", header=True, inferSchema=True)
# Display the first 10 rows and inspect the schema.
df.show(10)
df.printSchema()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoiningDate: date (nullable = true)
 |-- Salary: integer (nullable = true)



In [0]:
## 2. Data Cleaning:
from pyspark.sql.functions import col, year
# Remove rows where the Salary is less than 55,000.
cleaned_df = df.filter(col("Salary") >= 55000)
cleaned_df.show()
# Filter the employees who joined after the year 2020.
cleaned_df = cleaned_df.filter(year(col("JoiningDate")) > 2020)
cleaned_df.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
+----------+-------------+----------+-----------+------+

+----------+------------+----------+-----------+------+
|EmployeeID|        Name|Department|JoiningDate|Salary|
+----------+------------+----------+-----------+------+
|      1001|    John Doe|        HR| 2021-01-15| 55000|
|      1005|David Wilson|        IT| 2021-06-25| 58000|
+----------+------------+----------+-----------+------+



In [0]:
## 3. Data Aggregation
from pyspark.sql.functions import avg
# Find the average salary by Department.
df.groupBy("Department").agg(avg("Salary").alias("AverageSalary"))
df.show()
# Count the number of employees in each Department.
Employees_df = df.groupBy("Department").count()
Employees_df.show()

+----------+-------------+----------+-----------+------+
|EmployeeID|         Name|Department|JoiningDate|Salary|
+----------+-------------+----------+-----------+------+
|      1001|     John Doe|        HR| 2021-01-15| 55000|
|      1002|   Jane Smith|        IT| 2020-03-10| 62000|
|      1003|Emily Johnson|   Finance| 2019-07-01| 70000|
|      1004|Michael Brown|        HR| 2018-12-22| 54000|
|      1005| David Wilson|        IT| 2021-06-25| 58000|
|      1006|  Linda Davis|   Finance| 2020-11-15| 67000|
|      1007| James Miller|        IT| 2019-08-14| 65000|
|      1008|Barbara Moore|        HR| 2021-03-29| 53000|
+----------+-------------+----------+-----------+------+

+----------+-----+
|Department|count|
+----------+-----+
|        HR|    3|
|   Finance|    2|
|        IT|    3|
+----------+-----+



In [0]:
# 4. Write the Data to CSV
# Save the cleaned data (from the previous steps) to a new CSV file.
cleaned_df.write.csv("Cleaned_data.csv")