In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("HRAnalytics").getOrCreate()

In [17]:
from google.colab import drive
drive.mount('/content/drive')
employees_df = spark.read.csv("/content/drive/MyDrive/employees.csv", header=True, inferSchema=True)
attendance_df= spark.read.csv("/content/drive/MyDrive/attendance.csv", header=True, inferSchema=True)
bonuses_df = spark.read.json("/content/drive/MyDrive/bonuses.json", multiLine=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


1. Ingestion & Exploration

In [18]:
employees_df.printSchema()
employees_df.show()
attendance_df.printSchema()
attendance_df.show()
bonuses_df.printSchema()
bonuses_df.show()

root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: double (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|      1.0|
|    3|Simran|Engineering|2022-07-10| 75000|      1.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|      1.0|
|    5| Nisha|         HR|2023-01-05| 50000|      1.0|
+-----+------+-----------+----------+------+---------+

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
|    1|2024-04-02|Present|
|    2|2024-0

In [5]:
employees_df.select("Department").distinct().count()

3

2. DataFrame Operations

In [6]:
from pyspark.sql.functions import current_date, datediff, round
employees_df = employees_df.withColumn("TenureYears", round(datediff(current_date(), col("JoinDate")) / 365, 2))

In [7]:
employees_with_bonus = employees_df.join(bonuses_df, "EmpID", "left")
employees_with_bonus = employees_with_bonus.withColumn("TotalCompensation", col("Salary") + col("Bonus"))


In [8]:
employees_with_bonus.filter(col("TenureYears") > 2).show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|
|    2|   Raj|Engineering|2020-03-15| 80000|      1.0|       5.24| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|      1.0|       2.92| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|      1.0|       5.56| 6000|2023|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|      1.0|       2.43| 4000|2023|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+



In [9]:
employees_with_bonus.filter(col("ManagerID").isNotNull()).show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    2|   Raj|Engineering|2020-03-15| 80000|      1.0|       5.24| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|      1.0|       2.92| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|      1.0|       5.56| 6000|2023|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|      1.0|       2.43| 4000|2023|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+



3. Aggregation

In [11]:
employees_df.groupBy("Department").agg(avg("Salary").alias("AvgSalary")).show()


+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|         HR|  52500.0|
|  Marketing|  60000.0|
+-----------+---------+



In [12]:
employees_df.groupBy("ManagerID").count().show()

+---------+-----+
|ManagerID|count|
+---------+-----+
|     NULL|    1|
|      1.0|    4|
+---------+-----+



In [19]:
attendance_df.filter(col("Status") == "Absent") \
    .groupBy("EmpID").count().withColumnRenamed("count", "AbsenceCount").show()

+-----+------------+
|EmpID|AbsenceCount|
+-----+------------+
|    4|           2|
|    2|           1|
+-----+------------+



4. Joins

In [20]:
attendance_summary = attendance_df.groupBy("EmpID", "Status").count() \
    .groupBy("EmpID") \
    .pivot("Status", ["Present", "Absent"]) \
    .sum("count") \
    .fillna(0)

In [22]:
attendance_summary = attendance_summary.withColumn("AttendancePercent", round(col("Present") / (col("Present") + col("Absent")) * 100, 2))
attendance_summary.show()


+-----+-------+------+-----------------+
|EmpID|Present|Absent|AttendancePercent|
+-----+-------+------+-----------------+
|    1|      2|     0|            100.0|
|    3|      2|     0|            100.0|
|    5|      2|     0|            100.0|
|    4|      0|     2|              0.0|
|    2|      1|     1|             50.0|
+-----+-------+------+-----------------+



In [23]:
employees_with_bonus.orderBy(col("TotalCompensation").desc()).select("EmpID", "Name", "TotalCompensation").show(3)

+-----+------+-----------------+
|EmpID|  Name|TotalCompensation|
+-----+------+-----------------+
|    2|   Raj|            87000|
|    3|Simran|            81500|
|    4| Aamir|            66000|
+-----+------+-----------------+
only showing top 3 rows



In [24]:
multi_join_df = employees_df.join(bonuses_df, "EmpID").join(attendance_summary, "EmpID")
multi_join_df.show()

+-----+------+-----------+----------+------+---------+-----+----+-------+------+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|Bonus|Year|Present|Absent|AttendancePercent|
+-----+------+-----------+----------+------+---------+-----+----+-------+------+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL| 5000|2023|      2|     0|            100.0|
|    2|   Raj|Engineering|2020-03-15| 80000|      1.0| 7000|2023|      1|     1|             50.0|
|    3|Simran|Engineering|2022-07-10| 75000|      1.0| 6500|2023|      2|     0|            100.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|      1.0| 6000|2023|      0|     2|              0.0|
|    5| Nisha|         HR|2023-01-05| 50000|      1.0| 4000|2023|      2|     0|            100.0|
+-----+------+-----------+----------+------+---------+-----+----+-------+------+-----------------+



5. String & Date Functions

In [25]:
employees_df = employees_df.withColumn("JoinYear", year("JoinDate")) \
                           .withColumn("JoinMonth", month("JoinDate"))


In [26]:
employees_df = employees_df.withColumn("MaskedName", regexp_replace("Name", ".", "*"))

In [27]:
employees_df = employees_df.withColumn("EmpCode", concat(lit("EMP"), lpad(col("EmpID").cast("string"), 3, "0")))
employees_df.select("EmpID", "Name", "MaskedName", "EmpCode").show()

+-----+------+----------+-------+
|EmpID|  Name|MaskedName|EmpCode|
+-----+------+----------+-------+
|    1| Anita|     *****| EMP001|
|    2|   Raj|       ***| EMP002|
|    3|Simran|    ******| EMP003|
|    4| Aamir|     *****| EMP004|
|    5| Nisha|     *****| EMP005|
+-----+------+----------+-------+



6. Conditional & Null Handling

In [28]:
bonuses_df = bonuses_df.withColumn("Performance", when(col("Bonus") > 6000, "High")
                                   .when(col("Bonus").between(4000, 6000), "Medium")
                                   .otherwise("Low"))



In [29]:
employees_df = employees_df.fillna({"ManagerID": "No Manager"})


7. Spark SQL

In [31]:
spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.catalog.setCurrentDatabase("hr")
employees_df.write.mode("overwrite").saveAsTable("employees")
attendance_df.write.mode("overwrite").saveAsTable("attendance")
bonuses_df.write.mode("overwrite").saveAsTable("bonuses")


In [32]:
spark.sql("""
SELECT Department, Name, MAX(Salary) as TopSalary
FROM employees
GROUP BY Department, Name
ORDER BY Department
""").show()

spark.sql("""
SELECT e.Department, ROUND(AVG(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END)/COUNT(*), 2) AS AttendanceRate
FROM employees e
JOIN attendance a ON e.EmpID = a.EmpID
GROUP BY e.Department
""").show()

spark.sql("""
SELECT * FROM employees
WHERE JoinDate > '2021-01-01' AND Salary > 70000
""").show()

+-----------+------+---------+
| Department|  Name|TopSalary|
+-----------+------+---------+
|Engineering|   Raj|    80000|
|Engineering|Simran|    75000|
|         HR| Nisha|    50000|
|         HR| Anita|    55000|
|  Marketing| Aamir|    60000|
+-----------+------+---------+

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|          0.19|
|         HR|          0.25|
|  Marketing|           0.0|
+-----------+--------------+

+-----+------+-----------+----------+------+---------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+--------+---------+----------+-------+
|    3|Simran|Engineering|2022-07-10| 75000|      1.0|    2022|        7|    ******| EMP003|
+-----+------+-----------+----------+------+---------+--------+---------+----------+-------+



8. Advanced

In [33]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
def classify_dept(dept):
    return "Tech" if dept == "Engineering" else "Non-Tech"

classify_dept_udf = udf(classify_dept, StringType())

employees_df = employees_df.withColumn("DeptType", classify_dept_udf("Department"))


In [34]:
emp_attendance_summary = multi_join_df.select("EmpID", "Name", "Department", "AttendancePercent")
emp_attendance_summary.createOrReplaceTempView("emp_attendance_summary")

emp_attendance_summary.write.mode("overwrite").partitionBy("Department").parquet("/content/emp_attendance_summary")