In [5]:
from google.colab import files
uploaded = files.upload()

Saving bonuses.json to bonuses.json
Saving attendance.csv to attendance.csv
Saving employees (1).csv to employees (1).csv


In [6]:
# Renaming
!mv 'employees (1).csv' employees.csv

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month, regexp_replace, concat_ws, lpad, substring, lit, col, when, round, datediff, current_date, count, sum as _sum

spark = SparkSession.builder.appName("HRAnalytics").getOrCreate()

# Load the datasets
employees_df = spark.read.option("header", "true").option("inferSchema", "true").csv("employees.csv")
attendance_df = spark.read.option("header", "true").option("inferSchema", "true").csv("attendance.csv")
bonuses_df = spark.read.option("multiline", "true").json("bonuses.json")

**Task 1: Ingestion & Exploration**

In [8]:
from pyspark.sql import SparkSession

# Start Spark session
spark = SparkSession.builder.appName("HRAnalytics").getOrCreate()

# Read employees.csv
employees_df = spark.read.option("header", "true").option("inferSchema", "true").csv("employees.csv")

# Read attendance.csv
attendance_df = spark.read.option("header", "true").option("inferSchema", "true").csv("attendance.csv")

# Read bonuses.json
bonuses_df = spark.read.option("multiline", "true").json("bonuses.json")

# Show schemas
print(" Employees Schema:")
employees_df.printSchema()

print("\n Attendance Schema:")
attendance_df.printSchema()

print("\n Bonuses Schema:")
bonuses_df.printSchema()

# Show sample records
print("\n Sample Employees Data:")
employees_df.show()

print("\n Sample Attendance Data:")
attendance_df.show()

print("\n Sample Bonuses Data:")
bonuses_df.show()

# Count distinct departments
print("\n Distinct Departments Count:")
employees_df.select("Department").distinct().show()


 Employees Schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: double (nullable = true)


 Attendance Schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)


 Bonuses Schema:
root
 |-- Bonus: long (nullable = true)
 |-- EmpID: long (nullable = true)
 |-- Year: long (nullable = true)


 Sample Employees Data:
+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|      1.0|
|    3|Simran|Engineering|2022-07-10| 75000|      1.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|      1.0|
|    5| Nisha|         HR|2023-01-05| 50000|      1.0|
+-----+----

**Task 2: DataFrame Operations**

In [9]:
from pyspark.sql.functions import datediff, current_date, round, col

#  1. Add a column TenureYears using datediff() and round()
employees_with_tenure = employees_df.withColumn(
    "TenureYears",
    round(datediff(current_date(), col("JoinDate")) / 365, 2)
)

print(" Tenure (in years):")
employees_with_tenure.select("EmpID", "Name", "JoinDate", "TenureYears").show()

#  2. Join with bonuses to calculate TotalCompensation = Salary + Bonus
emp_bonus_df = employees_with_tenure.join(bonuses_df, on="EmpID", how="left")
emp_with_comp = emp_bonus_df.withColumn("TotalCompensation", col("Salary") + col("Bonus"))

print(" Employees with Total Compensation:")
emp_with_comp.select("EmpID", "Name", "Salary", "Bonus", "TotalCompensation").show()

#  3. Filter employees with more than 2 years in the company
print(" Employees with > 2 years tenure:")
emp_with_comp.filter(col("TenureYears") > 2).select("EmpID", "Name", "TenureYears").show()

#  4. Show employees who report to a manager (ManagerID is not null)
print(" Employees who report to a manager:")
emp_with_comp.filter(col("ManagerID").isNotNull()).select("EmpID", "Name", "ManagerID").show()


 Tenure (in years):
+-----+------+----------+-----------+
|EmpID|  Name|  JoinDate|TenureYears|
+-----+------+----------+-----------+
|    1| Anita|2021-05-01|       4.11|
|    2|   Raj|2020-03-15|       5.24|
|    3|Simran|2022-07-10|       2.92|
|    4| Aamir|2019-11-20|       5.56|
|    5| Nisha|2023-01-05|       2.43|
+-----+------+----------+-----------+

 Employees with Total Compensation:
+-----+------+------+-----+-----------------+
|EmpID|  Name|Salary|Bonus|TotalCompensation|
+-----+------+------+-----+-----------------+
|    1| Anita| 55000| 5000|            60000|
|    2|   Raj| 80000| 7000|            87000|
|    3|Simran| 75000| 6500|            81500|
|    4| Aamir| 60000| 6000|            66000|
|    5| Nisha| 50000| 4000|            54000|
+-----+------+------+-----+-----------------+

 Employees with > 2 years tenure:
+-----+------+-----------+
|EmpID|  Name|TenureYears|
+-----+------+-----------+
|    1| Anita|       4.11|
|    2|   Raj|       5.24|
|    3|Simran|   

**Task 3: Aggregation**

In [10]:
from pyspark.sql.functions import avg, count

# 1. Average salary per department
print(" Average Salary by Department:")
employees_df.groupBy("Department") \
    .agg(avg("Salary").alias("AverageSalary")) \
    .show()

#  2. Number of employees under each manager
print(" Employees under each Manager:")
employees_df.groupBy("ManagerID") \
    .agg(count("EmpID").alias("NumEmployees")) \
    .filter(col("ManagerID").isNotNull()) \
    .show()

#  3. Count of absences per employee
print(" Absences per Employee:")
attendance_df.filter(col("Status") == "Absent") \
    .groupBy("EmpID") \
    .agg(count("Status").alias("AbsentDays")) \
    .show()


 Average Salary by Department:
+-----------+-------------+
| Department|AverageSalary|
+-----------+-------------+
|Engineering|      77500.0|
|         HR|      52500.0|
|  Marketing|      60000.0|
+-----------+-------------+

 Employees under each Manager:
+---------+------------+
|ManagerID|NumEmployees|
+---------+------------+
|      1.0|           4|
+---------+------------+

 Absences per Employee:
+-----+----------+
|EmpID|AbsentDays|
+-----+----------+
|    4|         2|
|    2|         1|
+-----+----------+



**Task 4: Joins**

In [11]:
from pyspark.sql.functions import when, count, sum as _sum, round

#  1. Join employees and attendance → Get attendance % (Present days / Total days)
attendance_summary = attendance_df.groupBy("EmpID") \
    .agg(
        count("Status").alias("TotalDays"),
        _sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")
    ) \
    .withColumn("AttendancePercentage", round((col("PresentDays") / col("TotalDays")) * 100, 2))

print(" Attendance % per Employee:")
attendance_summary.show()

#  2. Join with employees to include employee details
emp_attendance_df = employees_df.join(attendance_summary, on="EmpID", how="left")

#  3. Join with bonuses → Show top 3 employees by TotalCompensation
emp_bonus_df = emp_attendance_df.join(bonuses_df, on="EmpID", how="left")
emp_bonus_df = emp_bonus_df.withColumn("TotalCompensation", col("Salary") + col("Bonus"))

print(" Top 3 Employees by Total Compensation:")
emp_bonus_df.select("EmpID", "Name", "Salary", "Bonus", "TotalCompensation") \
    .orderBy(col("TotalCompensation").desc()) \
    .show(3)

#  4. Multi-level join: employees + bonuses + attendance
print(" Full Multi-Join Result:")
emp_bonus_df.select("EmpID", "Name", "Department", "TotalDays", "PresentDays", "AttendancePercentage", "TotalCompensation").show()


 Attendance % per Employee:
+-----+---------+-----------+--------------------+
|EmpID|TotalDays|PresentDays|AttendancePercentage|
+-----+---------+-----------+--------------------+
|    1|        2|          2|               100.0|
|    3|        2|          2|               100.0|
|    5|        2|          2|               100.0|
|    4|        2|          0|                 0.0|
|    2|        2|          1|                50.0|
+-----+---------+-----------+--------------------+

 Top 3 Employees by Total Compensation:
+-----+------+------+-----+-----------------+
|EmpID|  Name|Salary|Bonus|TotalCompensation|
+-----+------+------+-----+-----------------+
|    2|   Raj| 80000| 7000|            87000|
|    3|Simran| 75000| 6500|            81500|
|    4| Aamir| 60000| 6000|            66000|
+-----+------+------+-----+-----------------+
only showing top 3 rows

 Full Multi-Join Result:
+-----+------+-----------+---------+-----------+--------------------+-----------------+
|EmpID|  Nam

**Task 5: String & Date Functions**

In [14]:
from pyspark.sql.functions import year, month, regexp_replace, concat_ws, lpad, substring

#  1. Extract year and month from JoinDate
print(" Year and Month from JoinDate:")
employees_df.withColumn("JoinYear", year("JoinDate")) \
    .withColumn("JoinMonth", month("JoinDate")) \
    .select("EmpID", "Name", "JoinDate", "JoinYear", "JoinMonth") \
    .show()

#  2. Mask employee names using regex (e.g., A***)
print(" Masked Employee Names:")
masked_names_df = employees_df.withColumn("MaskedName", regexp_replace("Name", r"(?<=^.).", "*"))
masked_names_df.select("EmpID", "Name", "MaskedName").show()

#  3. Use substring() to create EmpCode like "EMP001"
print(" Employee Codes:")
emp_code_df = employees_df.withColumn("EmpCode", concat_ws("", lit("EMP"), lpad(col("EmpID").cast("string"), 3, "0")))
emp_code_df.select("EmpID", "Name", "EmpCode").show()


 Year and Month from JoinDate:
+-----+------+----------+--------+---------+
|EmpID|  Name|  JoinDate|JoinYear|JoinMonth|
+-----+------+----------+--------+---------+
|    1| Anita|2021-05-01|    2021|        5|
|    2|   Raj|2020-03-15|    2020|        3|
|    3|Simran|2022-07-10|    2022|        7|
|    4| Aamir|2019-11-20|    2019|       11|
|    5| Nisha|2023-01-05|    2023|        1|
+-----+------+----------+--------+---------+

 Masked Employee Names:
+-----+------+----------+
|EmpID|  Name|MaskedName|
+-----+------+----------+
|    1| Anita|     A*ita|
|    2|   Raj|       R*j|
|    3|Simran|    S*mran|
|    4| Aamir|     A*mir|
|    5| Nisha|     N*sha|
+-----+------+----------+

 Employee Codes:
+-----+------+-------+
|EmpID|  Name|EmpCode|
+-----+------+-------+
|    1| Anita| EMP001|
|    2|   Raj| EMP002|
|    3|Simran| EMP003|
|    4| Aamir| EMP004|
|    5| Nisha| EMP005|
+-----+------+-------+



**Task 6: Conditional & Null Handling**

In [15]:
from pyspark.sql.functions import when

# 1. Use when/otherwise to label performance based on Bonus:
performance_df = emp_bonus_df.withColumn(
    "Performance",
    when(col("Bonus") > 6000, "High")
    .when((col("Bonus") >= 4000) & (col("Bonus") <= 6000), "Medium")
    .otherwise("Low")
)

print(" Employee Performance Labels:")
performance_df.select("EmpID", "Name", "Bonus", "Performance").show()

# 2. Handle missing ManagerID by filling nulls with "No Manager"
# ManagerID is numeric, so convert nulls to a string label in a new column for display

performance_df = performance_df.withColumn(
    "ManagerID_Display",
    when(col("ManagerID").isNull(), "No Manager").otherwise(col("ManagerID").cast("string"))
)

print(" Manager Info with Null Handling:")
performance_df.select("EmpID", "Name", "ManagerID", "ManagerID_Display").show()


 Employee Performance Labels:
+-----+------+-----+-----------+
|EmpID|  Name|Bonus|Performance|
+-----+------+-----+-----------+
|    1| Anita| 5000|     Medium|
|    2|   Raj| 7000|       High|
|    3|Simran| 6500|       High|
|    4| Aamir| 6000|     Medium|
|    5| Nisha| 4000|     Medium|
+-----+------+-----+-----------+

 Manager Info with Null Handling:
+-----+------+---------+-----------------+
|EmpID|  Name|ManagerID|ManagerID_Display|
+-----+------+---------+-----------------+
|    1| Anita|     NULL|       No Manager|
|    2|   Raj|      1.0|              1.0|
|    3|Simran|      1.0|              1.0|
|    4| Aamir|      1.0|              1.0|
|    5| Nisha|      1.0|              1.0|
+-----+------+---------+-----------------+



**Task 7: Spark SQL**

In [16]:
# 1. Create and use database hr
spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.sql("USE hr")

# 2. Save DataFrames as tables in the hr database
employees_df.write.mode("overwrite").saveAsTable("employees")
attendance_df.write.mode("overwrite").saveAsTable("attendance")
bonuses_df.write.mode("overwrite").saveAsTable("bonuses")

# 3. Write and run SQL queries:

# a. Top paid employee in each department
print(" Top Paid Employee in Each Department:")
spark.sql("""
    SELECT Department, Name, Salary
    FROM (
        SELECT *, ROW_NUMBER() OVER (PARTITION BY Department ORDER BY Salary DESC) AS rn
        FROM employees
    )
    WHERE rn = 1
""").show()

# b. Attendance rate by department
print(" Attendance Rate by Department:")
spark.sql("""
    SELECT e.Department,
           ROUND(SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS AttendanceRate
    FROM employees e
    JOIN attendance a ON e.EmpID = a.EmpID
    GROUP BY e.Department
""").show()

# c. Employees joined after 2021 with salary > 70,000
print(" Employees Joined After 2021 with Salary > 70,000:")
spark.sql("""
    SELECT Name, Department, JoinDate, Salary
    FROM employees
    WHERE JoinDate > '2021-12-31' AND Salary > 70000
""").show()


 Top Paid Employee in Each Department:
+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|Engineering|  Raj| 80000|
|         HR|Anita| 55000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+

 Attendance Rate by Department:
+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|         75.00|
|         HR|        100.00|
|  Marketing|          0.00|
+-----------+--------------+

 Employees Joined After 2021 with Salary > 70,000:
+------+-----------+----------+------+
|  Name| Department|  JoinDate|Salary|
+------+-----------+----------+------+
|Simran|Engineering|2022-07-10| 75000|
+------+-----------+----------+------+



**Task 8: Advanced (Optional)**


In [17]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Define UDF
def classify_department(dept):
    return "Tech" if dept in ["Engineering", "IT"] else "Non-Tech"

# Register UDF
classify_udf = udf(classify_department, StringType())

# Apply to employees_df
classified_df = employees_df.withColumn("DeptType", classify_udf(col("Department")))

print(" Department Classification:")
classified_df.select("EmpID", "Name", "Department", "DeptType").show()


 Department Classification:
+-----+------+-----------+--------+
|EmpID|  Name| Department|DeptType|
+-----+------+-----------+--------+
|    1| Anita|         HR|Non-Tech|
|    2|   Raj|Engineering|    Tech|
|    3|Simran|Engineering|    Tech|
|    4| Aamir|  Marketing|Non-Tech|
|    5| Nisha|         HR|Non-Tech|
+-----+------+-----------+--------+



In [21]:
from pyspark.sql.functions import count, when, col, sum as _sum

# Join to get attendance count
summary_df = attendance_df.groupBy("EmpID") \
    .agg(
        count("*").alias("TotalDays"),
        _sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")
    )



In [23]:
from pyspark.sql.functions import count, when, col, sum as _sum

# Step 1: Recalculate attendance summary
summary_df = attendance_df.groupBy("EmpID") \
    .agg(
        count("*").alias("TotalDays"),
        _sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")
    )

# Step 2: Join with employees DataFrame
emp_attendance_summary = employees_df.join(summary_df, on="EmpID", how="left")

# Step 3: Save as Parquet partitioned by Department
emp_attendance_summary.write \
    .mode("overwrite") \
    .partitionBy("Department") \
    .parquet("/content/emp_attendance_summary_parquet")

print(" emp_attendance_summary saved as Parquet partitioned by Department.")


 emp_attendance_summary saved as Parquet partitioned by Department.


In [24]:
df_parquet = spark.read.parquet("/content/emp_attendance_summary_parquet")
df_parquet.show()

+-----+------+----------+------+---------+---------+-----------+-----------+
|EmpID|  Name|  JoinDate|Salary|ManagerID|TotalDays|PresentDays| Department|
+-----+------+----------+------+---------+---------+-----------+-----------+
|    2|   Raj|2020-03-15| 80000|      1.0|        2|          1|Engineering|
|    3|Simran|2022-07-10| 75000|      1.0|        2|          2|Engineering|
|    1| Anita|2021-05-01| 55000|     NULL|        2|          2|         HR|
|    5| Nisha|2023-01-05| 50000|      1.0|        2|          2|         HR|
|    4| Aamir|2019-11-20| 60000|      1.0|        2|          0|  Marketing|
+-----+------+----------+------+---------+---------+-----------+-----------+

