In [21]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


spark = SparkSession.builder.getOrCreate()

employees_df = spark.read.csv("employees.csv",header=True, inferSchema=True)
attendance_df = spark.read.csv("attendance.csv", header=True,inferSchema=True)
bonuses_df = spark.read.json("bonuses.json")

#Show schemas and sample records

print("Employees Schema:")
employees_df.printSchema()
employees_df.show()

print("Attendance Schema:")
attendance_df.printSchema()
attendance_df.show()

print("Bonuses Schema:")
bonuses_df.printSchema()
bonuses_df.show()

#Count distinct departments
print("Distinct departments:",employees_df.select("Department").distinct().count())



Employees Schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+

Attendance Schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
| 

In [28]:
# 2.DataFrame Operations
from pyspark.sql.functions import to_date, datediff, current_date, round, col

#  Ensure JoinDate is in proper date format
# employees_df = employees_df.withColumn("JoinDate", to_date(col("JoinDate")))

# Add TenureYears using datediff() and round()
employees_df = employees_df.withColumn(
    "TenureYears", round(datediff(current_date(), col("JoinDate")) / 365, 2)
)

#  Calculate TotalCompensation = Salary + Bonus

emp_bonus = employees_df.join(bonuses_df, on="EmpID", how="left")
emp_bonus = emp_bonus.withColumn(
    "TotalCompensation",col("Salary") + col("Bonus")
)

# Filter employees with more than 2 years in the company
emp_bonus.filter(col("TenureYears") > 2).show()

# Show employees who report to a manager (ManagerID is not null)
manager = emp_bonus.filter(col("ManagerID").isNotNull())
manager.select("EmpID","Name").show()


+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|_corrupt_record|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|           NULL|            60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|           NULL|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|           NULL|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|           NULL|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|           NULL|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+-----------

In [30]:
# 3. Aggregation
# Average salary per department
from pyspark.sql.functions import avg

employees_df.groupBy("Department").agg(round(avg("Salary"), 2).alias("AvgSalary")).show()

# Number of employees under each manager
from pyspark.sql.functions import count

employees_df.groupBy("ManagerID").agg( count("EmpID").alias("Count")).filter("ManagerID IS NOT NULL").show()

# Count of absences per employee
from pyspark.sql.functions import count, when
absences_df = attendance_df.filter(col("Status") == "Absent")

absences_df.groupBy("EmpID").agg(count("*").alias("Days")
).show()


+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|         HR|  52500.0|
|  Marketing|  60000.0|
+-----------+---------+

+---------+-----+
|ManagerID|Count|
+---------+-----+
|        1|    4|
+---------+-----+

+-----+----+
|EmpID|Days|
+-----+----+
|    4|   2|
|    2|   1|
+-----+----+



In [29]:
#  4. Joins
# Join employees and attendance → Get attendance % (Present days / Total days).

from pyspark.sql.functions import count, when, col, round

attendance_data = attendance_df.groupBy("EmpID").agg(
    count("*").alias("TotalDays"),
    count(when(col("Status") == "Present", True)).alias("PresentDays")
).withColumn(
    "AttendancePercent", round((col("PresentDays") / col("TotalDays")) * 100, 2)
)

emp_attendance = employees_df.join(attendance_data, on="EmpID", how="left")
emp_attendance.select("EmpID", "Name", "Department", "AttendancePercent").show()


+-----+------+-----------+-----------------+
|EmpID|  Name| Department|AttendancePercent|
+-----+------+-----------+-----------------+
|    1| Anita|         HR|            100.0|
|    2|   Raj|Engineering|             50.0|
|    3|Simran|Engineering|            100.0|
|    4| Aamir|  Marketing|              0.0|
|    5| Nisha|         HR|            100.0|
+-----+------+-----------+-----------------+



In [11]:
# Join employees and bonuses -> Show top 3 employees by TotalCompensatio

emp_bonus = employees_df.join(bonuses_df, on="EmpID", how="left")

emp_bonus = emp_bonus.withColumn("TotalCompensation", col("Salary") + col("Bonus"))

emp_bonus.orderBy(col("TotalCompensation").desc()).select(
    "EmpID", "Name", "TotalCompensation").show(3)


+-----+------+-----------------+
|EmpID|  Name|TotalCompensation|
+-----+------+-----------------+
|    2|   Raj|            87000|
|    3|Simran|            81500|
|    4| Aamir|            66000|
+-----+------+-----------------+
only showing top 3 rows



In [12]:
#Multi-level join: employees + bonuses + attendance
multi_join = employees_df.join(bonuses_df, on="EmpID", how="left") \
    .join(attendance_data, on="EmpID", how="left")

multi_join = multi_join.withColumn("TotalCompensation", col("Salary") + col("Bonus"))

multi_join.select("EmpID", "Name", "Department", "TotalCompensation", "AttendancePercent").show()


+-----+------+-----------+-----------------+-----------------+
|EmpID|  Name| Department|TotalCompensation|AttendancePercent|
+-----+------+-----------+-----------------+-----------------+
|    1| Anita|         HR|            60000|            100.0|
|    2|   Raj|Engineering|            87000|             50.0|
|    3|Simran|Engineering|            81500|            100.0|
|    4| Aamir|  Marketing|            66000|              0.0|
|    5| Nisha|         HR|            54000|            100.0|
+-----+------+-----------+-----------------+-----------------+



In [34]:
from pyspark.sql.functions import year, month, substring, concat, lit, lpad, col

# 1. Extract year and month from JoinDate
date_info = employees_df.withColumn("Year", year("JoinDate")) .withColumn("Month", month("JoinDate"))

date_info.select("EmpID", "Name", "JoinDate", "Year", "Month").show()

# 2. Mask employee names (e.g., "Anita" → "A****")
emp_mask = date_info.withColumn("MaskedName", concat(substring("Name", 1, 1), lit("****")))
emp_mask.select("EmpID", "Name", "MaskedName").show()

# 3. Use substring/lpad to create EmpCode like "EMP001"
emp_code = emp_mask.withColumn( "EmpCode", concat(lit("EMP"), lpad(col("EmpID").cast("string"), 3, "0")))
emp_code.select("EmpID", "EmpCode", "Name").show()


+-----+------+----------+----+-----+
|EmpID|  Name|  JoinDate|Year|Month|
+-----+------+----------+----+-----+
|    1| Anita|2021-05-01|2021|    5|
|    2|   Raj|2020-03-15|2020|    3|
|    3|Simran|2022-07-10|2022|    7|
|    4| Aamir|2019-11-20|2019|   11|
|    5| Nisha|2023-01-05|2023|    1|
+-----+------+----------+----+-----+

+-----+------+----------+
|EmpID|  Name|MaskedName|
+-----+------+----------+
|    1| Anita|     A****|
|    2|   Raj|     R****|
|    3|Simran|     S****|
|    4| Aamir|     A****|
|    5| Nisha|     N****|
+-----+------+----------+

+-----+-------+------+
|EmpID|EmpCode|  Name|
+-----+-------+------+
|    1| EMP001| Anita|
|    2| EMP002|   Raj|
|    3| EMP003|Simran|
|    4| EMP004| Aamir|
|    5| EMP005| Nisha|
+-----+-------+------+



In [35]:
# 6. Conditional & Null Handling
# Use when/otherwise to label performance:
# “High” if Bonus > 6000
# “Medium” if 4000–6000
# “Low” otherwise

emp_bonus_df = employees_df.join(bonuses_df, on="EmpID", how="left")

from pyspark.sql.functions import when, col

emp_performance = emp_bonus_df.withColumn(
    "PerformanceLabel",
    when(col("Bonus") > 6000, "High")
    .when((col("Bonus") >= 4000) & (col("Bonus") <= 6000), "Medium")
    .otherwise("Low")
)

emp_performance.select("EmpID", "Name", "Bonus", "PerformanceLabel").show()

# Handle missing ManagerID using fillna("No Manager")

from pyspark.sql.functions import col

missing = emp_performance.withColumn("ManagerID", col("ManagerID").cast("string")).fillna({"ManagerID": "No Manager"})

missing.select("EmpID", "Name", "ManagerID").show()



+-----+------+-----+----------------+
|EmpID|  Name|Bonus|PerformanceLabel|
+-----+------+-----+----------------+
|    1| Anita| 5000|          Medium|
|    2|   Raj| 7000|            High|
|    3|Simran| 6500|            High|
|    4| Aamir| 6000|          Medium|
|    5| Nisha| 4000|          Medium|
+-----+------+-----+----------------+

+-----+------+----------+
|EmpID|  Name| ManagerID|
+-----+------+----------+
|    1| Anita|No Manager|
|    2|   Raj|         1|
|    3|Simran|         1|
|    4| Aamir|         1|
|    5| Nisha|         1|
+-----+------+----------+



In [18]:
#  7. Spark SQL
# Create and use database hr
# Save all DataFrames as tables: employees , attendance , bonuses

spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.sql("USE hr")

employees_df.write.mode("overwrite").saveAsTable("employees")
attendance_df.write.mode("overwrite").saveAsTable("attendance")
bonuses_df.write.mode("overwrite").saveAsTable("bonuses")


In [40]:
# Write SQL queries:
# Top paid employee in each department.

spark.sql("""
SELECT Department, Name, Salary
FROM (SELECT Department, Name, Salary,
           RANK() OVER (PARTITION BY Department ORDER BY Salary DESC) as rank FROM employees
)
WHERE rank = 1
""").show()


+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|Engineering|  Raj| 80000|
|         HR|Anita| 55000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+



In [39]:
# Attendance rate by department.
spark.sql("""
SELECT e.Department,
       ROUND(SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END) / COUNT(*), 2) AS AttendanceRate
FROM employees e
JOIN attendance a ON e.EmpID = a.EmpID
GROUP BY e.Department
""").show()


+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|          0.75|
|         HR|           1.0|
|  Marketing|           0.0|
+-----------+--------------+



In [41]:
# Employees joined after 2021 with salary > 70,000
spark.sql("""
SELECT EmpID, Name, Department, JoinDate, Salary
FROM employees
WHERE JoinDate > '2021-12-31' AND Salary > 70000
""").show()


+-----+------+-----------+----------+------+
|EmpID|  Name| Department|  JoinDate|Salary|
+-----+------+-----------+----------+------+
|    3|Simran|Engineering|2022-07-10| 75000|
+-----+------+-----------+----------+------+



In [20]:
# 8. Advanced (Optional)
# Use a UDF to classify department as "Tech" vs "Non-Tech"
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify_dept(dept):
    return "Tech" if dept.lower() == "engineering" else "Non-Tech"

classify_udf = udf(classify_dept, StringType())

emp_classify = employees_df.withColumn("DeptType", classify_udf(col("Department")))
emp_classify.show()


+-----+------+-----------+----------+------+---------+-----------+--------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|DeptType|
+-----+------+-----------+----------+------+---------+-----------+--------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|Non-Tech|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|    Tech|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|    Tech|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|Non-Tech|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|Non-Tech|
+-----+------+-----------+----------+------+---------+-----------+--------+



In [37]:
from pyspark.sql.functions import count, sum as _sum

# Create a view emp_attendance_summary
attendance_data = attendance_df.groupBy("EmpID").agg(
        count("*").alias("TotalDays"),
        _sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")
    ).withColumn("AttendanceRate", (col("PresentDays") / col("TotalDays")).cast("double"))

emp_attendance_summary = employees_df.join(attendance_data, "EmpID", "left")

emp_attendance_summary.createOrReplaceTempView("emp_attendance_summary")

spark.sql("SELECT * FROM emp_attendance_summary").show()


+-----+------+-----------+----------+------+---------+-----------+---------+-----------+--------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|TotalDays|PresentDays|AttendanceRate|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+--------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|        2|          2|           1.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|        2|          1|           0.5|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|        2|          2|           1.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|        2|          0|           0.0|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|        2|          2|           1.0|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+--------------+



In [38]:
# Save it as Parquet partitioned by Department
emp_attendance_summary.write .mode("overwrite").partitionBy("Department").parquet("emp_attendance_summary_parquet")
parquet_df = spark.read.parquet("emp_attendance_summary_parquet")
parquet_df.printSchema()
parquet_df.show()


root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)
 |-- TenureYears: double (nullable = true)
 |-- TotalDays: long (nullable = true)
 |-- PresentDays: long (nullable = true)
 |-- AttendanceRate: double (nullable = true)
 |-- Department: string (nullable = true)

+-----+------+----------+------+---------+-----------+---------+-----------+--------------+-----------+
|EmpID|  Name|  JoinDate|Salary|ManagerID|TenureYears|TotalDays|PresentDays|AttendanceRate| Department|
+-----+------+----------+------+---------+-----------+---------+-----------+--------------+-----------+
|    1| Anita|2021-05-01| 55000|     NULL|       4.11|        2|          2|           1.0|         HR|
|    5| Nisha|2023-01-05| 50000|        1|       2.43|        2|          2|           1.0|         HR|
|    2|   Raj|2020-03-15| 80000|        1|       5.24|        2|      