In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
spark = SparkSession.builder.appName("deo").enableHiveSupport().getOrCreate()

In [3]:
spark

# **Ingestion & Exploration**

In [4]:
# 1.1 Read all 3 files (CSV + JSON) using PySpark.
df_employees = spark.read.csv(r"/content/employees.csv", header=True, inferSchema=True)
df_attendence = spark.read.csv(r"/content/attendence.csv", header=True, inferSchema=True)
df_bonous = spark.read.option("multiline", "true").json(r"/content/bounuses.json")

In [5]:
# 1.2 Show schemas and sample records.
df_attendence.show()
df_attendence.printSchema()
print("--------------------------------------------------------------------")
df_employees.show()
df_employees.printSchema()
print("--------------------------------------------------------------------")
df_bonous.show()
df_bonous.printSchema()
print("--------------------------------------------------------------------")

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
|    1|2024-04-02|Present|
|    2|2024-04-01| Absent|
|    2|2024-04-02|Present|
|    3|2024-04-01|Present|
|    3|2024-04-02|Present|
|    4|2024-04-01| Absent|
|    4|2024-04-02| Absent|
|    5|2024-04-01|Present|
|    5|2024-04-02|Present|
+-----+----------+-------+

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

--------------------------------------------------------------------
+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|       

In [6]:
# 1.3 Count distinct departments.
deptCount = df_employees.select("Department").distinct().count()
print(f"Distinct department count: {deptCount}")

Distinct department count: 3


# **DataFrame Operations**

In [7]:
# 2.1 Add a column TenureYears using datediff() and round() .
df_employees = df_employees.withColumn("TenureYears", F.abs(F.round((F.date_diff("JoinDate", F.current_date()) / 365), 2)))
df_employees.show()

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|
+-----+------+-----------+----------+------+---------+-----------+



In [8]:
# 2.2 Calculate TotalCompensation = Salary + Bonus
df_emp_bon = df_employees.join(df_bonous, on="EmpID", how="inner")
df_emp_bon.withColumn("TotalCompensation", df_emp_bon.Salary + df_emp_bon.Bonus).show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+



In [9]:
# 2.3 Filter employees with more than 2 years in the company.
df_employees.filter(df_employees.TenureYears > 2).show()

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|
+-----+------+-----------+----------+------+---------+-----------+



In [10]:
# 2.4 Show employees who report to a manager ( ManagerID is not null ).
df_employees.filter(df_employees.ManagerID.isNotNull()).show()

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|
+-----+------+-----------+----------+------+---------+-----------+



# **Aggregation**

In [11]:
# 3.1 Average salary per department.
df_employees.groupBy("Department").agg(
    F.mean(df_employees.Salary).alias("AverageSalary")
).show()

+-----------+-------------+
| Department|AverageSalary|
+-----------+-------------+
|Engineering|      77500.0|
|         HR|      52500.0|
|  Marketing|      60000.0|
+-----------+-------------+



In [12]:
# 3.2 Number of employees under each manager.
df_employees.filter(df_employees.ManagerID.isNotNull()).groupBy("ManagerID").agg(
    F.count("EmpID").alias("CountEmployee")
).show()

+---------+-------------+
|ManagerID|CountEmployee|
+---------+-------------+
|        1|            4|
+---------+-------------+



In [13]:
# 3.3 Count of absences per employee.
df_attendence.filter(df_attendence.Status == "Absent").groupBy("EmpID").agg(
    F.count("Status").alias("AbscentCount")
).show()

+-----+------------+
|EmpID|AbscentCount|
+-----+------------+
|    4|           2|
|    2|           1|
+-----+------------+



# **Joins**

In [14]:
# 4.1 Join employees and attendance → Get attendance % (Present days / Total days).
emp_att = df_employees.join(df_attendence, on="EmpID", how="inner")
att = emp_att.groupBy("EmpID").agg(
    F.count("*").alias("TotalDays"),
    F.sum(F.when(F.col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")
)
att_summary = att.withColumn("Attendance%", (att.PresentDays / att.TotalDays) * 100).join(df_employees.select("EmpID", "Department"), on="EmpID", how="inner")
att_summary.show()

+-----+---------+-----------+-----------+-----------+
|EmpID|TotalDays|PresentDays|Attendance%| Department|
+-----+---------+-----------+-----------+-----------+
|    1|        2|          2|      100.0|         HR|
|    3|        2|          2|      100.0|Engineering|
|    5|        2|          2|      100.0|         HR|
|    4|        2|          0|        0.0|  Marketing|
|    2|        2|          1|       50.0|Engineering|
+-----+---------+-----------+-----------+-----------+



In [15]:
# 4.2 Join employees and bonuses → Show top 3 employees by TotalCompensation.
emp_bon = df_employees.join(df_bonous, on="EmpID", how="inner")
emp_bon = emp_bon.withColumn("TotalCompensation", emp_bon.Salary + emp_bon.Bonus)
emp_bon.select(["Name", "TotalCompensation"]).sort("TotalCompensation", ascending=False).show(3)

+------+-----------------+
|  Name|TotalCompensation|
+------+-----------------+
|   Raj|            87000|
|Simran|            81500|
| Aamir|            66000|
+------+-----------------+
only showing top 3 rows



In [16]:
# 4.3 Multi-level join: employees + bonuses + attendance .
emp_bon_att = emp_bon.join(df_attendence, on="EmpID", how="inner")
emp_bon_att.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|      Date| Status|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|2024-04-02|Present|
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|2024-04-01|Present|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|2024-04-02|Present|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|2024-04-01| Absent|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|            81500|2024-04-02|Present|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|   

# **String & Date Functions**

In [17]:
# 5.1 Extract year and month from JoinDate .
df_employees.withColumns(
    {
        "year": F.year(df_employees.JoinDate) ,
        "month": F.month(df_employees.JoinDate)
    }
    ).select(["Name", "year", "month"]).show()

+------+----+-----+
|  Name|year|month|
+------+----+-----+
| Anita|2021|    5|
|   Raj|2020|    3|
|Simran|2022|    7|
| Aamir|2019|   11|
| Nisha|2023|    1|
+------+----+-----+



In [18]:
# 5.2 Mask employee names using regex.
df_employees.select("Name").withColumn("NameMask",
    F.concat(
        F.substring("Name", 1, 1),
        F.regexp_replace(F.substring("Name", 2, 50), ".", "*")
    )
).show()

+------+--------+
|  Name|NameMask|
+------+--------+
| Anita|   A****|
|   Raj|     R**|
|Simran|  S*****|
| Aamir|   A****|
| Nisha|   N****|
+------+--------+



In [19]:
# 5.3 Use substring() to create EmpCode like "EMP001".
df_employees.withColumn(
    "EmpCode",
    F.concat(
        F.lit("Emp"),
        F.format_string("%03d", F.col("EmpID"))
    )
).show()

+-----+------+-----------+----------+------+---------+-----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| Emp001|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| Emp002|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| Emp003|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| Emp004|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| Emp005|
+-----+------+-----------+----------+------+---------+-----------+-------+



# **Conditional & Null Handling**

In [20]:
# 6.1 Use when/otherwise to label performance
df_bonous.withColumn("PerformanceLabel", F.when(df_bonous.Bonus > 6000, "High").when((df_bonous.Bonus >= 4000) & (df_bonous.Bonus <= 6000), "Medium").otherwise("Low")).show()

+-----+-----+----+----------------+
|Bonus|EmpID|Year|PerformanceLabel|
+-----+-----+----+----------------+
| 5000|    1|2023|          Medium|
| 7000|    2|2023|            High|
| 6500|    3|2023|            High|
| 6000|    4|2023|          Medium|
| 4000|    5|2023|          Medium|
+-----+-----+----+----------------+



In [21]:
# 6.2 Handle missing ManagerID using fillna("No Manager")
df_employees = df_employees.withColumn("ManagerID", F.col("ManagerID").cast("string"))
df_employees.fillna("No Manager", subset=["ManagerID"]).show()

+-----+------+-----------+----------+------+----------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary| ManagerID|TenureYears|
+-----+------+-----------+----------+------+----------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|No Manager|       4.11|
|    2|   Raj|Engineering|2020-03-15| 80000|         1|       5.24|
|    3|Simran|Engineering|2022-07-10| 75000|         1|       2.92|
|    4| Aamir|  Marketing|2019-11-20| 60000|         1|       5.56|
|    5| Nisha|         HR|2023-01-05| 50000|         1|       2.43|
+-----+------+-----------+----------+------+----------+-----------+



# **Spark SQL**

In [22]:
# 7.1 Create and use database hr .
spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.sql("USE hr")

DataFrame[]

In [23]:
# 7.2 Save all DataFrames as tables: employees , attendance , bonuses .
df_employees.write.saveAsTable("hr.employees")
df_attendence.write.saveAsTable("hr.attendance")
df_bonous.write.saveAsTable("hr.bonous")

In [24]:
# 7.3.1 Top paid employee in each department.
spark.sql("""
  SELECT e.Department, e.Name, e.Salary
  FROM employees e
  JOIN (
      SELECT Department, MAX(Salary) AS MaxSal
      FROM employees
      GROUP BY Department
  ) m ON e.Department = m.Department AND e.Salary = m.MaxSal
""").show()

+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|         HR|Anita| 55000|
|Engineering|  Raj| 80000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+



In [25]:
# 7.3.2 Attendance rate by department.
att_dept = spark.sql("""
  SELECT e.Department, (SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END) / COUNT(*)) * 100 AS AttendanceRate FROM employees e
  INNER JOIN attendance a
  ON a.EmpID = e.EmpID
  GROUP BY Department
""")
att_dept.show()

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|          75.0|
|         HR|         100.0|
|  Marketing|           0.0|
+-----------+--------------+



In [26]:
# 7.3.3 Employees joined after 2021 with salary > 70,000.
spark.sql("""
  SELECT * FROM employees
  WHERE YEAR(JoinDate) > 2021 AND Salary > 70000
""").show()

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|
+-----+------+-----------+----------+------+---------+-----------+



# **Advanced**

In [27]:
# 8.1 Use a UDF to classify department as "Tech" vs "Non-Tech".
def classifier(dept):
  nonTechDepts = ["Marketing", "HR"]
  if dept in nonTechDepts:
    return "Non-Tech"
  else:
    return "Tech"

spark.udf.register("classifier", classifier)

spark.sql("""
  SELECT Department, classifier(Department) AS DeptCategory FROM employees
""").show()

+-----------+------------+
| Department|DeptCategory|
+-----------+------------+
|         HR|    Non-Tech|
|Engineering|        Tech|
|Engineering|        Tech|
|  Marketing|    Non-Tech|
|         HR|    Non-Tech|
+-----------+------------+



In [28]:
# 8.2 Create a view emp_attendance_summary .
att_summary.createOrReplaceTempView("emp_attendance_summary")
emp_attendance_summary = spark.sql("SELECT * FROM emp_attendance_summary")
emp_attendance_summary.show()

+-----+---------+-----------+-----------+-----------+
|EmpID|TotalDays|PresentDays|Attendance%| Department|
+-----+---------+-----------+-----------+-----------+
|    1|        2|          2|      100.0|         HR|
|    3|        2|          2|      100.0|Engineering|
|    5|        2|          2|      100.0|         HR|
|    4|        2|          0|        0.0|  Marketing|
|    2|        2|          1|       50.0|Engineering|
+-----+---------+-----------+-----------+-----------+



In [29]:
# 8.3 Save it as Parquet partitioned by Department .
emp_attendance_summary.write.mode("overwrite").parquet(path=r"/content", partitionBy="Department")