<a href="https://colab.research.google.com/github/Rohitnik2266/Data_Warehouse_Training/blob/main/June%209/Pyspark_Assessment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("HR_Analytics").getOrCreate()

In [45]:
#1. Ingestion & Exploration
employees = spark.read.option("header", True).option("inferSchema", True).csv("employees.csv")
attendance = spark.read.option("header", True).option("inferSchema", True).csv("attendance.csv")
bonuses = spark.read.option("multiline", True).json("bonuses.json")


In [46]:
employees.printSchema()
employees.show()

attendance.printSchema()
attendance.show()

bonuses.printSchema()
bonuses.show()

root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
|    1|2024-04-02|Present|
|    2|2024-

In [47]:
employees.select("Department").distinct().count()

3

In [48]:
#2. DataFrame Operation

employees = employees.withColumn("TenureYears", round(datediff(current_date(), col("JoinDate")) / 365, 2))


In [49]:
emp_bonus = employees.join(bonuses, "EmpID")
emp_bonus = emp_bonus.withColumn("TotalCompensation", col("Salary") + col("Bonus"))

In [50]:
emp_bonus.filter(col("TenureYears") > 2).show()


+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+



In [51]:
employees.filter(col("ManagerID").isNotNull()).show()


+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|
+-----+------+-----------+----------+------+---------+-----------+



In [52]:
#3. Aggregation

employees.groupBy("Department").agg(avg("Salary").alias("AvgSalary")).show()


+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|         HR|  52500.0|
|  Marketing|  60000.0|
+-----------+---------+



In [53]:
employees.groupBy("ManagerID").count().alias("NumEmployees").show()


+---------+-----+
|ManagerID|count|
+---------+-----+
|     NULL|    1|
|        1|    4|
+---------+-----+



In [54]:
attendance.filter(col("Status") == "Absent").groupBy("EmpID").count().withColumnRenamed("count", "AbsenceCount").show()

+-----+------------+
|EmpID|AbsenceCount|
+-----+------------+
|    4|           2|
|    2|           1|
+-----+------------+



In [55]:
#4. Joins

att_summary = attendance.groupBy("EmpID").pivot("Status").count().fillna(0)
att_summary = att_summary.withColumn("TotalDays", col("Present") + col("Absent"))
att_summary = att_summary.withColumn("AttendancePercent", round(col("Present") / col("TotalDays") * 100, 2))

In [56]:
emp_attendance = employees.join(att_summary, "EmpID", "left")
emp_attendance.select("EmpID", "Name", "AttendancePercent").show()

+-----+------+-----------------+
|EmpID|  Name|AttendancePercent|
+-----+------+-----------------+
|    1| Anita|            100.0|
|    2|   Raj|             50.0|
|    3|Simran|            100.0|
|    4| Aamir|              0.0|
|    5| Nisha|            100.0|
+-----+------+-----------------+



In [57]:
top_comp = emp_bonus.orderBy(col("TotalCompensation").desc()).limit(3)
top_comp.select("EmpID", "Name", "TotalCompensation").show()

+-----+------+-----------------+
|EmpID|  Name|TotalCompensation|
+-----+------+-----------------+
|    2|   Raj|            87000|
|    3|Simran|            81500|
|    4| Aamir|            66000|
+-----+------+-----------------+



In [58]:
multi_join = employees.join(bonuses, "EmpID").join(att_summary, "EmpID")
multi_join.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+------+-------+---------+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|Absent|Present|TotalDays|AttendancePercent|
+-----+------+-----------+----------+------+---------+-----------+-----+----+------+-------+---------+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|     0|      2|        2|            100.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|     1|      1|        2|             50.0|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|     0|      2|        2|            100.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|     2|      0|        2|              0.0|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|     0|      2|        2|            100.0|
+-----+------+-----------+------

In [59]:
#String & Date Functions

employees = employees.withColumn("JoinYear", year("JoinDate")).withColumn("JoinMonth", month("JoinDate"))

In [60]:
employees = employees.withColumn("MaskedName", regexp_replace("Name", "(?<=.).", "*"))

In [61]:
employees = employees.withColumn("EmpCode", concat(lit("EMP"), lpad(col("EmpID"), 3, "0")))
employees.select("EmpID", "Name", "EmpCode").show()

+-----+------+-------+
|EmpID|  Name|EmpCode|
+-----+------+-------+
|    1| Anita| EMP001|
|    2|   Raj| EMP002|
|    3|Simran| EMP003|
|    4| Aamir| EMP004|
|    5| Nisha| EMP005|
+-----+------+-------+



In [62]:
#Conditional & Null handling

bonuses = bonuses.withColumn("Performance",
    when(col("Bonus") > 6000, "High")
    .when((col("Bonus") >= 4000) & (col("Bonus") <= 6000), "Medium")
    .otherwise("Low")
)

In [63]:
employees = employees.fillna({"ManagerID": "No Manager"})


In [64]:
#7. Spark SQL

spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.catalog.setCurrentDatabase("hr")

In [65]:
employees.write.mode("overwrite").saveAsTable("employees")
attendance.write.mode("overwrite").saveAsTable("attendance")
bonuses.write.mode("overwrite").saveAsTable("bonuses")

In [66]:
spark.sql("""
SELECT Department, Name, Salary
FROM employees
WHERE Salary = (SELECT MAX(Salary) FROM employees e2 WHERE e2.Department = employees.Department)
""").show()

spark.sql("""
SELECT e.Department,
       ROUND(100 * SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END)/COUNT(*), 2) AS AttendanceRate
FROM employees e JOIN attendance a ON e.EmpID = a.EmpID
GROUP BY e.Department
""").show()

spark.sql("""
SELECT * FROM employees
WHERE year(JoinDate) > 2021 AND Salary > 70000
""").show()

+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|         HR|Anita| 55000|
|Engineering|  Raj| 80000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|          75.0|
|         HR|         100.0|
|  Marketing|           0.0|
+-----------+--------------+

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|    2022|        7|    S*****| EMP003|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+



In [67]:
#8. Advance (optional)

from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

def classify_dept(dept):
    return "Tech" if dept in ["Engineering"] else "Non-Tech"

In [68]:
dept_udf = udf(classify_dept, StringType())
employees = employees.withColumn("DeptCategory", dept_udf("Department"))

In [69]:
emp_attendance_summary = emp_attendance.select("EmpID", "Name", "Department", "AttendancePercent")
emp_attendance_summary.createOrReplaceTempView("emp_attendance_summary")

In [70]:
emp_attendance_summary.write.mode("overwrite").partitionBy("Department").parquet("emp_attendance_summary.parquet")