In [1]:
from pyspark.sql import SparkSession
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("EmployeeAttendanceAnalysis")
    .getOrCreate()
)



In [4]:
# Colab-only step: open the browser upload dialog; the captured output shows the
# selected file being saved as attendance.csv.
from google.colab import files
# NOTE(review): `uploaded` is not referenced again in the visible cells. The
# loading cell reads "attendance.csv" by name, so the uploaded file presumably
# has to carry exactly that filename -- confirm before uploading another file.
uploaded = files.upload()

Saving attendance.csv to attendance.csv


In [5]:
from pyspark.sql.functions import col, to_timestamp, hour, avg,minute
# Load the attendance log (column names from the header row, types inferred),
# make sure both clock columns are timestamps, and derive each shift's length.
df = (
    spark.read.csv("attendance.csv", inferSchema=True, header=True)
    .withColumn("clock_in", to_timestamp(col("clock_in")))
    .withColumn("clock_out", to_timestamp(col("clock_out")))
    .withColumn(
        "work_hours",
        # Casting a timestamp to long yields epoch seconds, so the difference
        # is the shift length in seconds; dividing by 3600 converts it to hours.
        (col("clock_out").cast("long") - col("clock_in").cast("long")) / 3600,
    )
)
df.show()


+-----------+-----+----------+-------------------+-------------------+-----------------+
|employee_id| name|department|           clock_in|          clock_out|       work_hours|
+-----------+-----+----------+-------------------+-------------------+-----------------+
|          1| Arun|        HR|2025-07-21 09:00:00|2025-07-21 17:30:00|              8.5|
|          2|Divya|        HR|2025-07-21 10:00:00|2025-07-21 16:30:00|              6.5|
|          3| Ravi|   Finance|2025-07-21 09:15:00|2025-07-21 17:00:00|             7.75|
|          4|Manoj|   Finance|2025-07-21 09:30:00|2025-07-21 17:15:00|             7.75|
|          5|Sneha|   Finance|2025-07-21 09:10:00|2025-07-21 17:00:00|7.833333333333333|
|          6|Kiran|        IT|2025-07-21 09:05:00|2025-07-21 17:20:00|             8.25|
|          7|Priya|        IT|2025-07-21 10:10:00|2025-07-21 16:40:00|              6.5|
|          8| Anil|        IT|2025-07-21 09:20:00|2025-07-21 17:05:00|             7.75|
|          9|Deepa|  

In [6]:
# Split each clock-in time into hour and minute columns. They are stored on df
# because the department summary cell reads them again.
df = (
    df
    .withColumn("login_hour", hour(col("clock_in")))
    .withColumn("login_minute", minute(col("clock_in")))
)

# A login is late when it is strictly after 09:30. Comparing minutes since
# midnight is equivalent to "hour > 9, or hour == 9 and minute > 30".
late_logins = df.where(col("login_hour") * 60 + col("login_minute") > 9 * 60 + 30)

late_logins.select("employee_id", "name", "clock_in").show()


+-----------+-----+-------------------+
|employee_id| name|           clock_in|
+-----------+-----+-------------------+
|          2|Divya|2025-07-21 10:00:00|
|          7|Priya|2025-07-21 10:10:00|
|          9|Deepa|2025-07-21 09:40:00|
|          2|Divya|2025-07-22 10:00:00|
|          7|Priya|2025-07-22 10:00:00|
|          9|Deepa|2025-07-22 09:40:00|
|          4|Manoj|2025-07-23 09:40:00|
|          7|Priya|2025-07-23 09:50:00|
|          2|Divya|2025-07-24 10:00:00|
|          7|Priya|2025-07-24 10:10:00|
|          9|Deepa|2025-07-24 09:40:00|
|          4|Manoj|2025-07-25 09:40:00|
|          7|Priya|2025-07-25 10:10:00|
|          2|Divya|2025-07-26 10:00:00|
|          9|Deepa|2025-07-26 09:40:00|
+-----------+-----+-------------------+



In [7]:
from pyspark.sql.functions import when, count
# Flag every attendance log as late (1) or on time (0). "Late" means a clock-in
# strictly after 09:30, judged from the login_hour / login_minute columns that
# the late-login cell added to df.
df = df.withColumn(
    "is_late",
    when(
        (col("login_hour") > 9)
        | ((col("login_hour") == 9) & (col("login_minute") > 30)),
        1,
    ).otherwise(0),
)

# work_hours is already on df from the loading cell, so it is not recomputed
# here; the previous duplicate withColumn only replaced it with identical values.

# Per-department roll-up:
#   total_logs      - number of rows with a non-null employee_id
#   avg_work_hours  - mean shift length in hours
#   late_login_rate - mean of the 0/1 is_late flag, i.e. the fraction of late logins
dept_summary = df.groupBy("department").agg(
    count("employee_id").alias("total_logs"),
    avg("work_hours").alias("avg_work_hours"),
    avg("is_late").alias("late_login_rate"),
)
print("\nAttendance Issues by Department:")
dept_summary.show(truncate=False)


Attendance Issues by Department:
+----------+----------+-----------------+-------------------+
|department|total_logs|avg_work_hours   |late_login_rate    |
+----------+----------+-----------------+-------------------+
|HR        |14        |7.904761904761904|0.2857142857142857 |
|Finance   |17        |7.656862745098039|0.11764705882352941|
|IT        |18        |7.337962962962964|0.5                |
+----------+----------+-----------------+-------------------+

