In [1]:
from pyspark.sql import SparkSession

# Attach to an already-running Spark session if there is one; otherwise start a
# new one registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("EmployeeAttendanceAnalysis")
    .getOrCreate()
)



In [2]:
# Colab-only step: open the browser file picker so attendance.csv can be uploaded
# into the runtime, where the later spark.read.csv call looks for it by relative path.
from google.colab import files
uploaded = files.upload()  # return value is kept but not used by the cells shown here

Saving attendance.csv to attendance.csv


In [7]:
from pyspark.sql.functions import col, to_timestamp, hour, avg, minute

# Read the uploaded CSV: the first row supplies column names, and Spark infers
# the column types from the data.
df = spark.read.csv("attendance.csv", header=True, inferSchema=True)

# Make sure both clock columns are timestamps before doing arithmetic on them.
df = df.withColumn("clock_in", to_timestamp("clock_in"))
df = df.withColumn("clock_out", to_timestamp("clock_out"))

# Casting a timestamp to long yields epoch seconds, so the difference is the
# shift length in seconds; dividing by 3600 expresses it in (fractional) hours.
shift_seconds = col("clock_out").cast("long") - col("clock_in").cast("long")
df = df.withColumn("work_hours", shift_seconds / 3600)
df.show()


+-----------+-----+----------+-------------------+-------------------+----------+
|employee_id| name|department|           clock_in|          clock_out|work_hours|
+-----------+-----+----------+-------------------+-------------------+----------+
|          1| Arun|        HR|2025-07-24 09:00:00|2025-07-24 17:30:00|       8.5|
|          2|Divya|   Finance|2025-07-24 10:00:00|2025-07-24 16:30:00|       6.5|
|          3| Ravi|        HR|2025-07-24 09:15:00|2025-07-24 17:00:00|      7.75|
|          1| Arun|        HR|2025-07-25 09:00:00|2025-07-25 17:45:00|      8.75|
|          2|Divya|   Finance|2025-07-25 10:00:00|2025-07-25 16:00:00|       6.0|
|          3| Ravi|        HR|2025-07-25 09:30:00|2025-07-25 16:30:00|       7.0|
+-----------+-----+----------+-------------------+-------------------+----------+



In [8]:
# Split the clock-in timestamp into hour and minute parts; both columns stay on
# df because the department summary reuses them.
df = df.withColumn("login_hour", hour("clock_in"))
df = df.withColumn("login_minute", minute("clock_in"))

# A login is late when it is strictly after 09:30, judged at minute resolution
# (seconds are not considered).
in_later_hour = col("login_hour") > 9
past_half_nine = (col("login_hour") == 9) & (col("login_minute") > 30)
late_logins = df.filter(in_later_hour | past_half_nine)
late_logins.select("employee_id", "name", "clock_in").show()


+-----------+-----+-------------------+
|employee_id| name|           clock_in|
+-----------+-----+-------------------+
|          2|Divya|2025-07-24 10:00:00|
|          2|Divya|2025-07-25 10:00:00|
+-----------+-----+-------------------+



In [15]:
from pyspark.sql.functions import when, count

# Flag each attendance log as late (1) or on time (0). "Late" means a clock-in
# strictly after 09:30 at minute resolution, the same rule used to list late
# logins. Rows where the condition evaluates to null (e.g. a missing clock_in)
# fall through to otherwise(0) and are therefore counted as on time.
arrived_late = (col("login_hour") > 9) | (
    (col("login_hour") == 9) & (col("login_minute") > 30)
)
df = df.withColumn("is_late", when(arrived_late, 1).otherwise(0))

# work_hours is already present on df from the load step, so it is not
# recomputed here; the previous duplicate withColumn produced an identical column.

# Per-department roll-up:
#   total_logs      - number of rows with a non-null employee_id
#   avg_work_hours  - mean shift length in hours
#   late_login_rate - fraction of logs flagged late (mean of the 0/1 flag)
dept_summary = df.groupBy("department").agg(
    count("employee_id").alias("total_logs"),
    avg("work_hours").alias("avg_work_hours"),
    avg("is_late").alias("late_login_rate"),
)
print("\nAttendance Issues by Department:")
dept_summary.show(truncate=False)


Attendance Issues by Department:
+----------+----------+--------------+---------------+
|department|total_logs|avg_work_hours|late_login_rate|
+----------+----------+--------------+---------------+
|HR        |4         |8.0           |0.0            |
|Finance   |2         |6.25          |1.0            |
+----------+----------+--------------+---------------+

