In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, count, to_timestamp, hour, minute

In [2]:
# Start the Spark session used by every cell below (an existing session is reused)
spark = (
    SparkSession.builder
    .appName("AttendanceAnalysis")
    .getOrCreate()
)

In [3]:
# Google Colab only: prompt for a file upload so attendance_logs.csv lands in the
# runtime's working directory, where the read cell below looks for it by name.
# NOTE(review): `uploaded` is not referenced by any later cell visible here.
from google.colab import files
uploaded = files.upload()

Saving attendance_logs.csv to attendance_logs.csv


In [6]:
# Load the attendance log: the first row holds column names, and Spark infers column types
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("attendance_logs.csv")
)

In [7]:
from pyspark.sql.functions import to_timestamp, col

# Normalise both clock columns to timestamps using the same explicit pattern,
# then display the frame to confirm the parse (missing values stay NULL).
df = (
    df
    .withColumn("clockin", to_timestamp(col("clockin"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("clockout", to_timestamp(col("clockout"), "yyyy-MM-dd HH:mm:ss"))
)
df.show()

+----------+------+----------+-------------------+-------------------+--------------+
|employeeid|  name|department|            clockin|           clockout|taskscompleted|
+----------+------+----------+-------------------+-------------------+--------------+
|         1|Nithya|        IT|2025-06-01 09:05:00|2025-06-01 17:30:00|             5|
|         2| Vivek|        HR|2025-06-01 09:30:00|2025-06-01 18:00:00|             3|
|         3| Deepa|  Accounts|2025-06-01 08:50:00|2025-06-01 17:00:00|             2|
|         4|  Arun|        IT|               NULL|2025-06-01 17:15:00|             4|
|         5| Priya|        HR|2025-06-01 09:20:00|               NULL|             3|
+----------+------+----------+-------------------+-------------------+--------------+



In [8]:
# Work hours = seconds elapsed between clock-in and clock-out, divided by 3600.
# Casting a timestamp to long yields epoch seconds; a missing clock-in or
# clock-out makes the difference (and therefore workhours) NULL.
clockin_epoch = col("clockin").cast("long")
clockout_epoch = col("clockout").cast("long")
df = df.withColumn("workhours", (clockout_epoch - clockin_epoch) / 3600)
df.show()

+----------+------+----------+-------------------+-------------------+--------------+-----------------+
|employeeid|  name|department|            clockin|           clockout|taskscompleted|        workhours|
+----------+------+----------+-------------------+-------------------+--------------+-----------------+
|         1|Nithya|        IT|2025-06-01 09:05:00|2025-06-01 17:30:00|             5|8.416666666666666|
|         2| Vivek|        HR|2025-06-01 09:30:00|2025-06-01 18:00:00|             3|              8.5|
|         3| Deepa|  Accounts|2025-06-01 08:50:00|2025-06-01 17:00:00|             2|8.166666666666666|
|         4|  Arun|        IT|               NULL|2025-06-01 17:15:00|             4|             NULL|
|         5| Priya|        HR|2025-06-01 09:20:00|               NULL|             3|             NULL|
+----------+------+----------+-------------------+-------------------+--------------+-----------------+



In [9]:
# Flag late logins: the clock-in's minute-of-day must be past 09:15, i.e. 09:16 onward
# (seconds are ignored). A missing clock-in compares as NULL and falls through to 0.
clockin_minute_of_day = hour(col("clockin")) * 60 + minute(col("clockin"))
df = df.withColumn(
    "late_login",
    when(clockin_minute_of_day > 9 * 60 + 15, 1).otherwise(0),
)
df.show()

+----------+------+----------+-------------------+-------------------+--------------+-----------------+----------+
|employeeid|  name|department|            clockin|           clockout|taskscompleted|        workhours|late_login|
+----------+------+----------+-------------------+-------------------+--------------+-----------------+----------+
|         1|Nithya|        IT|2025-06-01 09:05:00|2025-06-01 17:30:00|             5|8.416666666666666|         0|
|         2| Vivek|        HR|2025-06-01 09:30:00|2025-06-01 18:00:00|             3|              8.5|         1|
|         3| Deepa|  Accounts|2025-06-01 08:50:00|2025-06-01 17:00:00|             2|8.166666666666666|         0|
|         4|  Arun|        IT|               NULL|2025-06-01 17:15:00|             4|             NULL|         0|
|         5| Priya|        HR|2025-06-01 09:20:00|               NULL|             3|             NULL|         1|
+----------+------+----------+-------------------+-------------------+----------

In [10]:
# A row counts as present (0) only when both clock-in and clock-out were recorded;
# anything else is marked absent (1).
has_both_punches = col("clockin").isNotNull() & col("clockout").isNotNull()
df = df.withColumn("absent", when(has_both_punches, 0).otherwise(1))
df.show()

+----------+------+----------+-------------------+-------------------+--------------+-----------------+----------+------+
|employeeid|  name|department|            clockin|           clockout|taskscompleted|        workhours|late_login|absent|
+----------+------+----------+-------------------+-------------------+--------------+-----------------+----------+------+
|         1|Nithya|        IT|2025-06-01 09:05:00|2025-06-01 17:30:00|             5|8.416666666666666|         0|     0|
|         2| Vivek|        HR|2025-06-01 09:30:00|2025-06-01 18:00:00|             3|              8.5|         1|     0|
|         3| Deepa|  Accounts|2025-06-01 08:50:00|2025-06-01 17:00:00|             2|8.166666666666666|         0|     0|
|         4|  Arun|        IT|               NULL|2025-06-01 17:15:00|             4|             NULL|         0|     1|
|         5| Priya|        HR|2025-06-01 09:20:00|               NULL|             3|             NULL|         1|     1|
+----------+------+-----

In [11]:
# Productivity score = tasks completed per hour worked.
# The division is guarded so that a zero-length shift (clockin == clockout, hence
# workhours == 0) yields NULL instead of raising DIVIDE_BY_ZERO when Spark runs
# in ANSI mode. Rows whose workhours is NULL (missing clock-in or clock-out)
# also stay NULL, exactly as an unguarded division would produce.
df = df.withColumn(
    "productivity_score",
    when(col("workhours") != 0, col("taskscompleted") / col("workhours")),
)
df.show()

+----------+------+----------+-------------------+-------------------+--------------+-----------------+----------+------+-------------------+
|employeeid|  name|department|            clockin|           clockout|taskscompleted|        workhours|late_login|absent| productivity_score|
+----------+------+----------+-------------------+-------------------+--------------+-----------------+----------+------+-------------------+
|         1|Nithya|        IT|2025-06-01 09:05:00|2025-06-01 17:30:00|             5|8.416666666666666|         0|     0| 0.5940594059405941|
|         2| Vivek|        HR|2025-06-01 09:30:00|2025-06-01 18:00:00|             3|              8.5|         1|     0|0.35294117647058826|
|         3| Deepa|  Accounts|2025-06-01 08:50:00|2025-06-01 17:00:00|             2|8.166666666666666|         0|     0| 0.2448979591836735|
|         4|  Arun|        IT|               NULL|2025-06-01 17:15:00|             4|             NULL|         0|     1|               NULL|
|     

In [12]:
# Per-department roll-up: mean hours, mean productivity, and how many rows were
# flagged late or absent. `when` without an otherwise-branch yields NULL for
# unflagged rows, and `count` skips NULLs, so only flagged rows are counted.
late_flag = when(col("late_login") == 1, True)
absent_flag = when(col("absent") == 1, True)

department_summary = (
    df.groupBy("department")
    .agg(
        avg("workhours").alias("avg_workhours"),
        avg("productivity_score").alias("avg_productivity_score"),
        count(late_flag).alias("late_login_count"),
        count(absent_flag).alias("absence_count"),
    )
)

In [13]:
# Print the per-department summary table (defaults spelled out: up to 20 rows, long cells truncated)
department_summary.show(n=20, truncate=True)

+----------+-----------------+----------------------+----------------+-------------+
|department|    avg_workhours|avg_productivity_score|late_login_count|absence_count|
+----------+-----------------+----------------------+----------------+-------------+
|        HR|              8.5|   0.35294117647058826|               2|            1|
|  Accounts|8.166666666666666|    0.2448979591836735|               0|            0|
|        IT|8.416666666666666|    0.5940594059405941|               0|            1|
+----------+-----------------+----------------------+----------------+-------------+



In [14]:
# Persist the summary for reporting: collapse to a single partition so one CSV
# part file is written, include a header row, and replace any previous output.
(
    department_summary
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("attendance_department_summary")
)

# Release the Spark session now that the report has been written
spark.stop()