In [1]:
# Open the Colab file-upload widget and keep the returned upload result in
# `uploaded`. The captured output below shows cleaned_attendance.csv being saved.
from google.colab import files
uploaded = files.upload()


Saving cleaned_attendance.csv to cleaned_attendance.csv


In [3]:
from pyspark.sql import SparkSession

# Start the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Week3_Attendence_Analysis')
    .getOrCreate()
)

# Read the uploaded attendance CSV: the first row is the header and Spark
# infers each column's type from the data.
attendence_df = spark.read.csv(
    "/content/cleaned_attendance.csv",
    header=True,
    inferSchema=True,
)

# Preview the first five rows, then print the inferred schema.
attendence_df.show(5)
attendence_df.printSchema()

+-----------+----------+-------------------+-------------------+-------------------+-------------------+-----------------+--------+---------------+---------------+-------------------+
|employee_id|      date|           clock_in|          clock_out|      clock_in_full|     clock_out_full|       work_hours|    name|department_name|tasks_completed| productivity_score|
+-----------+----------+-------------------+-------------------+-------------------+-------------------+-----------------+--------+---------------+---------------+-------------------+
|          1|2025-07-20|2025-08-01 09:00:00|2025-08-01 17:00:00|2025-07-20 09:00:00|2025-07-20 17:00:00|              8.0|Rishitha|             HR|              1|              0.125|
|          1|2025-07-21|2025-08-01 09:15:00|2025-08-01 17:05:00|2025-07-21 09:15:00|2025-07-21 17:05:00|7.833333333333333|Rishitha|             HR|              1|0.12765957446808512|
|          2|2025-07-21|2025-08-01 09:30:00|2025-08-01 16:50:00|2025-07-21 09:30

In [5]:
# NOTE: `round` imported here is pyspark.sql.functions.round; from this cell on
# it shadows Python's builtin round() for the rest of the notebook.
from pyspark.sql.functions import to_timestamp, col, round, unix_timestamp

# 1. Parse clock_in and clock_out with the "yyyy-MM-dd HH:mm:ss" pattern so
#    both columns are timestamps.
# 2. Recompute work_hours as decimal hours: the difference between the two
#    epoch-second values divided by 3600, rounded to two decimals. This
#    replaces the work_hours column that was loaded from the CSV.
attendence_df = (
    attendence_df
    .withColumn("clock_in", to_timestamp(col("clock_in"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("clock_out", to_timestamp(col("clock_out"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn(
        "work_hours",
        round((unix_timestamp("clock_out") - unix_timestamp("clock_in")) / 3600, 2),
    )
)

# Show the first five rows of the recomputed columns.
attendence_df.select("employee_id", "clock_in", "clock_out", "work_hours").show(5)

+-----------+-------------------+-------------------+----------+
|employee_id|           clock_in|          clock_out|work_hours|
+-----------+-------------------+-------------------+----------+
|          1|2025-08-01 09:00:00|2025-08-01 17:00:00|       8.0|
|          1|2025-08-01 09:15:00|2025-08-01 17:05:00|      7.83|
|          2|2025-08-01 09:30:00|2025-08-01 16:50:00|      7.33|
|          3|2025-08-01 10:00:00|2025-08-01 17:00:00|       7.0|
|          4|2025-08-01 09:00:00|2025-08-01 17:00:00|       8.0|
+-----------+-------------------+-------------------+----------+
only showing top 5 rows



In [7]:
#Filter for late logins
from pyspark.sql.functions import hour
from pyspark.sql.functions import date_format

# A login is late when its time of day is strictly after the cutoff.
# The previous test, hour(clock_in) > 10, only matched 11:00 or later and so
# skipped every login between 10:00:01 and 10:59:59. Formatting the timestamp
# as zero-padded 24-hour "HH:mm:ss" makes plain string comparison follow
# chronological order. Rows with a null clock_in are still excluded.
LATE_CUTOFF = "10:00:00"
late_logins_df = attendence_df.filter(
    date_format("clock_in", "HH:mm:ss") > LATE_CUTOFF
)

# Preview up to five late logins.
late_logins_df.select("employee_id", "clock_in", "department_name").show(5)

+-----------+--------+---------------+
|employee_id|clock_in|department_name|
+-----------+--------+---------------+
+-----------+--------+---------------+



In [9]:
# Print the column types, then the first ten employee_id / clock_in pairs.
# NOTE(review): presumably a check on why the late-login filter above returned
# no rows — confirm whether this cell is still needed.
attendence_df.printSchema()
attendence_df.select("employee_id", "clock_in").show(10)

root
 |-- employee_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- clock_in: timestamp (nullable = true)
 |-- clock_out: timestamp (nullable = true)
 |-- clock_in_full: timestamp (nullable = true)
 |-- clock_out_full: timestamp (nullable = true)
 |-- work_hours: double (nullable = true)
 |-- name: string (nullable = true)
 |-- department_name: string (nullable = true)
 |-- tasks_completed: integer (nullable = true)
 |-- productivity_score: double (nullable = true)

+-----------+-------------------+
|employee_id|           clock_in|
+-----------+-------------------+
|          1|2025-08-01 09:00:00|
|          1|2025-08-01 09:15:00|
|          2|2025-08-01 09:30:00|
|          3|2025-08-01 10:00:00|
|          4|2025-08-01 09:00:00|
|          5|2025-08-01 09:10:00|
|          6|2025-08-01 09:00:00|
|          7|2025-08-01 09:45:00|
|          8|2025-08-01 08:50:00|
|          1|2025-08-01 09:00:00|
+-----------+-------------------+
only showing top 10 rows



In [20]:
from pyspark.sql.functions import col

# An absentee row has neither a clock-in nor a clock-out timestamp.
absentees_df = attendence_df.where(
    col("clock_in").isNull() & col("clock_out").isNull()
)

# Preview up to five absentee rows.
absentees_df.select("employee_id", "name", "department_name").show(5)

# Finding: the result is empty, so the dataset contains no absentee rows.
# Every record in the analyzed period has clock-in and clock-out values.



+-----------+----+---------------+
|employee_id|name|department_name|
+-----------+----+---------------+
+-----------+----+---------------+



In [30]:
#Average work hours per department
from pyspark.sql.functions import avg

# Mean of work_hours for each department, rounded to two decimals.
# `round` is the pyspark.sql.functions.round imported in an earlier cell,
# not Python's builtin.
avg_work_hours_df = (
    attendence_df
    .groupBy("department_name")
    .agg(round(avg("work_hours"), 2).alias("avg_work_hours"))
)
avg_work_hours_df.show()

+---------------+--------------+
|department_name|avg_work_hours|
+---------------+--------------+
|          Sales|          7.17|
|             HR|           7.9|
|        Finance|          8.33|
|      Marketing|          8.33|
|             IT|          7.65|
+---------------+--------------+



In [24]:
#Late Logins Per Department
# Count the late-login rows per department and expose the count as late_logins.
late_counts_df = (
    late_logins_df
    .groupBy("department_name")
    .count()
    .withColumnRenamed("count", "late_logins")
)

late_counts_df.show()



+---------------+-----------+
|department_name|late_logins|
+---------------+-----------+
+---------------+-----------+



In [25]:
#Absentees Per Department
# Count the absentee rows per department and expose the count as absentees.
absent_counts_df = (
    absentees_df
    .groupBy("department_name")
    .count()
    .withColumnRenamed("count", "absentees")
)

absent_counts_df.show()


+---------------+---------+
|department_name|absentees|
+---------------+---------+
+---------------+---------+



In [31]:
#Join All Results Together
# Start from the per-department averages and left-join the two count tables,
# so every department with an average stays in the summary even when it has
# no late logins or absentees.
# The averages frame is named avg_work_hours_df where it is created; the old
# name avg_hours_df is never defined and raises NameError on a fresh kernel.
dept_summary_df = (
    avg_work_hours_df
    .join(late_counts_df, on="department_name", how="left")
    .join(absent_counts_df, on="department_name", how="left")
)

# Departments missing from a count table come back as null after the left
# joins; treat those as zero. Only the two count columns are filled, so a
# null avg_work_hours is not turned into 0.
dept_summary_df = dept_summary_df.fillna(0, subset=["late_logins", "absentees"])

dept_summary_df.show()


+---------------+--------------+-----------+---------+
|department_name|avg_work_hours|late_logins|absentees|
+---------------+--------------+-----------+---------+
|          Sales|          7.17|          0|        0|
|             HR|           7.9|          0|        0|
|        Finance|          8.33|          0|        0|
|      Marketing|          8.33|          0|        0|
|             IT|          7.65|          0|        0|
+---------------+--------------+-----------+---------+



In [33]:
# Collect the per-department summary to the driver as a pandas DataFrame and
# write it to department_summary.csv without the pandas index column.
(
    dept_summary_df
    .toPandas()
    .to_csv("department_summary.csv", index=False)
)
