In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions

In [2]:
spark = SparkSession.builder.appName("DevOps").getOrCreate()
spark

In [3]:
from google.colab import files
uploaded = files.upload()

Saving attendance.csv to attendance.csv


In [4]:
# Load attendance data
attendance_df = spark.read.csv("attendance.csv", header=True, inferSchema=True)
attendance_df.show()

+-----------+-----+----------+-------------------+-------------------+
|employee_id| name|department|           clock_in|          clock_out|
+-----------+-----+----------+-------------------+-------------------+
|          1| Arun|        HR|2025-07-21 09:00:00|2025-07-21 17:30:00|
|          2|Divya|        HR|2025-07-21 10:00:00|2025-07-21 16:30:00|
|          3| Ravi|   Finance|2025-07-21 09:15:00|2025-07-21 17:00:00|
|          4|Manoj|   Finance|2025-07-21 09:30:00|2025-07-21 17:15:00|
|          5|Sneha|   Finance|2025-07-21 09:10:00|2025-07-21 17:00:00|
|          6|Kiran|        IT|2025-07-21 09:05:00|2025-07-21 17:20:00|
|          7|Priya|        IT|2025-07-21 10:10:00|2025-07-21 16:40:00|
|          8| Anil|        IT|2025-07-21 09:20:00|2025-07-21 17:05:00|
|          9|Deepa|        IT|2025-07-21 09:40:00|2025-07-21 16:50:00|
|         10| Ajay|        HR|2025-07-21 09:00:00|2025-07-21 17:30:00|
|          1| Arun|        HR|2025-07-22 09:00:00|2025-07-22 17:30:00|
|     

In [5]:
from pyspark.sql.functions import countDistinct, col, to_date

# Extract the date from the clock_in time
attendance_df_with_date = attendance_df.withColumn("attendance_date", to_date("clock_in"))

# Get all distinct dates
all_dates = attendance_df_with_date.select("attendance_date").distinct()

# Get all distinct employees
all_employees = attendance_df_with_date.select("employee_id", "name").distinct()

# Cross join employees and dates → all possible employee-date combinations
all_combinations = all_employees.crossJoin(all_dates)

# Find absent combinations (where employee did NOT attend)
absent_combinations = all_combinations.exceptAll(
    attendance_df_with_date.select("employee_id", "name", "attendance_date")
)

# Count absent days per employee
absenteeism_df = absent_combinations.groupBy("employee_id", "name") \
                                   .agg(countDistinct("attendance_date").alias("absent_days"))

# Get Top 5 Absentees
top5_absentees = absenteeism_df.orderBy(col("absent_days").desc()).limit(5)

top5_absentees.show()


+-----------+-----+-----------+
|employee_id| name|absent_days|
+-----------+-----+-----------+
|          9|Deepa|          2|
|          2|Divya|          2|
|          6|Kiran|          2|
|         10| Ajay|          2|
|          8| Anil|          1|
+-----------+-----+-----------+



In [6]:
# Find the top 5 absentees
top_absentees_df = absenteeism_df.orderBy(col("absent_days").desc()).limit(5)

# Display the top 5 absentees
top_absentees_df.show()

+-----------+-----+-----------+
|employee_id| name|absent_days|
+-----------+-----+-----------+
|          9|Deepa|          2|
|          2|Divya|          2|
|          6|Kiran|          2|
|         10| Ajay|          2|
|          8| Anil|          1|
+-----------+-----+-----------+



In [8]:
from google.colab import files
uploaded = files.upload()

Saving tasks.csv to tasks.csv


In [10]:
tasks_df = spark.read.csv("tasks.csv", header=True, inferSchema=True)
tasks_df.show()

+-----------+-----+----------+--------------------+----------+---------+
|employee_id| name|department|    task_description| task_date|   status|
+-----------+-----+----------+--------------------+----------+---------+
|          1| Arun|        HR|Updated HR documents|2025-07-21|Completed|
|          2|Divya|        HR|Handled recruitme...|2025-07-21|Completed|
|          3| Ravi|   Finance|Resolved employee...|2025-07-21|Completed|
|          4|Manoj|   Finance|Prepared monthly ...|2025-07-21|Completed|
|          5|Sneha|   Finance|Finalized salary ...|2025-07-21|  Pending|
|          6|Kiran|        IT|Audited leave rec...|2025-07-21|Completed|
|          7|Priya|        IT|Conducted onboard...|2025-07-21|Completed|
|          8| Anil|        IT|Reviewed payroll ...|2025-07-21|Completed|
|          9|Deepa|        IT|Prepared complian...|2025-07-21|  Pending|
|         10| Ajay|        HR|Verified expense ...|2025-07-21|Completed|
|          1| Arun|        HR|    Updated policies|

In [11]:
from pyspark.sql.functions import col, avg, when, count

# Calculate productivity = tasks_completed / work_hours
tasks_completed = tasks_df.filter(col("status")=="Completed") \
                          .groupBy("employee_id") \
                          .count() \
                          .withColumnRenamed("count", "tasks_completed")

attendance_df = attendance_df.withColumn("work_hours",
    (col("clock_out").cast("long") - col("clock_in").cast("long"))/3600)

# Join attendance + tasks + department info
final_df = attendance_df.join(tasks_completed, "employee_id", "left") \
                        .withColumn("tasks_completed", when(col("tasks_completed").isNull(), 0)
                        .otherwise(col("tasks_completed")))

# Calculate productivity for each employee
final_df = final_df.withColumn("productivity",
                               when(col("work_hours") > 0, col("tasks_completed")/col("work_hours"))
                               .otherwise(0))

# Aggregate department-wise
dept_summary = final_df.groupBy("department").agg(
    avg("productivity").alias("avg_productivity"),
    avg("work_hours").alias("avg_work_hours")
)

# Sort by lowest productivity
lowest_departments = dept_summary.orderBy(col("avg_productivity").asc())

lowest_departments.show()


+----------+------------------+-----------------+
|department|  avg_productivity|   avg_work_hours|
+----------+------------------+-----------------+
|        HR|0.6377135469572445|7.904761904761904|
|   Finance|0.6939461533361988|7.656862745098039|
|        IT|0.7293856243727982|7.337962962962964|
+----------+------------------+-----------------+



In [12]:
top5_absentees.coalesce(1) \
    .write.mode("overwrite").option("header", True) \
    .csv("/content/top5_absentees")
lowest_departments.coalesce(1) \
    .write.mode("overwrite").option("header", True) \
    .csv("/content/lowest_departments")

# Rename CSV files for easier downloading

import shutil, glob
abs_file = glob.glob("/content/top5_absentees/part-*.csv")[0]
shutil.move(abs_file, "/content/top5_absentees.csv")

low_file = glob.glob("/content/lowest_departments/part-*.csv")[0]
shutil.move(low_file, "/content/lowest_departments.csv")

# STEP 8: Download Final Reports

from google.colab import files
files.download("/content/top5_absentees.csv")
files.download("/content/lowest_departments.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>