# Week 4: Attendance and Task Productivity ETL (PySpark)

In [1]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, count, round, to_timestamp
spark = SparkSession.builder.appName("Week4_ETL").getOrCreate()




In [2]:
# Interactive step: ask the user to upload a file through Colab.
# In the recorded run this saved attendance.csv into the working directory,
# which is where the later spark.read.csv("attendance.csv", ...) call reads it.
from google.colab import files
# `uploaded` is not referenced again in the visible cells.
uploaded = files.upload()


Saving attendance.csv to attendance.csv


In [3]:
# Second interactive upload; in the recorded run this saved tasks.csv into the
# working directory for the later spark.read.csv("tasks.csv", ...) call.
from google.colab import files
# Rebinds `uploaded`, replacing the value returned by the previous upload cell.
uploaded = files.upload()


Saving tasks.csv to tasks.csv


In [4]:
# Both files are read the same way: first row is the header and column types
# are inferred from the data.
csv_options = dict(header=True, inferSchema=True)

attendance_df = spark.read.csv("attendance.csv", **csv_options)
tasks_df = spark.read.csv("tasks.csv", **csv_options)

# Preview each frame (attendance first, then tasks).
for loaded_frame in (attendance_df, tasks_df):
    loaded_frame.show()


+-----------+-----+----------+-------------------+-------------------+
|employee_id| name|department|           clock_in|          clock_out|
+-----------+-----+----------+-------------------+-------------------+
|          1| Arun|        HR|2025-07-21 09:00:00|2025-07-21 17:30:00|
|          2|Divya|        HR|2025-07-21 10:00:00|2025-07-21 16:30:00|
|          3| Ravi|   Finance|2025-07-21 09:15:00|2025-07-21 17:00:00|
|          4|Manoj|   Finance|2025-07-21 09:30:00|2025-07-21 17:15:00|
|          5|Sneha|   Finance|2025-07-21 09:10:00|2025-07-21 17:00:00|
|          6|Kiran|        IT|2025-07-21 09:05:00|2025-07-21 17:20:00|
|          7|Priya|        IT|2025-07-21 10:10:00|2025-07-21 16:40:00|
|          8| Anil|        IT|2025-07-21 09:20:00|2025-07-21 17:05:00|
|          9|Deepa|        IT|2025-07-21 09:40:00|2025-07-21 16:50:00|
|         10| Ajay|        HR|2025-07-21 09:00:00|2025-07-21 17:30:00|
|          1| Arun|        HR|2025-07-22 09:00:00|2025-07-22 17:30:00|
|     

In [5]:
# Column expressions are only resolved when passed to withColumn, so both of
# these see clock_in / clock_out after the to_timestamp conversion below.

# Casting a timestamp to long gives seconds since the epoch; the difference is
# the shift length in seconds.
seconds_worked = col("clock_out").cast("long") - col("clock_in").cast("long")

# substr(12, 5) picks the "HH:mm" part out of the "yyyy-MM-dd HH:mm:ss" text
# form of the timestamp, then compares it as a string: anything after "09:30"
# counts as late, while 09:30 itself does not.
# NOTE(review): this relies on the timestamp's string rendering - confirm this
# is intended rather than an explicit time-of-day comparison.
late_flag = when(col("clock_in").substr(12, 5) > "09:30", 1).otherwise(0)

attendance_df = (
    attendance_df
    .withColumn("clock_in", to_timestamp("clock_in"))
    .withColumn("clock_out", to_timestamp("clock_out"))
    .withColumn("work_hours", seconds_worked / 3600)  # seconds -> hours
    .withColumn("is_late", late_flag)
)


In [6]:
# Number of completed tasks per employee. count("task_description") counts
# only rows where task_description is non-null.
is_completed = col("status") == "Completed"
completed_tasks = (
    tasks_df
    .filter(is_completed)
    .groupBy("employee_id")
    .agg(count("task_description").alias("tasks_completed"))
)


In [7]:
# Left join keeps every attendance row; employees with no completed tasks come
# back with a null count, which is replaced by 0.
# NOTE(review): completed_tasks holds one total per employee_id, while
# attendance_df has one row per employee per day, so the same total is repeated
# on every day and divided by that single day's hours below - confirm this is
# the intended definition of productivity.
attendance_with_tasks = (
    attendance_df
    .join(completed_tasks, "employee_id", "left")
    .fillna(0, subset=["tasks_completed"])
)

# Tasks per hour worked; rows whose work_hours is not positive (or is null)
# get 0 instead of dividing.
tasks_per_hour = col("tasks_completed") / col("work_hours")
final_df = attendance_with_tasks.withColumn(
    "productivity",
    when(col("work_hours") > 0, tasks_per_hour).otherwise(0),
)

final_df.show()


+-----------+-----+----------+-------------------+-------------------+-----------------+-------+---------------+-------------------+
|employee_id| name|department|           clock_in|          clock_out|       work_hours|is_late|tasks_completed|       productivity|
+-----------+-----+----------+-------------------+-------------------+-----------------+-------+---------------+-------------------+
|          1| Arun|        HR|2025-07-21 09:00:00|2025-07-21 17:30:00|              8.5|      0|              6| 0.7058823529411765|
|          2|Divya|        HR|2025-07-21 10:00:00|2025-07-21 16:30:00|              6.5|      1|              3|0.46153846153846156|
|          3| Ravi|   Finance|2025-07-21 09:15:00|2025-07-21 17:00:00|             7.75|      0|              6| 0.7741935483870968|
|          4|Manoj|   Finance|2025-07-21 09:30:00|2025-07-21 17:15:00|             7.75|      0|              6| 0.7741935483870968|
|          5|Sneha|   Finance|2025-07-21 09:10:00|2025-07-21 17:00:00

In [8]:
# (source column, output alias) pairs: each source column is averaged per
# department and rounded to 2 decimals. `round` is the Spark function imported
# in the first cell, not Python's built-in.
summary_metrics = [
    ("work_hours", "avg_hours"),
    ("is_late", "late_rate"),
    ("tasks_completed", "avg_tasks"),
    ("productivity", "avg_productivity"),
]

dept_summary = final_df.groupBy("department").agg(
    *[round(avg(source), 2).alias(alias) for source, alias in summary_metrics]
)

dept_summary.show()


+----------+---------+---------+---------+----------------+
|department|avg_hours|late_rate|avg_tasks|avg_productivity|
+----------+---------+---------+---------+----------------+
|        HR|      7.9|     0.29|     5.14|            0.64|
|   Finance|     7.66|     0.12|     5.29|            0.69|
|        IT|     7.34|      0.5|     5.33|            0.73|
+----------+---------+---------+---------+----------------+



In [9]:

import shutil
from google.colab import files

summary_path = "department_summary.csv"

# rmtree removes a directory tree; with ignore_errors=True it does nothing when
# the path is missing or is a regular file.
# Presumably this clears a directory of that name left by an earlier Spark CSV
# write - confirm.
shutil.rmtree(summary_path, ignore_errors=True)

# Collect the summary to pandas on the driver and write a single CSV file
# without the index column.
dept_summary.toPandas().to_csv(summary_path, index=False)

# Trigger a browser download of the file from Colab.
files.download(summary_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>