#Week-4

In [5]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, count, round, to_timestamp
spark = SparkSession.builder.appName("Week4_ETL").getOrCreate()




In [2]:
from google.colab import files
uploaded = files.upload()


Saving attendance.csv to attendance.csv


In [3]:
from google.colab import files
uploaded = files.upload()


Saving tasks.csv to tasks.csv


In [6]:
attendance_df = spark.read.csv("attendance.csv", header=True, inferSchema=True)
tasks_df = spark.read.csv("tasks.csv", header=True, inferSchema=True)

attendance_df.show()
tasks_df.show()


+-----------+-----+----------+-------------------+-------------------+
|employee_id| name|department|           clock_in|          clock_out|
+-----------+-----+----------+-------------------+-------------------+
|          1| Arun|        HR|2025-07-24 09:00:00|2025-07-24 17:30:00|
|          2|Divya|   Finance|2025-07-24 10:00:00|2025-07-24 16:30:00|
|          3| Ravi|        HR|2025-07-24 09:15:00|2025-07-24 17:00:00|
|          1| Arun|        HR|2025-07-25 09:00:00|2025-07-25 17:45:00|
|          2|Divya|   Finance|2025-07-25 10:00:00|2025-07-25 16:00:00|
|          3| Ravi|        HR|2025-07-25 09:30:00|2025-07-25 16:30:00|
+-----------+-----+----------+-------------------+-------------------+

+-----------+--------------------+----------+---------+
|employee_id|    task_description| task_date|   status|
+-----------+--------------------+----------+---------+
|          1|Updated HR documents|2025-07-24|Completed|
|          2|Handled recruitme...|2025-07-24|Completed|
|        

In [7]:
attendance_df = attendance_df.withColumn("clock_in", to_timestamp("clock_in")) \
                             .withColumn("clock_out", to_timestamp("clock_out"))

attendance_df = attendance_df.withColumn(
    "work_hours",
    (col("clock_out").cast("long") - col("clock_in").cast("long"))/3600
)

attendance_df = attendance_df.withColumn(
    "is_late",
    when(col("clock_in").substr(12,5) > "09:30", 1).otherwise(0)
)


In [8]:
completed_tasks = tasks_df.filter(col("status")=="Completed") \
                          .groupBy("employee_id").agg(count("task_description").alias("tasks_completed"))


In [10]:
final_df = attendance_df.join(completed_tasks, "employee_id", "left") \
                        .withColumn("tasks_completed", when(col("tasks_completed").isNull(), 0).otherwise(col("tasks_completed")))

final_df = final_df.withColumn("productivity",
                               when(col("work_hours") > 0, col("tasks_completed")/col("work_hours")).otherwise(0))

final_df.show()


+-----------+-----+----------+-------------------+-------------------+----------+-------+---------------+-------------------+
|employee_id| name|department|           clock_in|          clock_out|work_hours|is_late|tasks_completed|       productivity|
+-----------+-----+----------+-------------------+-------------------+----------+-------+---------------+-------------------+
|          1| Arun|        HR|2025-07-24 09:00:00|2025-07-24 17:30:00|       8.5|      0|              2|0.23529411764705882|
|          2|Divya|   Finance|2025-07-24 10:00:00|2025-07-24 16:30:00|       6.5|      1|              1|0.15384615384615385|
|          3| Ravi|        HR|2025-07-24 09:15:00|2025-07-24 17:00:00|      7.75|      0|              2|0.25806451612903225|
|          1| Arun|        HR|2025-07-25 09:00:00|2025-07-25 17:45:00|      8.75|      0|              2|0.22857142857142856|
|          2|Divya|   Finance|2025-07-25 10:00:00|2025-07-25 16:00:00|       6.0|      1|              1|0.16666666666

In [11]:
dept_summary = final_df.groupBy("department").agg(
    round(avg("work_hours"),2).alias("avg_hours"),
    round(avg("is_late"),2).alias("late_rate"),
    round(avg("tasks_completed"),2).alias("avg_tasks"),
    round(avg("productivity"),2).alias("avg_productivity")
)

dept_summary.show()


+----------+---------+---------+---------+----------------+
|department|avg_hours|late_rate|avg_tasks|avg_productivity|
+----------+---------+---------+---------+----------------+
|        HR|      8.0|      0.0|      2.0|            0.25|
|   Finance|     6.25|      1.0|      1.0|            0.16|
+----------+---------+---------+---------+----------------+



In [15]:

import shutil
shutil.rmtree("department_summary.csv", ignore_errors=True)
dept_summary.toPandas().to_csv("department_summary.csv", index=False)

from google.colab import files
files.download("department_summary.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>