LOAD DATA

In [0]:
# Both inputs are CSV files with a header row; Spark infers the column types.
csv_options = {"header": True, "inferSchema": True}

# Per-employee clock-in / clock-out log.
attendance_df = spark.read.csv(
    "/Volumes/workspace/default/nithyashree/attendance_logs.csv",
    **csv_options,
)

# Per-employee task records (provides `taskscompleted`, used further down).
tasks_df = spark.read.csv(
    "/Volumes/workspace/default/nithyashree/tasks.csv",
    **csv_options,
)

CLEAN DATA

In [0]:
from pyspark.sql.functions import col, to_timestamp

# Parse both clock columns into Spark timestamps using one shared pattern.
clock_pattern = "yyyy-MM-dd HH:mm:ss"
for clock_column in ("clockin", "clockout"):
    attendance_df = attendance_df.withColumn(
        clock_column, to_timestamp(col(clock_column), clock_pattern)
    )

# Casting a timestamp to long gives epoch seconds, so the difference is the
# time between clock-in and clock-out in seconds; dividing by 3600 converts
# it to hours.
shift_seconds = col("clockout").cast("long") - col("clockin").cast("long")
attendance_df = attendance_df.withColumn("workhours", shift_seconds / 3600)

JOIN DATA

In [0]:
# Keep only employees present in both inputs; `employeeid` appears once in
# the result because the join key is given by name.
# NOTE(review): the key is `employeeid` alone. If both inputs can hold several
# rows per employee (e.g. one per day), rows will multiply -- confirm the grain.
combined_df = attendance_df.join(
    tasks_df,
    on=["employeeid"],
    how="inner",
)

AGGREGATE DEPARTMENT-LEVEL METRICS

In [0]:
from pyspark.sql.functions import avg, count, col, unix_timestamp

# Derive "workhours": seconds between clock-in and clock-out, in hours.
# NOTE(review): the CLEAN DATA cell already adds an equivalent `workhours`
# column to attendance_df, and the KPI cell below derives it once more, so
# this overwrite looks redundant -- confirm before removing.
seconds_on_clock = unix_timestamp(col("clockout")) - unix_timestamp(col("clockin"))
combined_df = combined_df.withColumn("workhours", seconds_on_clock / 3600)

In [0]:
# The productivity score is a straight copy of the completed-task count.
# NOTE(review): the KPI cell below adds this same column again.
tasks_done = col("taskscompleted")
combined_df = combined_df.withColumn("productivity_score", tasks_done)

In [0]:
from pyspark.sql.functions import avg, count, col, unix_timestamp

# Derived columns, attached in a single chain:
#   workhours          -- seconds between clock-in and clock-out, in hours
#   productivity_score -- copy of `taskscompleted`
elapsed_seconds = unix_timestamp(col("clockout")) - unix_timestamp(col("clockin"))
combined_df = (
    combined_df
    .withColumn("workhours", elapsed_seconds / 3600.0)
    .withColumn("productivity_score", col("taskscompleted"))
)

# Department-level KPIs: mean hours, mean productivity, and the number of
# joined rows (non-null `employeeid` values) per department.
kpi_aggregates = [
    avg("workhours").alias("avg_workhours"),
    avg("productivity_score").alias("avg_productivity"),
    count("employeeid").alias("records_count"),
]
department_kpis = combined_df.groupBy("department").agg(*kpi_aggregates)

# Print the KPI table (row order is not fixed, since no sort is applied).
department_kpis.show()

+----------+-----------------+----------------+-------------+
|department|    avg_workhours|avg_productivity|records_count|
+----------+-----------------+----------------+-------------+
|        HR|              8.5|             3.0|            2|
|  Accounts|8.166666666666666|             2.0|            1|
|        IT|8.416666666666666|             4.5|            2|
+----------+-----------------+----------------+-------------+



SAVE OUTPUT

In [0]:
# Persist the KPI table in Delta format, replacing any previous output at this path.
department_kpis.write.save(
    "/Volumes/workspace/default/nithyashree/department_kpis_delta",
    format="delta",
    mode="overwrite",
)

In [0]:
# Also write the KPI table as Parquet files, replacing any previous output at this path.
department_kpis.write.parquet(
    "/Volumes/workspace/default/nithyashree/department_kpis_parquet",
    mode="overwrite",
)