In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("EmployeeTimesheet")
    .getOrCreate()
)

# First pass over the timesheet: use the header row for column names and let
# Spark infer the column types, then print the inferred schema.
df = spark.read.csv("employee_timesheet.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)



In [2]:
# Explicit schema for the timesheet CSV: (column name, Spark type) pairs in
# file order. Every column is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("EmployeeID", StringType()),
        ("Name", StringType()),
        ("Department", StringType()),
        ("Project", StringType()),
        ("WorkHours", IntegerType()),
        ("WorkDate", DateType()),
        ("Location", StringType()),
        ("Mode", StringType()),
    ]
])

# Re-read the same file with the declared schema instead of inferring it.
df_explicit = spark.read.csv("employee_timesheet.csv", schema=schema, header=True)


In [3]:
# Add the full day name of WorkDate ("EEEE" pattern, e.g. "Saturday") as Weekday.
df_explicit = df_explicit.withColumn(
    "Weekday",
    date_format(col("WorkDate"), "EEEE"),
)


Aggregations & Grouping

In [4]:
# Total hours logged per employee.
# `sum` is the Spark column aggregate brought in by the wildcard import of
# pyspark.sql.functions, not Python's builtin.
(
    df_explicit
    .groupBy("EmployeeID", "Name")
    .agg(sum(col("WorkHours")).alias("TotalHours"))
    .show()
)


+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E102|  Raj|        15|
|      E101|Anita|        17|
+----------+-----+----------+



In [5]:
# Average hours per timesheet row, grouped by department.
(
    df_explicit
    .groupBy("Department")
    .agg(avg(col("WorkHours")).alias("AvgHours"))
    .show()
)


+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        HR|              7.5|
|   Finance|              5.0|
|        IT|7.666666666666667|
+----------+-----------------+



In [6]:
# Total hours per employee; this is the input to the ranking below.
df_hours = (
    df_explicit
    .groupBy("EmployeeID", "Name")
    .agg(sum("WorkHours").alias("TotalHours"))
)

# The window has no partitionBy, so the rank is global across all employees,
# ordered from the highest TotalHours down.
windowSpec = Window.orderBy(col("TotalHours").desc())

# dense_rank gives tied totals the same rank, so "Rank <= 2" can return more
# than two rows when totals tie.
(
    df_hours
    .withColumn("Rank", dense_rank().over(windowSpec))
    .filter(col("Rank") <= 2)
    .show()
)


+----------+-----+----------+----+
|EmployeeID| Name|TotalHours|Rank|
+----------+-----+----------+----+
|      E101|Anita|        17|   1|
|      E102|  Raj|        15|   2|
+----------+-----+----------+----+



Date Operations

In [7]:
# Rows whose Weekday falls on the weekend.
df_explicit.where(col("Weekday").isin(["Saturday", "Sunday"])).show()

+----------+----+----------+-------+---------+----------+--------+------+--------+
|EmployeeID|Name|Department|Project|WorkHours|  WorkDate|Location|  Mode| Weekday|
+----------+----+----------+-------+---------+----------+--------+------+--------+
|      E102| Raj|        HR|   Beta|        8|2024-05-04|  Mumbai|Remote|Saturday|
+----------+----+----------+-------+---------+----------+--------+------+--------+



In [8]:
# Per-employee window ordered by date, with a frame that runs from the first
# row of the partition up to and including the current row.
windowEmp = (
    Window
    .partitionBy("EmployeeID")
    .orderBy("WorkDate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Cumulative hours for each employee over time.
df_running = df_explicit.withColumn(
    "RunningTotal",
    sum(col("WorkHours")).over(windowEmp),
)
df_running.select("EmployeeID", "WorkDate", "WorkHours", "RunningTotal").show()


+----------+----------+---------+------------+
|EmployeeID|  WorkDate|WorkHours|RunningTotal|
+----------+----------+---------+------------+
|      E101|2024-05-01|        8|           8|
|      E101|2024-05-03|        9|          17|
|      E102|2024-05-01|        7|           7|
|      E102|2024-05-04|        8|          15|
|      E103|2024-05-02|        5|           5|
|      E104|2024-05-03|        6|           6|
+----------+----------+---------+------------+



Joining DataFrames

In [9]:
# Department lookup table. The header row supplies the column names; no schema
# is given and inference is off, so every column is read as a string.
dept_df = spark.read.csv("department_location.csv", header=True)


In [10]:
# Left join keeps every timesheet row, including rows whose Department has no
# match in dept_df.
df_joined = df_explicit.join(dept_df, "Department", "left")

employee_dept_cols = ["EmployeeID", "Name", "Department", "DeptHead"]
df_joined.select(*employee_dept_cols).show()


+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+



Pivot & Unpivot

In [11]:
# One row per employee and one column per distinct Project value; each cell is
# the summed WorkHours, or NULL where the employee has no rows for that project.
(
    df_explicit
    .groupBy("EmployeeID")
    .pivot("Project")
    .agg(sum(col("WorkHours")))
    .show()
)


+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E103|    5|NULL| NULL|
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
+----------+-----+----+-----+



In [12]:
# Pivot hours by work mode, then unpivot back to long form with stack().
#
# The pivot values are listed explicitly so that both the `Remote` and `Onsite`
# columns always exist. Without the list, Spark derives the columns from the
# data, and the stack() expression below fails with an unresolved-column error
# if either mode is missing from the input. An explicit list also saves Spark
# the extra job it would otherwise run to discover the distinct values.
df_mode_pivot = (
    df_explicit
    .groupBy("EmployeeID")
    .pivot("Mode", ["Remote", "Onsite"])
    .agg(sum("WorkHours"))
)

# stack(2, ...) emits two (ModeType, Hours) rows per employee, one per mode.
# Hours is NULL where the employee logged nothing in that mode.
df_unpivot = df_mode_pivot.select(
    col("EmployeeID"),
    expr("stack(2, 'Remote', Remote, 'Onsite', Onsite) as (ModeType, Hours)")
)
df_unpivot.show()

+----------+--------+-----+
|EmployeeID|ModeType|Hours|
+----------+--------+-----+
|      E103|  Remote|    5|
|      E103|  Onsite| NULL|
|      E104|  Remote| NULL|
|      E104|  Onsite|    6|
|      E101|  Remote|   17|
|      E101|  Onsite| NULL|
|      E102|  Remote|    8|
|      E102|  Onsite|    7|
+----------+--------+-----+



UDF & Conditional Logic

In [13]:
def workload_tag(hours):
    """Classify a day's logged hours into a workload category.

    Args:
        hours: Number of hours worked, or None when the value is missing.

    Returns:
        "Full" for 8 or more hours, "Partial" for 4 up to (but excluding) 8,
        "Light" for fewer than 4, and None when ``hours`` is None.
    """
    # Spark passes SQL NULL to a Python UDF as None. Comparing None with an int
    # raises TypeError in Python 3, so propagate the null instead of failing.
    if hours is None:
        return None
    if hours >= 8:
        return "Full"
    elif hours >= 4:
        return "Partial"
    else:
        return "Light"

# Wrap the Python classifier as a Spark UDF that returns a string column.
tag_udf = udf(workload_tag, returnType=StringType())

In [14]:
# Tag each timesheet row with the workload category for its WorkHours.
df_tagged = df_explicit.withColumn(
    "WorkloadCategory",
    tag_udf(col("WorkHours")),
)
df_tagged.select("EmployeeID", "WorkHours", "WorkloadCategory").show()


+----------+---------+----------------+
|EmployeeID|WorkHours|WorkloadCategory|
+----------+---------+----------------+
|      E101|        8|            Full|
|      E102|        7|         Partial|
|      E103|        5|         Partial|
|      E101|        9|            Full|
|      E104|        6|         Partial|
|      E102|        8|            Full|
+----------+---------+----------------+



Nulls and Cleanup

In [15]:
# Blank out Mode for employee E102 so there are nulls for the cleanup steps
# that follow; every other row keeps its original Mode.
df_with_nulls = df_explicit.withColumn(
    "Mode",
    when(col("EmployeeID") == "E102", None).otherwise(col("Mode")),
)


In [16]:
# Replace missing Mode values with a placeholder; other columns are untouched.
df_filled = df_with_nulls.na.fill("Not Provided", subset=["Mode"])
df_filled.select("EmployeeID", "Mode").show()


+----------+------------+
|EmployeeID|        Mode|
+----------+------------+
|      E101|      Remote|
|      E102|Not Provided|
|      E103|      Remote|
|      E101|      Remote|
|      E104|      Onsite|
|      E102|Not Provided|
+----------+------------+



In [17]:
# Keep rows with at least 4 logged hours. Rows with NULL WorkHours are dropped
# too, because the comparison evaluates to NULL for them.
at_least_four_hours = col("WorkHours") >= 4
df_filtered = df_filled.where(at_least_four_hours)
df_filtered.show()

+----------+-----+----------+-------+---------+----------+---------+------------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|      Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Not Provided| Saturday|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+



Advanced Conditions

In [18]:
# Fraction of each employee's timesheet rows that were logged as Remote:
# count of Remote rows divided by the count of all rows.
remote_ratio = (
    df_explicit
    .groupBy("EmployeeID")
    .agg(
        (sum(when(col("Mode") == "Remote", 1).otherwise(0)) / count("*")).alias("RemoteRatio")
    )
)

# Attach the ratio to every row and label the employee. The threshold is
# strict, so a ratio of exactly 0.8 is classified as "Mixed".
df_flagged = (
    df_explicit
    .join(remote_ratio, "EmployeeID")
    .withColumn(
        "WorkerType",
        when(col("RemoteRatio") > 0.8, "Remote Worker").otherwise("Mixed"),
    )
)

# The join repeats the ratio on every timesheet row; distinct() reduces the
# output to one line per employee.
df_flagged.select("EmployeeID", "RemoteRatio", "WorkerType").distinct().show()


+----------+-----------+-------------+
|EmployeeID|RemoteRatio|   WorkerType|
+----------+-----------+-------------+
|      E103|        1.0|Remote Worker|
|      E104|        0.0|        Mixed|
|      E101|        1.0|Remote Worker|
|      E102|        0.5|        Mixed|
+----------+-----------+-------------+



In [19]:
# Hours worked beyond an 8-hour day; 0 when the row is at or under 8 hours.
df_extra = df_explicit.withColumn(
    "ExtraHours",
    when(col("WorkHours") > 8, col("WorkHours") - 8).otherwise(0),
)
df_extra.select("EmployeeID", "WorkHours", "ExtraHours").show()


+----------+---------+----------+
|EmployeeID|WorkHours|ExtraHours|
+----------+---------+----------+
|      E101|        8|         0|
|      E102|        7|         0|
|      E103|        5|         0|
|      E101|        9|         1|
|      E104|        6|         0|
|      E102|        8|         0|
+----------+---------+----------+



Union + Duplicate Handling

In [21]:
from pyspark.sql.functions import lit

# A single intern timesheet row, with fields in the same order as the main
# timesheet columns. WorkDate starts as a string and is parsed below.
intern_data = [
    ("E200", "Tina", "IT", "Alpha", 4, "2024-05-05", "Chennai", "Remote")
]
intern_schema = ["EmployeeID", "Name", "Department", "Project", "WorkHours", "WorkDate", "Location", "Mode"]

intern_df = (
    spark.createDataFrame(intern_data, intern_schema)
    # With only column names supplied, Python ints are inferred as bigint.
    # Cast back to int so the union below does not silently widen WorkHours
    # from integer to bigint in the combined frame.
    .withColumn("WorkHours", col("WorkHours").cast("int"))
    # Parse the ISO date string into a real date, matching df_explicit.
    .withColumn("WorkDate", to_date("WorkDate"))
    # df_explicit already carries Weekday, and unionByName needs the same
    # set of columns on both sides.
    .withColumn("Weekday", date_format("WorkDate", "EEEE"))
)

# unionByName matches columns by name rather than by position.
df_combined = df_explicit.unionByName(intern_df)
df_combined.show()


+----------+-----+----------+-------+---------+----------+---------+------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|
|      E200| Tina|        IT|  Alpha|        4|2024-05-05|  Chennai|Remote|   Sunday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+

