#**Assignment-3**

In [0]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window as W
from pyspark.sql.types import *

In [0]:
spark = SparkSession.builder.appName("dbshell-01").getOrCreate()

# **Basics**

In [0]:
# 1. Load the CSV using inferred schema.
dfEmp = spark.read.csv("/FileStore/tables/employee_timesheet.csv", header=True, inferSchema=True)
dfEmp.printSchema()

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)



In [0]:
# 2. Load the same file with schema explicitly defined.
schema = StructType(
    [
        StructField("EmployeeID", StringType()),
        StructField("Name", StringType()),
        StructField("Department", StringType()),
        StructField("Project", StringType()),
        StructField("WorkHours", IntegerType()),
        StructField("WorkDate", DateType()),
        StructField("Location", StringType()),
        StructField("Mode", StringType())
    ]
)
dfExpSchema = spark.read.csv("/FileStore/tables/employee_timesheet.csv", header=True, schema=schema)
dfExpSchema.printSchema()

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)



In [0]:
# 3. Add a new column Weekday extracted from WorkDate .
dfEmp = dfEmp.withColumn("WeekDay", F.dayofweek(F.col("WorkDate")))
dfEmp.show()

+----------+-----+----------+-------+---------+----------+---------+------+-------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|WeekDay|
+----------+-----+----------+-------+---------+----------+---------+------+-------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|      4|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|      4|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|      5|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|      6|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|      6|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|      7|
+----------+-----+----------+-------+---------+----------+---------+------+-------+



# **Aggregations & Grouping**

In [0]:
# 4. Calculate total work hours by employee.
dfEmp.groupBy(["EmployeeID", "Name"]).agg(
  F.sum("WorkHours").alias("TotalHours")
).show()

+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E102|  Raj|        15|
|      E101|Anita|        17|
+----------+-----+----------+



In [0]:
# 5. Calculate average work hours per department.
dfEmp.groupBy("Department").agg(
    F.round(F.mean("WorkHours"), 2).alias("AverageWorkHour")
).show()

+----------+---------------+
|Department|AverageWorkHour|
+----------+---------------+
|        HR|            7.5|
|   Finance|            5.0|
|        IT|           7.67|
+----------+---------------+



In [0]:
# 6. Get top 2 employees by total hours using window function.
win = W.orderBy(F.desc("TotalHours"))
dfEmp.groupBy(["EmployeeID", "Name"]) \
    .agg(F.sum("WorkHours").alias("TotalHours")) \
    .withColumn("Rank", F.rank().over(win)) \
    .sort("Rank") \
    .show(2)

+----------+-----+----------+----+
|EmployeeID| Name|TotalHours|Rank|
+----------+-----+----------+----+
|      E101|Anita|        17|   1|
|      E102|  Raj|        15|   2|
+----------+-----+----------+----+
only showing top 2 rows



# **Date Operations**

In [0]:
# 7. Filter entries where WorkDate falls on a weekend.
dfEmp.filter((dfEmp.WeekDay == 6) | (dfEmp.WeekDay == 7)).show()

+----------+-----+----------+-------+---------+----------+---------+------+-------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|WeekDay|
+----------+-----+----------+-------+---------+----------+---------+------+-------+
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|      6|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|      6|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|      7|
+----------+-----+----------+-------+---------+----------+---------+------+-------+



In [0]:
# 8. Calculate running total of hours per employee using window.
win3 = W.partitionBy("EmployeeID").orderBy("WorkDate")

dfEmp.withColumn("RunningTotal", F.sum("WorkHours").over(win3)).show()

+----------+-----+----------+-------+---------+----------+---------+------+-------+------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|WeekDay|RunningTotal|
+----------+-----+----------+-------+---------+----------+---------+------+-------+------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|      4|           8|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|      6|          17|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|      4|           7|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|      7|          15|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|      5|           5|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|      6|           6|
+----------+-----+----------+-------+---------+----------+---------+------+-------+------------+



# **Joining DataFrames**

In [0]:
data = [("IT", "Anand"), ("HR", "Shruthi"), ("Finance", "Kamal")]
columns = ["Department", "DeptHead"]
department_location = spark.createDataFrame(data, columns)
department_location.show()

+----------+--------+
|Department|DeptHead|
+----------+--------+
|        IT|   Anand|
|        HR| Shruthi|
|   Finance|   Kamal|
+----------+--------+



In [0]:
# 10. Join with timesheet data and list all employees with their DeptHead.
dfJoined = dfEmp.join(department_location, on="Department", how="inner")
dfJoined.select(["Name", "Department", "DeptHead"]).show()

+-----+----------+--------+
| Name|Department|DeptHead|
+-----+----------+--------+
|Meena|        IT|   Anand|
|Anita|        IT|   Anand|
|Anita|        IT|   Anand|
|  Raj|        HR| Shruthi|
|  Raj|        HR| Shruthi|
| John|   Finance|   Kamal|
+-----+----------+--------+



# **Pivot & Unpivot**

In [0]:
# 11. Pivot table: total hours per employee per project.
dfPivot = dfEmp.groupBy("Name") \
  .pivot("Project") \
  .agg(F.sum("WorkHours")) \
  .fillna(0)

dfPivot.show()

+-----+-----+----+-----+
| Name|Alpha|Beta|Gamma|
+-----+-----+----+-----+
| John|    5|   0|    0|
|Anita|   17|   0|    0|
|  Raj|    0|  15|    0|
|Meena|    0|   0|    6|
+-----+-----+----+-----+



In [0]:
# 12. Unpivot example: Convert mode-specific hours into rows.
dfPivot.unpivot("Name", ["Alpha", "Beta", "Gamma"], "Project", "TotalHours").show()

+-----+-------+----------+
| Name|Project|TotalHours|
+-----+-------+----------+
| John|  Alpha|         5|
| John|   Beta|         0|
| John|  Gamma|         0|
|Anita|  Alpha|        17|
|Anita|   Beta|         0|
|Anita|  Gamma|         0|
|  Raj|  Alpha|         0|
|  Raj|   Beta|        15|
|  Raj|  Gamma|         0|
|Meena|  Alpha|         0|
|Meena|   Beta|         0|
|Meena|  Gamma|         6|
+-----+-------+----------+



# **UDF & Conditional Logic**

In [0]:
# 13. Create a UDF to classify work hours:
def workloadTag(hours):
  if hours >= 8:
    return "Full"
  if hours >=4:
    return "Partial"
  return "Light"

tagger = F.udf(workloadTag, StringType())

In [0]:
# 14. Add a column WorkloadCategory using this UDF.
dfEmp.withColumn("WorkLoadCategory", tagger(F.col("WorkHours"))).show()

+----------+-----+----------+-------+---------+----------+---------+------+-------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|WeekDay|WorkLoadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+-------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|      4|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|      4|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|      5|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|      6|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|      6|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|      7|            Full|
+----------+-----+----------+-------+---------+----------+---------+------+-------+--------

# **Nulls and Cleanup**

In [0]:
# 15. Introduce some nulls in Mode column.
dfNulls = dfEmp.withColumn("Mode", F.when(dfEmp.Mode == "Onsite", None).otherwise(dfEmp.Mode))
dfNulls.show()

+----------+-----+----------+-------+---------+----------+---------+------+-------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|WeekDay|
+----------+-----+----------+-------+---------+----------+---------+------+-------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|      4|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|  null|      4|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|      5|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|      6|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|  null|      6|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|      7|
+----------+-----+----------+-------+---------+----------+---------+------+-------+



In [0]:
# 16. Fill nulls with "Not Provided".
dfNulls = dfNulls.fillna("Not Provided", subset="Mode")
dfNulls.show()

+----------+-----+----------+-------+---------+----------+---------+------------+-------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|WeekDay|
+----------+-----+----------+-------+---------+----------+---------+------------+-------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|      4|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|      4|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote|      5|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|      6|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Not Provided|      6|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote|      7|
+----------+-----+----------+-------+---------+----------+---------+------------+-------+



In [0]:
# 17. Drop rows where WorkHours < 4.
dfNew = dfNulls.filter(dfNulls.WorkHours >= 4)
dfNew.show()

+----------+-----+----------+-------+---------+----------+---------+------------+-------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|WeekDay|
+----------+-----+----------+-------+---------+----------+---------+------------+-------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|      4|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|      4|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote|      5|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|      6|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Not Provided|      6|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote|      7|
+----------+-----+----------+-------+---------+----------+---------+------------+-------+



# **Advanced Conditions**

In [0]:
# 18. Use when-otherwise to mark employees as "Remote Worker" if >80% entries are Remote.
dfEmp.groupBy(["EmployeeID", "Name"]) \
  .agg(
  F.sum(F.when(F.col("Mode") == "Remote", 1).otherwise(0)).alias("RemoteWork"),
  F.count("Mode").alias("TotalWork")
) \
.withColumn("Percentage", F.when((F.col("RemoteWork") / F.col("TotalWork")) > 0.80, "RemoteWorker").otherwise("NotRemote"))  \
.show()

+----------+-----+----------+---------+------------+
|EmployeeID| Name|RemoteWork|TotalWork|  Percentage|
+----------+-----+----------+---------+------------+
|      E103| John|         1|        1|RemoteWorker|
|      E104|Meena|         0|        1|   NotRemote|
|      E102|  Raj|         1|        2|   NotRemote|
|      E101|Anita|         2|        2|RemoteWorker|
+----------+-----+----------+---------+------------+



In [0]:
# 19. Add a new column ExtraHours where hours > 8.
dfEmp.withColumn("ExtraHours", F.when(F.col("WorkHours") > 8, F.col("WorkHours") - 8).otherwise(0)).show()

+----------+-----+----------+-------+---------+----------+---------+------+-------+----------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|WeekDay|ExtraHours|
+----------+-----+----------+-------+---------+----------+---------+------+-------+----------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|      4|         0|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|      4|         0|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|      5|         0|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|      6|         1|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|      6|         0|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|      7|         0|
+----------+-----+----------+-------+---------+----------+---------+------+-------+----------+



# **Union + Duplicate Handling**

In [0]:
# 20. Append a dummy timesheet for new interns using unionByName() .
interData = [
  ("E108","Ram","IT","Alpha",8,"2024-05-01","Bangalore","Remote"),
  ("E109","Eren","HR","Beta",7,"2024-05-01","Mumbai","Onsite")
]
columns = ["EmployeeID","Name","Department","Project","WorkHours","WorkDate","Location","Mode"]

dfInterns = spark.createDataFrame(interData, columns)

dfInEmp = dfEmp.drop("WeekDay").unionByName(dfInterns)
dfInEmp.show()

+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
|      E108|  Ram|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E109| Eren|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
+----------+-----+----------+-------+---------+----------+---------+------+



In [0]:
# 21. Remove duplicate rows based on all columns.
dfInEmp.dropDuplicates().show()

+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E108|  Ram|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E109| Eren|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
+----------+-----+----------+-------+---------+----------+---------+------+

