In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.window import Window as W

In [0]:
spark = SparkSession.builder.appName("NB-1").getOrCreate()
spark

#**Dataset**

In [0]:
data = [
("Ananya", "HR", 52000),
("Rahul", "Engineering", 65000),
("Priya", "Engineering", 60000),
("Zoya", "Marketing", 48000),
("Karan", "HR", 53000),
("Naveen", "Engineering", 70000),
("Fatima", "Marketing", 45000)
]
columns = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)

In [0]:
performance = [
("Ananya", 2023, 4.5),
("Rahul", 2023, 4.9),
("Priya", 2023, 4.3),
("Zoya", 2023, 3.8),
("Karan", 2023, 4.1),
("Naveen", 2023, 4.7),
("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)

In [0]:
project_data = [
("Ananya", "HR Portal", 120),
("Rahul", "Data Platform", 200),
("Priya", "Data Platform", 180),
("Zoya", "Campaign Tracker", 100),
("Karan", "HR Portal", 130),
("Naveen", "ML Pipeline", 220),
("Fatima", "Campaign Tracker", 90)
]
columns_proj = ["Name", "Project", "HoursWorked"]
df_proj = spark.createDataFrame(project_data, columns_proj)

#**Joins and Advanced Aggregations**

In [0]:
# 1. Join employee_data , performance_data , and project_data .
df_joined = df.join(df_perf, on="Name", how="inner").join(df_proj, on="Name", how="inner")
df_joined.show()

+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+



In [0]:
# 2. Compute total hours worked per department.
df_joined.groupby("Department").agg(
    F.sum("HoursWorked").alias("Total Hours")
).show()

+-----------+-----------+
| Department|Total Hours|
+-----------+-----------+
|         HR|        250|
|Engineering|        600|
|  Marketing|        190|
+-----------+-----------+



In [0]:
# 3. Compute average rating per project.
df_joined.groupby("Project").agg(
    F.round(F.mean("Rating"), 1).alias("Average rating")
).show()

+----------------+--------------+
|         Project|Average rating|
+----------------+--------------+
|       HR Portal|           4.3|
|   Data Platform|           4.6|
|Campaign Tracker|           3.8|
|     ML Pipeline|           4.7|
+----------------+--------------+



#**Handling Missing Data**

In [0]:
# 4. Add a row to performance_data with a None rating.
row = spark.createDataFrame(Row(["Tharun", 2023, None]), df_perf.schema)
df_perf = df_perf.union(row)
df_perf.show()

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
|Tharun|2023|  NULL|
+------+----+------+



In [0]:
# 5. Filter rows with null values.
df_perf.filter(df_perf.Rating.isNull()).show()

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Tharun|2023|  NULL|
+------+----+------+



In [0]:
# 6. Replace null ratings with the department average.
deptAverage = df_joined.groupby("Department").agg(
    F.mean("Rating").alias("Average Rating")
)
deptAverage = deptAverage.filter(deptAverage.Department == "Engineering").first()[1]

df_perf.fillna(round(deptAverage, 1), subset="Rating").show()

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
|Tharun|2023|   4.6|
+------+----+------+



#**Built-In Functions and UDF**

In [0]:
# 7. Create a column PerformanceCategory :
# Excellent (>=4.7),
# Good (4.0–4.69),
# Average (<4.0)
df_joined.withColumn(
  "PerformanceCategory",
  F.when(df_joined.Rating >= 4.7, "Excellent").when(df_joined.Rating < 4.0, "Average").otherwise("Good")
  ).select(["Name", "Rating", "PerformanceCategory"]).show()

+------+------+-------------------+
|  Name|Rating|PerformanceCategory|
+------+------+-------------------+
|Ananya|   4.5|               Good|
| Rahul|   4.9|          Excellent|
| Priya|   4.3|               Good|
|  Zoya|   3.8|            Average|
| Karan|   4.1|               Good|
|Naveen|   4.7|          Excellent|
|Fatima|   3.9|            Average|
+------+------+-------------------+



In [0]:
# 8. Create a UDF to assign bonus:
# If project hours > 200 → 10,000
# Else → 5,000
def bonousAssigner(project_hours):
    if project_hours >= 200:
        return 10_000
    return 5_000

bonous = F.udf(bonousAssigner)

df_joined.withColumn("Bonous", bonous("HoursWorked")).show()

+------+-----------+------+----+------+----------------+-----------+------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|Bonous|
+------+-----------+------+----+------+----------------+-----------+------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|  5000|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200| 10000|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|  5000|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|  5000|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|  5000|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220| 10000|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|  5000|
+------+-----------+------+----+------+----------------+-----------+------+



#**Date and Time Functions**

In [0]:
# 9. Add a column JoinDate with 2021-06-01 for all, then add MonthsWorked as difference from today.
df = df.withColumn("JoinDate", F.lit("2021-06-01").cast("date"))
df = df.withColumn("MonthsWorked", F.round(F.months_between(F.current_date(), df.JoinDate)))
df.show()

+------+-----------+------+----------+------------+
|  Name| Department|Salary|  JoinDate|MonthsWorked|
+------+-----------+------+----------+------------+
|Ananya|         HR| 52000|2021-06-01|        48.0|
| Rahul|Engineering| 65000|2021-06-01|        48.0|
| Priya|Engineering| 60000|2021-06-01|        48.0|
|  Zoya|  Marketing| 48000|2021-06-01|        48.0|
| Karan|         HR| 53000|2021-06-01|        48.0|
|Naveen|Engineering| 70000|2021-06-01|        48.0|
|Fatima|  Marketing| 45000|2021-06-01|        48.0|
+------+-----------+------+----------+------------+



In [0]:
# 10. Calculate how many employees joined before 2022.
df.filter(F.year(df.JoinDate) < 2022).show()

+------+-----------+------+----------+------------+
|  Name| Department|Salary|  JoinDate|MonthsWorked|
+------+-----------+------+----------+------------+
|Ananya|         HR| 52000|2021-06-01|        48.0|
| Rahul|Engineering| 65000|2021-06-01|        48.0|
| Priya|Engineering| 60000|2021-06-01|        48.0|
|  Zoya|  Marketing| 48000|2021-06-01|        48.0|
| Karan|         HR| 53000|2021-06-01|        48.0|
|Naveen|Engineering| 70000|2021-06-01|        48.0|
|Fatima|  Marketing| 45000|2021-06-01|        48.0|
+------+-----------+------+----------+------------+



#**Unions**

In [0]:
# 11. Create another small team DataFrame and union() it with employee_data .
extra_employees = [
("Meena", "HR", 48000, None, None),
("Raj", "Marketing", 51000, None, None)
]
small_team = spark.createDataFrame(extra_employees, df.schema)

df = df.union(small_team)
df.show()

+------+-----------+------+----------+------------+
|  Name| Department|Salary|  JoinDate|MonthsWorked|
+------+-----------+------+----------+------------+
|Ananya|         HR| 52000|2021-06-01|        48.0|
| Rahul|Engineering| 65000|2021-06-01|        48.0|
| Priya|Engineering| 60000|2021-06-01|        48.0|
|  Zoya|  Marketing| 48000|2021-06-01|        48.0|
| Karan|         HR| 53000|2021-06-01|        48.0|
|Naveen|Engineering| 70000|2021-06-01|        48.0|
|Fatima|  Marketing| 45000|2021-06-01|        48.0|
| Meena|         HR| 48000|      NULL|        NULL|
|   Raj|  Marketing| 51000|      NULL|        NULL|
+------+-----------+------+----------+------------+



#**Saving Results**

In [0]:
# 12. Save the final merged dataset (all 3 joins) as a partitioned Parquet file based on Department .
df_joined.write.mode("overwrite").parquet("df_joined_Data", partitionBy="Department")