In [0]:
from pyspark.sql import SparkSession

# getOrCreate() hands back a session for this notebook, registered under the
# application name "Employee_data".
spark = (
    SparkSession.builder
    .appName("Employee_data")
    .getOrCreate()
)

In [0]:
# Employee master data: one (Name, Department, Salary) tuple per employee.
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
columns = ["Name", "Department", "Salary"]

# Only column names are given here, so the column types come from the tuples.
df = spark.createDataFrame(data, schema=columns)


In [0]:
# Performance reviews: one (Name, Year, Rating) tuple per employee.
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
columns_perf = ["Name", "Year", "Rating"]

# "Name" is the key shared with the employee and project tables.
df_perf = spark.createDataFrame(performance, schema=columns_perf)

In [0]:
# Project assignments: one (Name, Project, HoursWorked) tuple per employee.
project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90),
]
columns_proj = ["Name", "Project", "HoursWorked"]

# "Name" is the key shared with the employee and performance tables.
df_proj = spark.createDataFrame(project_data, schema=columns_proj)


In [0]:
# Joins and Advanced Aggregations
# 1. Join employee_data, performance_data, and project_data.
# Both joins are inner joins on "Name", so only employees present in all
# three tables appear in the result.
employee_perf = df.join(df_perf, on="Name", how="inner")
full_join = employee_perf.join(df_proj, on="Name", how="inner")

full_join.show(truncate=False)

# 2. Compute total hours worked per department.
hours_by_department = full_join.groupBy("Department").sum("HoursWorked")
# Replace the generated "sum(HoursWorked)" column name with a readable one.
hours_by_department.withColumnRenamed("sum(HoursWorked)", "TotalHoursWorked").show()

# 3. Compute average rating per project.
rating_by_project = full_join.groupBy("Project").avg("Rating")
# Replace the generated "avg(Rating)" column name with a readable one.
rating_by_project.withColumnRenamed("avg(Rating)", "AverageRating").show()


+------+-----------+------+----+------+----------------+-----------+
|Name  |Department |Salary|Year|Rating|Project         |HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|HR         |52000 |2023|4.5   |HR Portal       |120        |
|Rahul |Engineering|65000 |2023|4.9   |Data Platform   |200        |
|Priya |Engineering|60000 |2023|4.3   |Data Platform   |180        |
|Zoya  |Marketing  |48000 |2023|3.8   |Campaign Tracker|100        |
|Karan |HR         |53000 |2023|4.1   |HR Portal       |130        |
|Naveen|Engineering|70000 |2023|4.7   |ML Pipeline     |220        |
|Fatima|Marketing  |45000 |2023|3.9   |Campaign Tracker|90         |
+------+-----------+------+----+------+----------------+-----------+

+-----------+----------------+
| Department|TotalHoursWorked|
+-----------+----------------+
|         HR|             250|
|Engineering|             600|
|  Marketing|             190|
+-----------+----------------+

+----------------+----

In [0]:
# Handling Missing Data
# 4. Add a row to performance_data with a None rating.
from pyspark.sql import Row
from pyspark.sql.functions import col

# "Ramya" is also added to the employee table (with no salary) so that she has
# a department on record. Reusing df.schema / df_perf.schema keeps the existing
# column types for the None values.
new_row1 = spark.createDataFrame([("Ramya", "HR", None)], df.schema)
# Remove any "Ramya" row left by a previous run of this cell before appending:
# union() does not de-duplicate, so re-running the cell would otherwise stack
# duplicate rows that multiply in every later join on "Name".
df = df.filter(col("Name") != "Ramya").union(new_row1)

new_row2 = spark.createDataFrame([("Ramya", 2023, None)], df_perf.schema)
df_perf = df_perf.filter(col("Name") != "Ramya").union(new_row2)

df.show()
df_perf.show()

# 5. Filter rows with null values.
df.filter(col("Salary").isNull()).show()
df_perf.filter(col("Rating").isNull()).show()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
| Ramya|         HR|  NULL|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
| Ramya|2023|  NULL|
+------+----+------+

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
|Ramya|        HR|  NULL|
+-----+----------+------+

+-----+----+------+
| Name|Year|Rating|
+-----+----+------+
|Ramya|2023|  NULL|
+-----+----+------+



In [0]:
# 6. Replace null ratings with the department average.
from pyspark.sql.functions import col, avg, when

# Attach each employee's department to their performance record; the left
# join keeps every performance row.
df_join = df_perf.join(df.select("Name", "Department"), on="Name", how="left")

# Mean rating per department.
dept_avg = df_join.groupBy("Department").agg(avg("Rating").alias("DeptAvg"))

# Keep an existing rating; fall back to the department mean only when the
# rating is null.
filled_rating = when(col("Rating").isNotNull(), col("Rating")).otherwise(col("DeptAvg"))
final_df = (
    df_join
    .join(dept_avg, on="Department", how="left")
    .withColumn("Rating", filled_rating)
)

final_df.select("Name", "Department", "Rating").show()


+------+-----------+------+
|  Name| Department|Rating|
+------+-----------+------+
|Ananya|         HR|   4.5|
| Rahul|Engineering|   4.9|
| Priya|Engineering|   4.3|
|  Zoya|  Marketing|   3.8|
| Karan|         HR|   4.1|
|Naveen|Engineering|   4.7|
|Fatima|  Marketing|   3.9|
| Ramya|         HR|   4.3|
+------+-----------+------+



In [0]:
# Built-In Functions and UDF
# 7. Create a column PerformanceCategory:
#    Excellent (>= 4.7),
#    Good (4.0–4.69),
#    Average (< 4.0)

from pyspark.sql.functions import when

rating_col = col("Rating")

# There is no otherwise() branch: a null rating matches none of the three
# conditions and therefore produces a null category.
category_expr = (
    when(rating_col >= 4.7, "Excellent")
    .when((rating_col >= 4.0) & (rating_col < 4.7), "Good")
    .when(rating_col < 4.0, "Average")
)

perf_category = df_perf.withColumn("PerformanceCategory", category_expr)
perf_category.show()

# 8. Create a UDF to assign a bonus:
#    if project hours > 200 → 10,000
#    else → 5,000
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def assign_bonus(hours):
    """Return the bonus amount for a number of project hours.

    Args:
        hours: Hours worked on the project; may be None.

    Returns:
        10000 when ``hours`` is strictly greater than 200, otherwise 5000.
        A missing or zero value also yields 5000.
    """
    # None (a null cell) and 0 are both falsy and get the base bonus.
    if not hours:
        return 5000
    if hours > 200:
        return 10000
    return 5000

# Wrap the Python function as a Spark UDF with an integer return type.
bonus_udf = udf(assign_bonus, returnType=IntegerType())

# Compute the bonus from each row's HoursWorked value and display the result.
project_bonus = df_proj.withColumn("Bonus", bonus_udf(col("HoursWorked")))
project_bonus.show()



+------+----+------+-------------------+
|  Name|Year|Rating|PerformanceCategory|
+------+----+------+-------------------+
|Ananya|2023|   4.5|               Good|
| Rahul|2023|   4.9|          Excellent|
| Priya|2023|   4.3|               Good|
|  Zoya|2023|   3.8|            Average|
| Karan|2023|   4.1|               Good|
|Naveen|2023|   4.7|          Excellent|
|Fatima|2023|   3.9|            Average|
| Ramya|2023|  NULL|               NULL|
+------+----+------+-------------------+

+------+----------------+-----------+-----+
|  Name|         Project|HoursWorked|Bonus|
+------+----------------+-----------+-----+
|Ananya|       HR Portal|        120| 5000|
| Rahul|   Data Platform|        200| 5000|
| Priya|   Data Platform|        180| 5000|
|  Zoya|Campaign Tracker|        100| 5000|
| Karan|       HR Portal|        130| 5000|
|Naveen|     ML Pipeline|        220|10000|
|Fatima|Campaign Tracker|         90| 5000|
+------+----------------+-----------+-----+



In [0]:
# Date and Time Functions
# 9. Add a column JoinDate with 2021-06-01 for all, then add MonthsWorked as difference from today.
# Names are imported explicitly: a wildcard import from pyspark.sql.functions
# would shadow Python builtins such as sum, min, max, round and abs for every
# cell that runs afterwards.
from pyspark.sql.functions import col, current_date, lit, months_between, to_date, year

df_joined = df.withColumn("JoinDate", to_date(lit("2021-06-01")))

# MonthsWorked is measured against current_date(), so its value depends on the
# day the cell is run. The result is cast to int for display as a whole number.
df_joined = df_joined.withColumn(
    "MonthsWorked",
    months_between(current_date(), col("JoinDate")).cast("int"),
)

df_joined.select("Name", "JoinDate", "MonthsWorked").show()

# 10. Calculate how many employees joined before 2022.
# Left as the last expression so the notebook displays the count.
df_joined.filter(year("JoinDate") < 2022).count()


+------+----------+------------+
|  Name|  JoinDate|MonthsWorked|
+------+----------+------------+
|Ananya|2021-06-01|          48|
| Rahul|2021-06-01|          48|
| Priya|2021-06-01|          48|
|  Zoya|2021-06-01|          48|
| Karan|2021-06-01|          48|
|Naveen|2021-06-01|          48|
|Fatima|2021-06-01|          48|
| Ramya|2021-06-01|          48|
+------+----------+------------+



8

In [0]:
# Unions
# 11. Create another small team DataFrame and union() it with employee_data.
# extra_employees = [
#     ("Meena", "HR", 48000),
#     ("Raj", "Marketing", 51000)]
# Build the extra rows with the employee table's own schema so the columns
# line up for union().
extra_emp = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000),
]
df_extra = spark.createDataFrame(extra_emp, schema=df.schema)

# The combined table is only displayed here; df itself is not reassigned.
df.union(df_extra).show()



+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
| Ramya|         HR|  NULL|
| Meena|         HR| 48000|
|   Raj|  Marketing| 51000|
+------+-----------+------+



In [0]:
# Saving Results
# 12. Save the final merged dataset (all 3 joins) as a partitioned Parquet file based
# on Department.

# Employees joined with their performance rows, then with their project rows.
# Both joins are inner joins on "Name".
df_join = df.join(df_perf, on="Name", how="inner")
df_final = df_join.join(df_proj, on="Name", how="inner")
df_final.show()

# Save to Parquet partitioned by Department, using "overwrite" mode for the
# target path.
(
    df_final.write
    .mode("overwrite")
    .partitionBy("Department")
    .parquet("/tmp/final_employee_data")
)


+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+

