# Employee Project Analysis using PySpark

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, DateType
from datetime import date

# Obtain the notebook's Spark session, creating one under this app name if none is active.
spark = (
    SparkSession.builder
    .appName("EmployeeProjectAnalysis")
    .getOrCreate()
)

## 1. Employee Data

In [0]:
# Employee roster: one (name, department, salary) tuple per person.
columns_emp = ["Name", "Department", "Salary"]
employee_data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]

# Column types are inferred from the tuples; only the column names are supplied.
df_emp = spark.createDataFrame(data=employee_data, schema=columns_emp)
df_emp.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



## 2. Performance Data

In [0]:
# Yearly performance ratings: one (name, year, rating) tuple per employee.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]

# Column types are inferred from the tuples; only the column names are supplied.
df_perf = spark.createDataFrame(data=performance, schema=columns_perf)
df_perf.show()

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+



## 3. Project Data

In [0]:
# Project assignments: one (name, project, hours worked) tuple per employee.
columns_proj = ["Name", "Project", "HoursWorked"]
project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90),
]

# Column types are inferred from the tuples; only the column names are supplied.
df_proj = spark.createDataFrame(data=project_data, schema=columns_proj)
df_proj.show()

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        120|
| Rahul|   Data Platform|        200|
| Priya|   Data Platform|        180|
|  Zoya|Campaign Tracker|        100|
| Karan|       HR Portal|        130|
|Naveen|     ML Pipeline|        220|
|Fatima|Campaign Tracker|         90|
+------+----------------+-----------+



## 4. Join All

In [0]:
# Combine the three frames on the shared "Name" key, one join at a time.
# Joining on a column name (rather than an expression) keeps a single "Name" column.
emp_with_perf = df_emp.join(df_perf, on="Name")
df_joined = emp_with_perf.join(df_proj, on="Name")
df_joined.show()

+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+



## 5. Total Hours by Department

In [0]:
# NOTE: `sum` is pyspark.sql.functions.sum (pulled in by the star import), not the Python builtin.
total_hours = sum("HoursWorked").alias("TotalHours")
hours_by_department = df_joined.groupBy("Department").agg(total_hours)
hours_by_department.show()

+-----------+----------+
| Department|TotalHours|
+-----------+----------+
|         HR|       250|
|Engineering|       600|
|  Marketing|       190|
+-----------+----------+



## 6. Average Rating per Project

In [0]:
# Mean rating of the employees assigned to each project.
avg_rating = avg("Rating").alias("AvgRating")
rating_by_project = df_joined.groupBy("Project").agg(avg_rating)
rating_by_project.show()

+----------------+------------------+
|         Project|         AvgRating|
+----------------+------------------+
|       HR Portal|               4.3|
|   Data Platform|               4.6|
|Campaign Tracker|3.8499999999999996|
|     ML Pipeline|               4.7|
+----------------+------------------+



## 7. Add NULL row

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Define the schema explicitly: the only Rating value in the new row is None,
# so there is nothing for Spark to infer that column's type from.
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Rating", DoubleType(), True)
])

# Single-row DataFrame holding an employee whose rating is missing.
new_row = [("Meena", 2023, None)]
df_new = spark.createDataFrame(new_row, schema)

# Append the row to the existing ratings. unionByName matches columns by name,
# so a future column reorder in either frame cannot silently mix values the way
# the position-based union() would.
df_perf_null = df_perf.unionByName(df_new)
df_perf_null.show()


## 8. Filter NULL

In [0]:
# Show only the performance rows whose rating is missing.
rating_is_missing = col("Rating").isNull()
df_perf_null.filter(rating_is_missing).show()

## 9. Replace NULL with Department Avg

In [0]:
# Attach department (and salary) to every rating. The left join keeps rating
# rows for people who have no employee record; their Department is NULL.
df_perf_joined = df_perf_null.join(df_emp, "Name", "left")

# Mean rating per department; avg() ignores NULL ratings.
avg_dept = df_perf_joined.groupBy("Department").agg(avg("Rating").alias("DeptAvg"))

# Fallback for rows the department average cannot fill: a NULL Department never
# matches in the equi-join below, and a department whose ratings are all NULL
# has a NULL average. Without this, such rows would keep their NULL rating.
overall_avg = df_perf_joined.agg(avg("Rating").alias("OverallAvg")).first()["OverallAvg"]

# Keep an existing rating; otherwise use the department average, and as a last
# resort the overall average (still NULL only if no rating exists at all).
df_perf_filled = (
    df_perf_joined.join(avg_dept, "Department", "left")
    .withColumn(
        "Rating",
        coalesce(col("Rating"), col("DeptAvg"), lit(overall_avg).cast("double")),
    )
    .drop("DeptAvg")
)
df_perf_filled.show()

## 10. Performance Category

In [0]:
# Bucket ratings into three bands. Conditions are evaluated in order, so the
# 4.7 threshold must come before the 4.0 one; anything matching neither
# condition falls through to "Average".
rating = col("Rating")
performance_category = (
    when(rating >= 4.7, "Excellent")
    .when(rating >= 4.0, "Good")
    .otherwise("Average")
)
df_perf_filled.withColumn("PerformanceCategory", performance_category).show()

## 11. Bonus Using UDF

In [0]:
def bonus(hours):
    """Return the bonus amount for the given number of hours worked.

    Pays 10000 for more than 200 hours and 5000 otherwise. A missing value
    (None, which is what a Python UDF receives for a SQL NULL) yields None
    instead of raising TypeError on the comparison.
    """
    if hours is None:
        return None
    return 10000 if hours > 200 else 5000


# Wrap the Python function as a Spark UDF that returns an integer column.
bonus_udf = udf(bonus, IntegerType())
df_joined.withColumn("Bonus", bonus_udf(col("HoursWorked"))).select("Name", "HoursWorked", "Bonus").show()

## 12. Join Date and Months Worked

In [0]:
# Every employee is given the same hard-coded join date.
join_date = lit("2021-06-01").cast(DateType())

# Months elapsed between the join date and today, cast to an integer.
months_worked = months_between(current_date(), col("JoinDate")).cast("int")

df_emp.withColumn("JoinDate", join_date).withColumn("MonthsWorked", months_worked).show()

## 13. Count Joined Before 2022

In [0]:
# Tag every employee with the same hard-coded join date, then count those who
# joined before 1 January 2022. The count is the cell's last expression, so the
# notebook displays it.
emp_with_join_date = df_emp.withColumn("JoinDate", lit("2021-06-01").cast("date"))
joined_before_2022 = emp_with_join_date.filter(col("JoinDate") < "2022-01-01")
joined_before_2022.count()

---------------------------------------------------------------------------
PySparkValueError                         Traceback (most recent call last)
File <command-2760075282785116>, line 2
      1 from pyspark.sql import Row
----> 2 df_perf_null = df_perf.union(spark.createDataFrame([Row(Name="Meena", Year=2023, Rating=None)]))
      3 df_perf_null.show()

File /databricks/spark/python/pyspark/instrumentation_utils.py:47, in _wrap_function.<locals>.wrapper(*args, **kwargs)
     45 start = time.perf_counter()
     46 try

## 14. Union with Extra Employees

In [0]:
# Two additional employees, laid out in the same column order as df_emp.
extra = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000),
]
df_extra = spark.createDataFrame(data=extra, schema=columns_emp)

# union() pairs columns by position, which is why df_extra reuses columns_emp.
df_union = df_emp.union(df_extra)
df_union.show()

## 15. Save Final Join to Partitioned Parquet

In [0]:
# Rename columns to avoid ambiguity
# Rename columns to avoid ambiguity.
# withColumnRenamed is a no-op when the source column is absent, so re-running
# this cell after df_emp has already been renamed is harmless.
df_emp = df_emp.withColumnRenamed("Salary", "Emp_Salary")

# df_perf_clean was used here without ever being defined in this notebook,
# which raises NameError on a fresh kernel. Derive it from the original
# performance frame (Name, Year, Rating): it has no Department column, so the
# joins below cannot produce a duplicate, and one row per employee.
# NOTE(review): confirm df_perf is the intended source for the "clean" ratings.
df_perf_clean = df_perf.withColumnRenamed("Salary", "Perf_Salary")

# Join the DataFrames cleanly: left joins keep every employee even when a
# rating or a project assignment is missing.
df_final = df_emp.join(
    df_perf_clean,
    on="Name",
    how="left"
).join(
    df_proj,
    on="Name",
    how="left"
)

# Confirm only one 'Department' exists now (select would fail on an ambiguous column).
df_final.select(
    "Name",
    "Department",
    "Emp_Salary",
    "Rating",
    "Project",
    "HoursWorked"
).show()

# Save as partitioned Parquet file: one sub-directory per Department value,
# replacing any previous output at this path.
df_final.write.mode("overwrite").partitionBy("Department").parquet("/tmp/final_employee_data")