In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("Employee_data").getOrCreate()

In [0]:
data = [
("Ananya", "HR", 52000),
("Rahul", "Engineering", 65000),
("Priya", "Engineering", 60000),
("Zoya", "Marketing", 48000),
("Karan", "HR", 53000),
("Naveen", "Engineering", 70000),
("Fatima", "Marketing", 45000)
]
columns = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)

In [0]:
performance = [
("Ananya", 2023, 4.5),
("Rahul", 2023, 4.9),
("Priya", 2023, 4.3),
("Zoya", 2023, 3.8),
("Karan", 2023, 4.1),
("Naveen", 2023, 4.7),
("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)



In [0]:
# GroupBy and Aggregations
from pyspark.sql.functions import *
# 1. Get the average salary by department.
df.groupBy("Department").avg("Salary").withColumnRenamed("avg(Salary)", "AverageSalary").show()

# 2. Count of employees per department.
df.groupBy("Department").agg(count("*").alias("NoOfEmployees")).show()

# 3. Maximum and minimum salary in Engineering.
df.filter(col("Department") == "Engineering").agg(max("Salary").alias("MaxSalary"), min("Salary").alias("MinSalary")).show()

+-----------+-------------+
| Department|AverageSalary|
+-----------+-------------+
|         HR|      52500.0|
|Engineering|      65000.0|
|  Marketing|      46500.0|
+-----------+-------------+

+-----------+-------------+
| Department|NoOfEmployees|
+-----------+-------------+
|         HR|            2|
|Engineering|            3|
|  Marketing|            2|
+-----------+-------------+

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



In [0]:
# Join and Combine Data
# 4. Perform an inner join between employee_data and performance_data on Name .
df_joined = df.join(df_perf, on="Name", how="inner")

# 5. Show each employee’s salary and performance rating.
df_joined.select("Name", "Salary", "Rating").show()

# 6. Filter employees with rating > 4.5 and salary > 60000.
df_joined.filter((col("Rating") > 4.5) & (col("Salary") > 60000)).show()


+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



In [0]:
# Window & Rank (Bonus Challenge)
# 7. Rank employees by salary department-wise.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, sum as _sum, col

_rank = Window.partitionBy("Department").orderBy(col("Salary").desc())
df_ranked = df.withColumn("Rank", rank().over(_rank))
df_ranked.select("Name", "Department", "Salary", "Rank").show()


# 8. Calculate cumulative salary in each department.
cumsum = Window.partitionBy("Department").orderBy("Salary").rowsBetween(Window.unboundedPreceding, Window.currentRow)

df.withColumn("CumulativeSalary", _sum("Salary").over(cumsum)).show()


+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
|Naveen|Engineering| 70000|   1|
| Rahul|Engineering| 65000|   2|
| Priya|Engineering| 60000|   3|
| Karan|         HR| 53000|   1|
|Ananya|         HR| 52000|   2|
|  Zoya|  Marketing| 48000|   1|
|Fatima|  Marketing| 45000|   2|
+------+-----------+------+----+

+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
| Priya|Engineering| 60000|           60000|
| Rahul|Engineering| 65000|          125000|
|Naveen|Engineering| 70000|          195000|
|Ananya|         HR| 52000|           52000|
| Karan|         HR| 53000|          105000|
|Fatima|  Marketing| 45000|           45000|
|  Zoya|  Marketing| 48000|           93000|
+------+-----------+------+----------------+



In [0]:
# Date Operations
# 9. Add a new column JoinDate with random dates between 2020 and 2023.
from pyspark.sql.functions import expr

# Add JoinDate with random days added to a base date (2020-01-01)
joindate = df.withColumn(
    "JoinDate",expr("date_add(date('2020-01-01'), cast(rand() * 1460 as int))")
)
joindate.show(truncate=False)

# 10. Add column YearsWithCompany using current_date() and datediff() .
from pyspark.sql.functions import current_date, datediff, round

years = df_with_joindate.withColumn(
    "YearsWithCompany",round(datediff(current_date(), col("JoinDate")) / 365.0, 1)
)
years.show()



+------+-----------+------+----------+
|Name  |Department |Salary|JoinDate  |
+------+-----------+------+----------+
|Ananya|HR         |52000 |2022-11-29|
|Rahul |Engineering|65000 |2022-09-05|
|Priya |Engineering|60000 |2021-09-14|
|Zoya  |Marketing  |48000 |2023-05-05|
|Karan |HR         |53000 |2020-11-02|
|Naveen|Engineering|70000 |2022-11-27|
|Fatima|Marketing  |45000 |2022-04-06|
+------+-----------+------+----------+

+------+-----------+------+----------+----------------+
|  Name| Department|Salary|  JoinDate|YearsWithCompany|
+------+-----------+------+----------+----------------+
|Ananya|         HR| 52000|2023-10-26|             1.6|
| Rahul|Engineering| 65000|2023-07-11|             1.9|
| Priya|Engineering| 60000|2020-07-27|             4.9|
|  Zoya|  Marketing| 48000|2023-10-18|             1.6|
| Karan|         HR| 53000|2020-01-19|             5.4|
|Naveen|Engineering| 70000|2021-01-25|             4.4|
|Fatima|  Marketing| 45000|2021-06-11|             4.0|
+------+--

In [0]:
# Writing to Files
# 11. Write the full employee DataFrame to CSV with headers.
years.write.option("header", True).csv("employee_details.csv")

# 12. Save the joined DataFrame to a Parquet file.
df_joined.write.parquet("employee_performance.parquet")
