In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, count, when, rank, sum as _sum
from pyspark.sql.functions import current_date, datediff, lit
from pyspark.sql.window import Window
from pyspark.sql.types import DateType
import random
from datetime import date, timedelta
spark = SparkSession.builder.appName("AdvancedEmployeeAnalytics").getOrCreate()
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000)
]
columns = ["Name", "Department", "Salary"]
df_emp = spark.createDataFrame(data, columns)
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)


GroupBy and Aggregations

In [0]:
df_emp.groupBy("Department").agg(avg("Salary").alias("AvgSalary")).show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|         HR|  52500.0|
|Engineering|  65000.0|
|  Marketing|  46500.0|
+-----------+---------+



In [0]:
df_emp.groupBy("Department").agg(count("*").alias("EmployeeCount")).show()

+-----------+-------------+
| Department|EmployeeCount|
+-----------+-------------+
|         HR|            2|
|Engineering|            3|
|  Marketing|            2|
+-----------+-------------+



In [0]:
df_emp.filter(col("Department") == "Engineering") \
      .agg(max("Salary").alias("MaxSalary"), min("Salary").alias("MinSalary")).show()

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



Join and Combine Data

In [0]:
df_joined = df_emp.join(df_perf, on="Name", how="inner")

In [0]:
df_joined.select("Name", "Salary", "Rating").show()


+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+



In [0]:
df_joined.filter((col("Rating") > 4.5) & (col("Salary") > 60000)).show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



 Bonus Challenge: Window & Rank

In [0]:
window_dept = Window.partitionBy("Department").orderBy(col("Salary").desc())
df_ranked = df_emp.withColumn("SalaryRank", rank().over(window_dept))
df_ranked.select("Name", "Department", "Salary", "SalaryRank").show()

+------+-----------+------+----------+
|  Name| Department|Salary|SalaryRank|
+------+-----------+------+----------+
|Naveen|Engineering| 70000|         1|
| Rahul|Engineering| 65000|         2|
| Priya|Engineering| 60000|         3|
| Karan|         HR| 53000|         1|
|Ananya|         HR| 52000|         2|
|  Zoya|  Marketing| 48000|         1|
|Fatima|  Marketing| 45000|         2|
+------+-----------+------+----------+



In [0]:
df_cum = df_emp.withColumn("CumulativeSalary", _sum("Salary").over(window_dept.rowsBetween(Window.unboundedPreceding, Window.currentRow)))
df_cum.select("Name", "Department", "Salary", "CumulativeSalary").show()

+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
|Naveen|Engineering| 70000|           70000|
| Rahul|Engineering| 65000|          135000|
| Priya|Engineering| 60000|          195000|
| Karan|         HR| 53000|           53000|
|Ananya|         HR| 52000|          105000|
|  Zoya|  Marketing| 48000|           48000|
|Fatima|  Marketing| 45000|           93000|
+------+-----------+------+----------------+



Date Operations

In [0]:
from pyspark.sql.functions import to_date
def random_date(start_year=2020, end_year=2023):
    start = date(start_year, 1, 1)
    end = date(end_year, 12, 31)
    delta = end - start
    return start + timedelta(days=random.randint(0, delta.days))
join_dates = [random_date() for _ in range(df_emp.count())]
df_dates = spark.createDataFrame([(d,) for d in join_dates], ["JoinDate"]).withColumn("JoinDate", col("JoinDate").cast(DateType()))
df_emp = df_emp.withColumn("row_id", lit(1)).join(df_dates.withColumn("row_id", lit(1)), "row_id").drop("row_id")



In [0]:
df_emp = df_emp.withColumn("YearsWithCompany", (datediff(current_date(), col("JoinDate")) / 365).cast("int"))
df_emp.select("Name", "JoinDate", "YearsWithCompany").show()

+------+----------+----------------+
|  Name|  JoinDate|YearsWithCompany|
+------+----------+----------------+
|Ananya|2023-11-24|               1|
|Ananya|2023-09-21|               1|
|Ananya|2022-02-22|               3|
|Ananya|2021-05-09|               4|
|Ananya|2020-03-14|               5|
|Ananya|2022-10-08|               2|
|Ananya|2020-12-19|               4|
| Rahul|2023-11-24|               1|
| Priya|2023-11-24|               1|
| Rahul|2023-09-21|               1|
| Rahul|2022-02-22|               3|
| Priya|2023-09-21|               1|
| Priya|2022-02-22|               3|
| Rahul|2021-05-09|               4|
| Rahul|2020-03-14|               5|
| Priya|2021-05-09|               4|
| Priya|2020-03-14|               5|
| Rahul|2022-10-08|               2|
| Rahul|2020-12-19|               4|
| Priya|2022-10-08|               2|
+------+----------+----------------+
only showing top 20 rows



Writing to Files

In [0]:
df_emp.write.mode("overwrite").option("header", "true").csv("/tmp/employee_data_csv")


In [0]:

df_joined.write.mode("overwrite").parquet("/tmp/employee_performance_parquet")