In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, max, min, when, rank, sum as _sum, current_date, datediff
from pyspark.sql.window import Window
from pyspark.sql.functions import lit, expr
import random
from datetime import datetime, timedelta

# Single Spark entry point shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("AdvancedEmployeeData")
    .getOrCreate()
)

# Employee dataset: one (Name, Department, Salary) tuple per employee.
columns = ["Name", "Department", "Salary"]
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
df_emp = spark.createDataFrame(data, schema=columns)

# Performance dataset: one (Name, Year, Rating) tuple per employee review.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
df_perf = spark.createDataFrame(performance, schema=columns_perf)
# Define the random date generator function
def random_date(start_date=None, end_date=None, rng=None):
    """Return a random calendar date formatted as ``YYYY-MM-DD``.

    The date is drawn uniformly from the whole days between ``start_date``
    and ``end_date``, both ends inclusive.

    Args:
        start_date: Earliest ``datetime`` that may be returned.
            Defaults to 2020-01-01.
        end_date: Latest ``datetime`` that may be returned.
            Defaults to 2023-12-31.
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` generator is used, so calling
            ``random.seed(...)`` beforehand makes the result reproducible.

    Returns:
        The chosen date as a ``"%Y-%m-%d"`` string.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    if start_date is None:
        start_date = datetime(2020, 1, 1)
    if end_date is None:
        end_date = datetime(2023, 12, 31)
    if end_date < start_date:
        raise ValueError("end_date must not be earlier than start_date")

    span_days = (end_date - start_date).days
    source = random if rng is None else rng
    # randint is inclusive on both ends, so end_date itself can be picked.
    offset = source.randint(0, span_days)
    return (start_date + timedelta(days=offset)).strftime("%Y-%m-%d")

**GroupBy and Aggregations**


In [2]:
# 1. Mean salary for each department, shown as AverageSalary
(
    df_emp
    .groupBy("Department")
    .agg(avg("Salary").alias("AverageSalary"))
    .show()
)

+-----------+-------------+
| Department|AverageSalary|
+-----------+-------------+
|Engineering|      65000.0|
|         HR|      52500.0|
|  Marketing|      46500.0|
+-----------+-------------+



In [3]:
# 2. Headcount per department (counts the Name values in each group)
(
    df_emp
    .groupBy("Department")
    .agg(count("Name").alias("EmployeeCount"))
    .show()
)

+-----------+-------------+
| Department|EmployeeCount|
+-----------+-------------+
|Engineering|            3|
|         HR|            2|
|  Marketing|            2|
+-----------+-------------+



In [4]:
# 3. Highest and lowest salary within the Engineering department.
# Note: max/min here are the pyspark.sql.functions aggregates imported at the
# top of the notebook, not the Python builtins.
is_engineering = col("Department") == "Engineering"
df_emp.where(is_engineering).agg(
    max("Salary").alias("MaxSalary"),
    min("Salary").alias("MinSalary"),
).show()

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



**Join and Combine Data**

In [5]:
# 4. Combine employee and performance records; keep only names present in both
df_joined = df_emp.join(df_perf, ["Name"], "inner")

In [6]:
# 5. Salary and performance rating for every employee
(
    df_joined
    .select("Name", "Salary", "Rating")
    .show()
)

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+



In [7]:
# 6. Employees with rating above 4.5 AND salary above 60000.
# Two chained filters are equivalent to a single `&` predicate.
(
    df_joined
    .filter(col("Rating") > 4.5)
    .filter(col("Salary") > 60000)
    .show()
)

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



**Window & Rank**

In [8]:
# 7. Salary rank inside each department (1 = highest paid)
salary_desc = col("Salary").desc()
windowDept = Window.partitionBy("Department").orderBy(salary_desc)

salary_rank = rank().over(windowDept)
df_emp.withColumn("SalaryRank", salary_rank).show()

+------+-----------+------+----------+
|  Name| Department|Salary|SalaryRank|
+------+-----------+------+----------+
|Naveen|Engineering| 70000|         1|
| Rahul|Engineering| 65000|         2|
| Priya|Engineering| 60000|         3|
| Karan|         HR| 53000|         1|
|Ananya|         HR| 52000|         2|
|  Zoya|  Marketing| 48000|         1|
|Fatima|  Marketing| 45000|         2|
+------+-----------+------+----------+



In [9]:
# 8. Running salary total per department, accumulated in ascending salary order.
# The frame spans from the first row of the partition up to the current row.
windowCumSum = (
    Window
    .partitionBy("Department")
    .orderBy("Salary")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

running_total = _sum("Salary").over(windowCumSum)
df_emp.withColumn("CumulativeSalary", running_total).show()

+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
| Priya|Engineering| 60000|           60000|
| Rahul|Engineering| 65000|          125000|
|Naveen|Engineering| 70000|          195000|
|Ananya|         HR| 52000|           52000|
| Karan|         HR| 53000|          105000|
|Fatima|  Marketing| 45000|           45000|
|  Zoya|  Marketing| 48000|           93000|
+------+-----------+------+----------------+



**Date Operations**

In [13]:
# 9. Add JoinDate (random dates between 2020 and 2023)
from pyspark.sql import Row

# Seed the module-level generator so a fresh "Restart & Run All" produces the
# same join dates every time; without it every run yields a different dataset.
random.seed(42)

# One random join date per employee record.
join_dates = [random_date() for _ in data]

# Pair each (Name, Department, Salary) tuple with its join date.
rows_with_dates = [
    Row(Name=name, Department=department, Salary=salary, JoinDate=join_date)
    for (name, department, salary), join_date in zip(data, join_dates)
]

# Create new DataFrame with JoinDate (still a "YYYY-MM-DD" string at this point).
df_with_dates = spark.createDataFrame(rows_with_dates)
df_with_dates.show()



+------+-----------+------+----------+
|  Name| Department|Salary|  JoinDate|
+------+-----------+------+----------+
|Ananya|         HR| 52000|2021-02-10|
| Rahul|Engineering| 65000|2023-03-26|
| Priya|Engineering| 60000|2022-05-02|
|  Zoya|  Marketing| 48000|2020-07-09|
| Karan|         HR| 53000|2023-09-29|
|Naveen|Engineering| 70000|2021-08-08|
|Fatima|  Marketing| 45000|2021-09-12|
+------+-----------+------+----------+



In [14]:
from pyspark.sql.functions import to_date, current_date, datediff, months_between

# Parse the "YYYY-MM-DD" strings into a proper date column.
df_with_dates = df_with_dates.withColumn("JoinDate", to_date("JoinDate", "yyyy-MM-dd"))

# Completed years of service. Dividing a day count by 365 ignores leap days and
# can roll the year over before the real anniversary; months_between is
# calendar-aware, so the value only increases on the anniversary date itself.
df_with_dates = df_with_dates.withColumn(
    "YearsWithCompany",
    (months_between(current_date(), col("JoinDate")) / 12).cast("int"),
)
df_with_dates.show()


+------+-----------+------+----------+----------------+
|  Name| Department|Salary|  JoinDate|YearsWithCompany|
+------+-----------+------+----------+----------------+
|Ananya|         HR| 52000|2021-02-10|               4|
| Rahul|Engineering| 65000|2023-03-26|               2|
| Priya|Engineering| 60000|2022-05-02|               3|
|  Zoya|  Marketing| 48000|2020-07-09|               4|
| Karan|         HR| 53000|2023-09-29|               1|
|Naveen|Engineering| 70000|2021-08-08|               3|
|Fatima|  Marketing| 45000|2021-09-12|               3|
+------+-----------+------+----------+----------------+



In [15]:
# 10. YearsWithCompany
from pyspark.sql.functions import months_between, to_date

# JoinDate may already have been parsed by an earlier cell; re-applying to_date
# keeps this cell runnable on its own when the column is still a string.
df_final = df_with_dates.withColumn("JoinDate", to_date("JoinDate", "yyyy-MM-dd"))

# Completed years of service. months_between is calendar-aware, so unlike
# datediff(...) / 365 it does not drift with leap years and only increments on
# the actual anniversary.
df_final = df_final.withColumn(
    "YearsWithCompany",
    (months_between(current_date(), col("JoinDate")) / 12).cast("int"),
)

**Writing to Files**

In [16]:
# 11. Export the full employee DataFrame as CSV with a header row.
# Any previous output at the target path is replaced.
# NOTE(review): Spark's CSV writer normally creates a directory of part files
# at this path rather than one single file — confirm downstream readers expect that.
(
    df_final.write
    .option("header", True)
    .mode("overwrite")
    .csv("/content/employee_data_output.csv")
)

In [17]:
# 12. Persist the employee + performance join as Parquet.
# Any previous output at the target path is replaced.
(
    df_joined.write
    .mode("overwrite")
    .parquet("/content/employee_performance.parquet")
)
