In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from datetime import date, timedelta
import random

# 1. SparkSession
# Obtain the session every later cell uses via the `spark` name; the builder
# is configured with this notebook's application name.
spark = (
    SparkSession.builder
    .appName("AdvancedEmployeeAnalysis")
    .getOrCreate()
)


In [0]:

# 2. Dataset 1: employee_data
# Column names, in the same order as the fields of each tuple below.
columns_emp = ["Name", "Department", "Salary"]

# One (name, department, salary) tuple per employee.
employee_data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]

# Column types are left for Spark to infer from the Python values.
df_emp = spark.createDataFrame(data=employee_data, schema=columns_emp)


In [0]:

# 3. Dataset 2: performance_data
# Column names, in the same order as the fields of each tuple below.
columns_perf = ["Name", "Year", "Rating"]

# One (name, year, rating) tuple per employee.
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]

# Column types are left for Spark to infer from the Python values.
df_perf = spark.createDataFrame(data=performance, schema=columns_perf)


In [0]:

# GroupBy and Aggregations
# NOTE: avg/max/min below are the pyspark.sql.functions column aggregates
# brought in by the star import, not Python's built-ins.
employees_by_department = df_emp.groupBy("Department")

# Average salary per department.
employees_by_department.agg(avg("Salary").alias("AvgSalary")).show()

# Headcount per department.
employees_by_department.count().show()

# Highest and lowest salary among Engineering employees only.
engineering_only = df_emp.where(col("Department") == "Engineering")
engineering_only.agg(
    max("Salary").alias("MaxSalary"),
    min("Salary").alias("MinSalary"),
).show()


+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|         HR|  52500.0|
|Engineering|  65000.0|
|  Marketing|  46500.0|
+-----------+---------+

+-----------+-----+
| Department|count|
+-----------+-----+
|         HR|    2|
|Engineering|    3|
|  Marketing|    2|
+-----------+-----+

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



In [0]:

# Join and Combine
# Inner join on Name: only employees present in both frames are kept.
df_joined = df_emp.join(df_perf, "Name", "inner")

# Salary next to rating for every matched employee.
df_joined.select(col("Name"), col("Salary"), col("Rating")).show()

# Employees rated strictly above 4.5 who also earn strictly more than 60000.
rated_above_threshold = col("Rating") > 4.5
paid_above_threshold = col("Salary") > 60000
df_joined.where(rated_above_threshold & paid_above_threshold).show()


+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



In [0]:

# Window & Rank
# Per-department window, highest salary first. The cumulative-salary cell
# reuses this same window specification.
window_rank = Window.partitionBy("Department").orderBy(desc("Salary"))

# Append each employee's salary rank within their department.
salary_rank = rank().over(window_rank).alias("Rank")
df_emp.select("*", salary_rank).show()


+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
|Naveen|Engineering| 70000|   1|
| Rahul|Engineering| 65000|   2|
| Priya|Engineering| 60000|   3|
| Karan|         HR| 53000|   1|
|Ananya|         HR| 52000|   2|
|  Zoya|  Marketing| 48000|   1|
|Fatima|  Marketing| 45000|   2|
+------+-----------+------+----+



In [0]:

# Cumulative Salary
# Running total of salaries within each department, highest salary first.
#
# An ordered window with no explicit frame defaults to
# RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW. Under a RANGE frame, rows
# with equal Salary are peers, so every tied employee would get the same total
# (already including all of the ties). Pinning a ROWS frame makes the total
# advance one row at a time, which is what a cumulative column should do.
# NOTE: `sum` here is pyspark.sql.functions.sum (star import), not the built-in.
window_running_total = window_rank.rowsBetween(
    Window.unboundedPreceding, Window.currentRow
)
df_emp.withColumn(
    "CumulativeSalary",
    sum("Salary").over(window_running_total),
).show()


+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
|Naveen|Engineering| 70000|           70000|
| Rahul|Engineering| 65000|          135000|
| Priya|Engineering| 60000|          195000|
| Karan|         HR| 53000|           53000|
|Ananya|         HR| 52000|          105000|
|  Zoya|  Marketing| 48000|           48000|
|Fatima|  Marketing| 45000|           93000|
+------+-----------+------+----------------+



In [0]:

# Date Operations
def random_date(start=date(2020, 1, 1), end=date(2023, 12, 31), rng=None):
    """Return a random calendar date in the inclusive range [start, end].

    Calling ``random_date()`` with no arguments behaves as before: a date
    between 2020-01-01 and 2023-12-31 drawn from the module-level ``random``
    generator.

    Args:
        start: Earliest date that may be returned.
        end: Latest date that may be returned.
        rng: Optional ``random.Random`` instance to draw from. When omitted,
            the module-level ``random`` generator is used, so ``random.seed``
            controls the result.

    Returns:
        A ``datetime.date`` ``d`` with ``start <= d <= end``.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError(f"end ({end}) must not be earlier than start ({start})")
    source = random if rng is None else rng
    # randint is inclusive on both ends, so `end` itself can be returned.
    span_days = (end - start).days
    return start + timedelta(days=source.randint(0, span_days))

# Create random join dates
# Seed the module-level generator that random_date() draws from, so that
# re-running this cell assigns the same JoinDate to every employee instead of
# a fresh random one each time.
JOIN_DATE_SEED = 42
random.seed(JOIN_DATE_SEED)
random_dates = [(name, random_date()) for name, _, _ in employee_data]
df_dates = spark.createDataFrame(random_dates, ["Name", "JoinDate"])

df_with_dates = df_emp.join(df_dates, on="Name")

# Tenure in years, to one decimal place. Dividing the day count by 365 is an
# approximation that ignores leap days. The value is measured against
# current_date(), so it still changes with the day the cell is run.
# NOTE: `round` here is pyspark.sql.functions.round (star import), not the built-in.
df_with_dates.withColumn(
    "YearsWithCompany",
    round(datediff(current_date(), col("JoinDate")) / 365, 1)
).show()


+------+-----------+------+----------+----------------+
|  Name| Department|Salary|  JoinDate|YearsWithCompany|
+------+-----------+------+----------+----------------+
|Ananya|         HR| 52000|2021-02-10|             4.3|
|Fatima|  Marketing| 45000|2023-03-24|             2.2|
| Karan|         HR| 53000|2022-12-29|             2.5|
|Naveen|Engineering| 70000|2020-05-16|             5.1|
| Priya|Engineering| 60000|2020-08-25|             4.8|
| Rahul|Engineering| 65000|2020-06-15|             5.0|
|  Zoya|  Marketing| 48000|2021-12-03|             3.5|
+------+-----------+------+----------+----------------+



In [0]:

# Saving to Files
# Both writes use "overwrite" mode, so re-running the cell replaces whatever
# is already at the target path.
csv_export_path = "/tmp/employee_data_export"
parquet_export_path = "/tmp/employee_perf_joined"

# 1. CSV with headers
(
    df_emp.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save(csv_export_path)
)

# 2. Parquet file
(
    df_joined.write
    .format("parquet")
    .mode("overwrite")
    .save(parquet_export_path)
)
