In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("AggrGrouping").getOrCreate()


employee_data = [
    ("Ananya", "Engineering", 60000),
    ("Rahul", "HR", 50000),
    ("Priya", "Engineering", 70000),
    ("Zoya", "Marketing", 45000),
    ("Karan", "Engineering", 75000),
    ("Naveen", "HR", 52000),
    ("Fatima", "Marketing", 48000)
]
columns_emp = ["Name", "Department", "Salary"]
df_emp = spark.createDataFrame(employee_data, columns_emp)

# Performance data
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)


#GroupBy and Aggregations

In [3]:
from pyspark.sql.functions import *

# 1. Get the average salary by department.
df_emp.groupBy("Department").agg(avg("Salary").alias("Avg_Salary")).show()


+-----------+-----------------+
| Department|       Avg_Salary|
+-----------+-----------------+
|Engineering|68333.33333333333|
|         HR|          51000.0|
|  Marketing|          46500.0|
+-----------+-----------------+



In [4]:
# 2. Count of employees per department.
df_emp.groupBy("Department").agg(count("*").alias("Employee_Count")).show()


+-----------+--------------+
| Department|Employee_Count|
+-----------+--------------+
|Engineering|             3|
|         HR|             2|
|  Marketing|             2|
+-----------+--------------+



In [5]:
# 3. Maximum and minimum salary in Engineering.
df_emp.filter(col("Department") == "Engineering") .agg(max("Salary").alias("Max_Salary"), min("Salary").alias("Min_Salary")) .show()


+----------+----------+
|Max_Salary|Min_Salary|
+----------+----------+
|     75000|     60000|
+----------+----------+



 # Join and Combine Data

In [6]:
# 4. Perform an inner join between employee_data and performance_data on Name.
df_joined = df_emp.join(df_perf, on="Name", how="inner")
df_joined.show()


+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|Engineering| 60000|2023|   4.5|
|Fatima|  Marketing| 48000|2023|   3.9|
| Karan|Engineering| 75000|2023|   4.1|
|Naveen|         HR| 52000|2023|   4.7|
| Priya|Engineering| 70000|2023|   4.3|
| Rahul|         HR| 50000|2023|   4.9|
|  Zoya|  Marketing| 45000|2023|   3.8|
+------+-----------+------+----+------+



In [7]:
# 5. Show each employee’s salary and performance rating.
df_joined.select("Name", "Salary", "Rating").show()


+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 60000|   4.5|
|Fatima| 48000|   3.9|
| Karan| 75000|   4.1|
|Naveen| 52000|   4.7|
| Priya| 70000|   4.3|
| Rahul| 50000|   4.9|
|  Zoya| 45000|   3.8|
+------+------+------+



In [13]:
# 6. Filter employees with rating > 4.5 and salary > 60000.
df_joined.filter((col("Rating") > 4.5) & (col("Salary") > 60000)).show()


+----+----------+------+----+------+
|Name|Department|Salary|Year|Rating|
+----+----------+------+----+------+
+----+----------+------+----+------+



#Window & Rank (Bonus Challenge)

In [14]:
# 7. Rank employees by salary department-wise.
windowSpec = Window.partitionBy("Department").orderBy(col("Salary").desc())
df_emp.withColumn("Salary_Rank", rank().over(windowSpec)).show()


+------+-----------+------+-----------+
|  Name| Department|Salary|Salary_Rank|
+------+-----------+------+-----------+
| Karan|Engineering| 75000|          1|
| Priya|Engineering| 70000|          2|
|Ananya|Engineering| 60000|          3|
|Naveen|         HR| 52000|          1|
| Rahul|         HR| 50000|          2|
|Fatima|  Marketing| 48000|          1|
|  Zoya|  Marketing| 45000|          2|
+------+-----------+------+-----------+



In [16]:
from pyspark.sql.functions import sum

# 8. Calculate cumulative salary in each department.
windowSpecCum = Window.partitionBy("Department").orderBy("Salary") \
                      .rowsBetween(Window.unboundedPreceding, 0)
df_emp.withColumn("Cumulative_Salary", sum("Salary").over(windowSpecCum)).show()


+------+-----------+------+-----------------+
|  Name| Department|Salary|Cumulative_Salary|
+------+-----------+------+-----------------+
|Ananya|Engineering| 60000|            60000|
| Priya|Engineering| 70000|           130000|
| Karan|Engineering| 75000|           205000|
| Rahul|         HR| 50000|            50000|
|Naveen|         HR| 52000|           102000|
|  Zoya|  Marketing| 45000|            45000|
|Fatima|  Marketing| 48000|            93000|
+------+-----------+------+-----------------+



#Date Operations

In [17]:
# 9. Add a new column JoinDate with random dates between 2020 and 2023.
from datetime import datetime, timedelta
import random
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType

def random_date():
    start_date = datetime(2020, 1, 1)
    end_date = datetime(2023, 12, 31)
    return start_date + timedelta(days=random.randint(0, (end_date - start_date).days))

random_date_udf = udf(random_date, DateType())
df_emp = df_emp.withColumn("JoinDate", random_date_udf())
df_emp.show()


+------+-----------+------+----------+
|  Name| Department|Salary|  JoinDate|
+------+-----------+------+----------+
|Ananya|Engineering| 60000|2021-05-02|
| Rahul|         HR| 50000|2023-08-16|
| Priya|Engineering| 70000|2020-06-25|
|  Zoya|  Marketing| 45000|2020-12-10|
| Karan|Engineering| 75000|2022-05-07|
|Naveen|         HR| 52000|2020-01-08|
|Fatima|  Marketing| 48000|2022-07-21|
+------+-----------+------+----------+



In [18]:
# 10. Add column YearsWithCompany using current_date() and datediff().
df_emp = df_emp.withColumn("YearsWithCompany", (datediff(current_date(), col("JoinDate")) / 365).cast("int"))
df_emp.show()


+------+-----------+------+----------+----------------+
|  Name| Department|Salary|  JoinDate|YearsWithCompany|
+------+-----------+------+----------+----------------+
|Ananya|Engineering| 60000|2022-03-30|               3|
| Rahul|         HR| 50000|2021-05-24|               4|
| Priya|Engineering| 70000|2021-07-03|               3|
|  Zoya|  Marketing| 45000|2023-05-20|               2|
| Karan|Engineering| 75000|2020-06-21|               4|
|Naveen|         HR| 52000|2021-05-26|               4|
|Fatima|  Marketing| 48000|2023-03-18|               2|
+------+-----------+------+----------+----------------+



#Writing to Files

In [19]:
# 11. Write the full employee DataFrame to CSV with headers.
df_emp.write.option("header", True).csv("output/employee_csv")


In [20]:
# 12. Save the joined DataFrame to a Parquet file.
df_joined.write.parquet("output/joined_data.parquet")
