In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W

In [0]:
# Build (or reuse) the Spark session that backs every cell below.
session_builder = SparkSession.builder.appName("NB-1")
spark = session_builder.getOrCreate()
# Bare trailing expression so the notebook renders the session summary.
spark

# **Dataset**

In [0]:
# Employee roster: one (Name, Department, Salary) tuple per employee.
columns = ["Name", "Department", "Salary"]
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
# Column names come from `columns`; Spark infers the column types from the tuples.
df = spark.createDataFrame(data, schema=columns)

In [0]:
# Performance reviews: one (Name, Year, Rating) tuple per employee.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
# Column names come from `columns_perf`; Spark infers the column types from the tuples.
df_perf = spark.createDataFrame(performance, schema=columns_perf)

# **Groupby and Aggregations**

In [0]:
# 1. Get the average salary by department.
# Name the aggregate expression first, then apply it per department.
mean_salary = F.mean("Salary").alias("Average Salary")
salary_by_department = df.groupBy("Department").agg(mean_salary)
salary_by_department.show()

+-----------+--------------+
| Department|Average Salary|
+-----------+--------------+
|         HR|       52500.0|
|Engineering|       65000.0|
|  Marketing|       46500.0|
+-----------+--------------+



In [0]:
# 2. Count of employees per department.
# Counts non-null Name values within each department group.
headcount = F.count("Name").alias("Employee count")
headcount_by_department = df.groupBy("Department").agg(headcount)
headcount_by_department.show()

+-----------+--------------+
| Department|Employee count|
+-----------+--------------+
|         HR|             2|
|Engineering|             3|
|  Marketing|             2|
+-----------+--------------+



In [0]:
# 3. Maximum and minimum salary in Engineering.
# Restrict to the Engineering rows, then aggregate over the whole filtered frame.
engineering = df.where(F.col("Department") == "Engineering")
salary_bounds = [
    F.min("Salary").alias("Min Salary"),
    F.max("Salary").alias("Max Salary"),
]
engineering.agg(*salary_bounds).show()

+----------+----------+
|Min Salary|Max Salary|
+----------+----------+
|     60000|     70000|
+----------+----------+



# **Join and Combine Data**

In [0]:
# 4. Perform an inner join between employee_data and performance_data on Name.
# Joining on the column name (not an expression) keeps a single Name column in the result.
df_joined = df.join(df_perf, "Name", "inner")
df_joined.show()

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
| Rahul|Engineering| 65000|2023|   4.9|
| Priya|Engineering| 60000|2023|   4.3|
|  Zoya|  Marketing| 48000|2023|   3.8|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
|Fatima|  Marketing| 45000|2023|   3.9|
+------+-----------+------+----+------+



In [0]:
# 5. Show each employee's salary and performance rating.
# Project only the three columns of interest from the joined frame.
salary_and_rating = df_joined.select("Name", "Salary", "Rating")
salary_and_rating.show()

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
| Rahul| 65000|   4.9|
| Priya| 60000|   4.3|
|  Zoya| 48000|   3.8|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
|Fatima| 45000|   3.9|
+------+------+------+



In [0]:
# 6. Filter employees with rating > 4.5 and salary > 60000.
# Both conditions must hold; each comparison is parenthesised because `&`
# binds tighter than `>` in Python.
is_top_rated = F.col("Rating") > 4.5
is_well_paid = F.col("Salary") > 60_000
(
    df_joined
    .select("Name", "Salary", "Rating")
    .where((is_well_paid) & (is_top_rated))
    .show()
)

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
| Rahul| 65000|   4.9|
|Naveen| 70000|   4.7|
+------+------+------+



# **Window & Rank**

In [0]:
# 7. Rank employees by salary department-wise.
# One window per department, highest salary first, so rank 1 is the top earner.
emps = W.partitionBy("Department").orderBy(F.col("Salary").desc())
salary_rank = F.rank().over(emps)
df.withColumn("Rank", salary_rank).show()

+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
|Naveen|Engineering| 70000|   1|
| Rahul|Engineering| 65000|   2|
| Priya|Engineering| 60000|   3|
| Karan|         HR| 53000|   1|
|Ananya|         HR| 52000|   2|
|  Zoya|  Marketing| 48000|   1|
|Fatima|  Marketing| 45000|   2|
+------+-----------+------+----+



In [0]:
# 8. Calculate cumulative salary in each department.
# The frame runs from the first row of the department (ascending salary)
# up to and including the current row, giving a running total.
salry = (
    W.partitionBy("Department")
    .orderBy("Salary")
    .rowsBetween(W.unboundedPreceding, W.currentRow)
)
running_total = F.sum("Salary").over(salry)
df.withColumn("Cumulative income", running_total).show()

+------+-----------+------+-----------------+
|  Name| Department|Salary|Cumulative income|
+------+-----------+------+-----------------+
| Priya|Engineering| 60000|            60000|
| Rahul|Engineering| 65000|           125000|
|Naveen|Engineering| 70000|           195000|
|Ananya|         HR| 52000|            52000|
| Karan|         HR| 53000|           105000|
|Fatima|  Marketing| 45000|            45000|
|  Zoya|  Marketing| 48000|            93000|
+------+-----------+------+-----------------+



# **Date Operations**

In [0]:
# 9. Add a new column JoinDate with random dates between 2020 and 2023.
# 2020 is a leap year, so 2020-01-01..2023-12-31 spans 366 + 3 * 365 = 1461 days.
# floor(rand * 1461) yields offsets 0..1460, which makes 2023-12-31 reachable
# (the previous 365 * 4 = 1460 stopped one day short, at 2023-12-30).
join_window_days = 366 + 3 * 365
# Fixed seed so a fresh run draws from the same random sequence; the exact
# per-row values can still vary if the DataFrame is partitioned differently.
join_date_seed = 42
df = df.withColumn("JoinDate", F.date_add(
    F.to_date(F.lit("2020-01-01")),
    F.floor(F.rand(seed=join_date_seed) * join_window_days).cast("int")
))
df.show()

+------+-----------+------+----------+
|  Name| Department|Salary|  JoinDate|
+------+-----------+------+----------+
|Ananya|         HR| 52000|2022-12-14|
| Rahul|Engineering| 65000|2020-04-26|
| Priya|Engineering| 60000|2023-08-26|
|  Zoya|  Marketing| 48000|2020-04-18|
| Karan|         HR| 53000|2023-06-26|
|Naveen|Engineering| 70000|2022-06-06|
|Fatima|  Marketing| 45000|2020-10-02|
+------+-----------+------+----------+



In [0]:
# 10. Add column YearsWithCompany using current_date() and datediff().
# Tenure is approximated as elapsed days / 365 (leap days are ignored) and
# rounded to the nearest whole year. The column is named YearsWithCompany to
# match the task statement (it was previously misspelled YearWithCompany).
# F.datediff is used instead of the newer F.date_diff alias so the cell also
# runs on Spark versions that predate that alias.
days_employed = F.datediff(F.current_date(), F.col("JoinDate"))
df.withColumn("YearsWithCompany", F.round(days_employed / 365)).show()

+------+-----------+------+----------+---------------+
|  Name| Department|Salary|  JoinDate|YearWithCompany|
+------+-----------+------+----------+---------------+
|Ananya|         HR| 52000|2022-12-14|            2.0|
| Rahul|Engineering| 65000|2020-04-26|            5.0|
| Priya|Engineering| 60000|2023-08-26|            2.0|
|  Zoya|  Marketing| 48000|2020-04-18|            5.0|
| Karan|         HR| 53000|2023-06-26|            2.0|
|Naveen|Engineering| 70000|2022-06-06|            3.0|
|Fatima|  Marketing| 45000|2020-10-02|            5.0|
+------+-----------+------+----------+---------------+



# **Writing to Files**

In [0]:
# 11. Write the full employee DataFrame to CSV with headers.
# "overwrite" replaces any existing output at the target path, so the cell can be re-run.
csv_writer = df.write.mode("overwrite").option("header", "true")
csv_writer.csv("employee")

In [0]:
# 12. Save the joined DataFrame to a Parquet file.
# "overwrite" replaces any existing output at the target path, so the cell can be re-run.
parquet_writer = df_joined.write.mode("overwrite")
parquet_writer.parquet("df_joined")