In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window


# Obtain the notebook's SparkSession (application name 'Read'); getOrCreate()
# hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName('Read')
    .getOrCreate()
)

In [0]:
# Employee table schema: four nullable columns. `departmentId` is the column
# later joined against the department table's `id`.
emp_schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('salary', IntegerType(), True)
    .add('departmentId', IntegerType(), True)
)

# Sample rows, each a tuple of (id, name, salary, departmentId).
Employee = [
    (1, 'Joe', 85000, 1),
    (2, 'Henry', 80000, 2),
    (3, 'Sam', 60000, 2),
    (4, 'Max', 90000, 1),
    (5, 'Janet', 69000, 1),
    (6, 'Randy', 85000, 1),
    (7, 'Will', 70000, 1),
]

# Materialise the rows as a DataFrame and render it in the notebook.
df1 = spark.createDataFrame(data=Employee, schema=emp_schema)
df1.display()


# Department table schema: nullable integer `id` and nullable string `name`.
dept_schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
)

# Sample rows, each a tuple of (id, name).
Department = [
    (1, 'IT'),
    (2, 'Sales'),
]

# Materialise the rows as a DataFrame and render it in the notebook.
df2 = spark.createDataFrame(data=Department, schema=dept_schema)
df2.display()

id,name,salary,departmentId
1,Joe,85000,1
2,Henry,80000,2
3,Sam,60000,2
4,Max,90000,1
5,Janet,69000,1
6,Randy,85000,1
7,Will,70000,1


id,name
1,IT
2,Sales


A company's executives are interested in seeing who earns the most money in each of the company's departments. A high earner in a department is an employee who has a salary in the top three unique salaries for that department.

Write a solution to find the employees who are high earners in each of the departments.

Return the result table in any order.

In [0]:
# Expose both DataFrames to Spark SQL as session-scoped temporary views so the
# SQL solution can query them by name.
df1.createOrReplaceTempView(name="Employee")
df2.createOrReplaceTempView(name="Department")

In [0]:
# Spark SQL solution: high earners are employees whose salary is among the top
# three *distinct* salaries of their department.
#
# - DENSE_RANK gives tied salaries the same rank and leaves no gaps, so
#   "rank <= 3" means "one of the three highest distinct salaries".
# - The view names are spelled exactly as registered (Employee / Department) so
#   the query does not depend on case-insensitive identifier resolution.
# - The rank column is aliased `salary_rank` instead of `rank` to avoid reusing
#   the name of the built-in rank() window function; it is internal to the CTE
#   and is not part of the result.
result_df = spark.sql("""
    WITH ranked_salary AS (
        SELECT e.name   AS employee_name,
               d.name   AS department_name,
               e.salary,
               DENSE_RANK() OVER (
                   PARTITION BY e.departmentId
                   ORDER BY e.salary DESC
               ) AS salary_rank
        FROM Employee e
        JOIN Department d
          ON e.departmentId = d.id
    )
    SELECT department_name, employee_name, salary
    FROM ranked_salary
    WHERE salary_rank <= 3
""")

result_df.display()

department_name,employee_name,salary
IT,Max,90000
IT,Joe,85000
IT,Randy,85000
IT,Will,70000
Sales,Henry,80000
Sales,Sam,60000


In [0]:



# DataFrame-API solution: employees whose salary is among the top three
# distinct salaries of their department.

# Join employees to their department. `departmentId` is carried through the
# projection so that ranking can be partitioned by the department key rather
# than by its display name (two departments sharing a name must not be ranked
# together); this matches the SQL solution, which partitions by departmentId.
joined_df = df1.join(df2, df1.departmentId == df2.id, "inner") \
            .select(
                df1.name.alias("employee_name"),
                df2.name.alias("department_name"),
                df1.salary,
                df1.departmentId,
            )

# Highest salary first within each department.
window_spec = Window.partitionBy("departmentId").orderBy(desc("salary"))

# dense_rank gives equal salaries the same rank with no gaps, so rank <= 3
# selects the three highest distinct salaries. The helper key column is dropped
# once ranking is done, leaving employee_name, department_name, salary, rank.
df_ranked = joined_df \
            .withColumn("rank", dense_rank().over(window_spec)) \
            .drop("departmentId")

# col("rank") is used instead of attribute access (df_ranked.rank), which can
# be shadowed by DataFrame attributes of the same name.
df_top_3 = df_ranked.filter(col("rank") <= 3)

df_top_3.show()


+-------------+---------------+------+----+
|employee_name|department_name|salary|rank|
+-------------+---------------+------+----+
|          Max|             IT| 90000|   1|
|          Joe|             IT| 85000|   2|
|        Randy|             IT| 85000|   2|
|         Will|             IT| 70000|   3|
|        Henry|          Sales| 80000|   1|
|          Sam|          Sales| 60000|   2|
+-------------+---------------+------+----+

