In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

In [0]:
# Sample employee records, one tuple per employee: (id, name, salary, dept, gender).
emp_data = [
    (1, 'manish', 50000, 'IT', 'm'),
    (2, 'vikash', 60000, 'sales', 'm'),
    (3, 'raushan', 70000, 'marketing', 'm'),
    (4, 'mukesh', 80000, 'IT', 'm'),
    (5, 'priti', 90000, 'sales', 'f'),
    (6, 'nikita', 45000, 'marketing', 'f'),
    (7, 'ragini', 55000, 'marketing', 'f'),
    (8, 'rashi', 100000, 'IT', 'f'),
    (9, 'aditya', 65000, 'IT', 'm'),
    (10, 'rahul', 50000, 'marketing', 'm'),
    (11, 'rakhi', 50000, 'IT', 'f'),
    (12, 'akhilesh', 90000, 'sales', 'm'),
]

# Only column names are given; the column types come from the tuple values.
emp_schema = ["id", "name", "salary", "dept", "gender"]

emp_df = spark.createDataFrame(data=emp_data, schema=emp_schema)

# Expose the DataFrame to the %sql cells under the name emp_table.
emp_df.createOrReplaceTempView("emp_table")

emp_df.show()

### **Department wise total salary**

In [0]:
# Collapse to one row per department holding the summed salary.
(
    emp_df
    .groupBy("dept")
    .sum("salary")
    .show()
)

In [0]:
from pyspark.sql.window import Window

### **ROW NUMBER**

In [0]:
# Number employees within each department, highest salary first.
# The ordering is descending so this cell produces the same numbering as the
# `order by salary desc` SQL version of the query.
window = Window.partitionBy("dept").orderBy(desc("salary"))
emp_df.withColumn("row_number", row_number().over(window)).show()

In [0]:
%sql
-- Sequential position of every employee inside their department, highest salary first.
select
  id,
  name,
  salary,
  dept,
  gender,
  row_number() over (partition by dept order by salary desc) as row_number
from emp_table

### **RANK**

In [0]:
# Rank employees within each department, highest salary first; tied salaries
# share a rank and the following rank is skipped.
# The ordering is descending so this cell agrees with the `order by salary desc`
# SQL version of the query.
window = Window.partitionBy("dept").orderBy(desc("salary"))
emp_df.withColumn("rank", rank().over(window)).show()

In [0]:
%sql
-- Rank of every employee inside their department, highest salary first (ties share a rank).
select
  id,
  name,
  salary,
  dept,
  gender,
  rank() over (partition by dept order by salary desc) as rank
from emp_table

### **DENSE RANK**

In [0]:
# Dense-rank employees within each department, highest salary first; tied
# salaries share a rank and no rank values are skipped afterwards.
# The ordering is descending so this cell agrees with the `order by salary desc`
# SQL version of the query.
window = Window.partitionBy("dept").orderBy(desc("salary"))
emp_df.withColumn("dense_rank", dense_rank().over(window)).show()

In [0]:
%sql
-- Dense rank of every employee inside their department, highest salary first (no gaps after ties).
select
  id,
  name,
  salary,
  dept,
  gender,
  dense_rank() over (partition by dept order by salary desc) as dense_rank
from emp_table

### **MULTIPLE PARTITION BY COLUMNS**

In [0]:
# Each (gender, dept) pair forms its own partition, ordered highest salary first.
window = Window.partitionBy("gender", "dept").orderBy(col("salary").desc())

# Append the three ranking columns in one projection so they can be compared side by side.
emp_df.select(
    "*",
    row_number().over(window).alias("row_number"),
    rank().over(window).alias("rank"),
    dense_rank().over(window).alias("dense_rank"),
).show()

In [0]:
%sql
-- All three ranking functions share one window: partitioned by (gender, dept), highest salary first.
select
  id,
  name,
  salary,
  gender,
  dept,
  row_number() over w as row_number,
  rank()       over w as rank,
  dense_rank() over w as dense_rank
from emp_table
window w as (partition by gender, dept order by salary desc)

### **TOP 2 HIGHEST-PAID EMPLOYEES OF EACH DEPARTMENT**

In [0]:
# Rank employees inside each department, highest salary first, and keep all
# three ranking columns so the two filters below can be compared.
window = Window.partitionBy("dept").orderBy(desc("salary"))
ranked_df = (
    emp_df
    .withColumn("row_number", row_number().over(window))
    .withColumn("rank", rank().over(window))
    .withColumn("dense_rank", dense_rank().over(window))
)

# dense_rank <= 2 keeps everyone on the two highest salary levels, so a tie
# can return more than two rows for a department.
ranked_df.where(col("dense_rank") <= 2).show()

# row_number is unique within a partition, so this keeps at most two rows per department.
ranked_df.where(col("row_number") <= 2).show()

In [0]:
%sql
-- Number rows per department by salary (highest first), then keep the first two of each.
with ranked as (
  select
    id,
    name,
    dept,
    salary,
    row_number() over (partition by dept order by salary desc) as r_no
  from emp_table
)
select id, name, dept, salary
from ranked
where r_no <= 2

### **APPLY SUM OVER EACH WINDOW -- GET DEPT-WISE TOTAL ON EACH ROW**

In [0]:
# The window has no ordering, so the aggregate covers the whole department and
# every row receives its department's total.
window = Window.partitionBy("dept")

# `sum` here is the Spark column function brought in by the star import, not the Python builtin.
new_emp_df = emp_df.select(
    "*",
    sum("salary").over(window).alias("Total_salary"),
)
new_emp_df.show()

In [0]:
%sql
-- Join-based equivalent: aggregate once per department, then attach the total to each employee row.
with dept_totals as (
  select dept, sum(salary) as total_sal
  from emp_table
  group by dept
)
select
  e.id,
  e.name,
  e.dept,
  e.salary,
  a.total_sal
from emp_table e
join dept_totals a
  on e.dept = a.dept
order by e.dept

### **GET % OF SALARY GIVEN TO EACH EMPLOYEE OF THEIR DEPARTMENT**

In [0]:
# Department total on every row, then each employee's share of it as a percentage.
window = Window.partitionBy("dept")
new_emp_df = (
    emp_df
    .withColumn("Total_salary", sum("salary").over(window))
    .withColumn("% sal of dept", col("salary") / col("Total_salary") * 100)
)
new_emp_df.show()

In [0]:
%sql
-- Per-department totals joined back to each employee to compute their percentage share.
with dept_total_sal as (
  select dept, sum(salary) as total_sal
  from emp_table
  group by dept
)
select
  e.id,
  e.name,
  e.dept,
  e.salary,
  a.total_sal,
  (e.salary / a.total_sal) * 100 as prcnt_of_sal
from emp_table e
join dept_total_sal a
  on e.dept = a.dept
order by e.dept

### **GET EMPLOYEES WHOSE SAL <= 25% OF TOTAL SAL OF THEIR DEPARTMENT**

In [0]:
# Attach the department total to each row, then keep employees whose salary is
# at most a quarter of that total (the comparison is inclusive).
window = Window.partitionBy("dept")
new_emp_df = (
    emp_df
    .withColumn("Total_salary", sum("salary").over(window))
    .where(col("salary") <= col("Total_salary") * 0.25)
)
new_emp_df.show()

In [0]:
%sql
-- Employees earning at most 25% (inclusive) of their department's total salary.
with dept_total_sal as (
  select dept, sum(salary) as total_sal
  from emp_table
  group by dept
)
select
  e.id,
  e.name,
  e.dept,
  e.salary,
  a.total_sal
from emp_table e
join dept_total_sal a
  on e.dept = a.dept
where e.salary <= 0.25 * a.total_sal
order by e.dept

### **GET SECOND HIGHEST SALARIED PERSON**

In [0]:
# Global ranking: there is no partitionBy, so all rows are ranked together in one window.
window = Window.orderBy(desc("salary"))

# dense_rank == 2 returns every employee tied on the second-highest salary.
emp_df.withColumn("dense_rank",dense_rank().over(window)).filter(col("dense_rank") == 2).show()

## if we don't want same rank both then we can use row_number
# Ordering by salary alone leaves the order of tied rows undefined, so the row
# numbered 2 could differ between runs. Breaking ties on id makes the pick repeatable.
row_number_window = Window.orderBy(desc("salary"), asc("id"))
emp_df.withColumn("row_number",row_number().over(row_number_window)).filter(col("row_number") == 2).show()

In [0]:
%sql
-- Second row when all employees are ordered by salary, highest first.
-- id is a secondary sort key so that tied salaries are numbered the same way on every run.
select * from (
  select
    id,
    name,
    salary,
    dept,
    gender,
    row_number() over (order by salary desc, id) r_no
  from emp_table
) a
where r_no = 2

### **GET SECOND HIGHEST SALARIED PERSON IN EACH DEPARTMENT**

In [0]:
# Dense-rank salaries inside each department (highest first) and keep rank 2;
# employees tied on the second-highest salary are all returned.
window = Window.partitionBy("dept").orderBy(col("salary").desc())
(
    emp_df
    .withColumn("dense_rank", dense_rank().over(window))
    .where(col("dense_rank") == 2)
    .show()
)

In [0]:
%sql
-- Employees on the second-highest salary level of each department (ties included).
with ranked as (
  select
    id,
    name,
    salary,
    dept,
    gender,
    dense_rank() over (partition by dept order by salary desc) dense_rank
  from emp_table
)
select *
from ranked
where dense_rank = 2