# Self Join
## Find employees who earn more than their managers

In [0]:
%sql
-- Work inside the `workspace` catalog for the rest of the notebook
USE CATALOG workspace;

-- Make sure the practice schema is there (no-op when it already exists)
CREATE SCHEMA IF NOT EXISTS sql_pyspark_practice;

-- Make it the current schema so later cells can use unqualified table names
USE SCHEMA sql_pyspark_practice;

In [0]:
%sql
-- Employee fixture for the self-join exercise.
-- manager_id holds the emp_id of another row in this same table.
CREATE OR REPLACE TABLE emp (
    emp_id     INT,
    emp_name   VARCHAR(10),
    salary     INT,
    manager_id INT
);

-- Single multi-row insert with an explicit column list, so the statement does
-- not silently depend on the column order of the table definition.
INSERT INTO emp (emp_id, emp_name, salary, manager_id) VALUES
    (1, 'Ankit',  10000, 4),
    (2, 'Mohit',  15000, 5),
    (3, 'Vikas',  10000, 4),
    (4, 'Rohit',   5000, 2),
    (5, 'Mudit',  12000, 6),
    (6, 'Agam',   12000, 2),
    (7, 'Sanjay',  9000, 2),
    (8, 'Ashish',  5000, 2);

-- Show the loaded rows in a stable order for easy checking.
SELECT
    emp_id,
    emp_name,
    salary,
    manager_id
FROM emp
ORDER BY emp_id;

## SQL Solution

In [0]:
%sql
-- Self join: `e` is the employee row, `m` is the row of that employee's manager
-- (matched on e.manager_id = m.emp_id). Keep only employees who earn strictly
-- more than their manager.
SELECT
    e.emp_id   AS employee_id,
    e.emp_name AS employee_name,
    e.salary   AS employee_salary,
    m.emp_name AS manager_name,
    m.salary   AS manager_salary
FROM emp AS e
INNER JOIN emp AS m
    ON e.manager_id = m.emp_id
WHERE e.salary > m.salary;

## PySpark Solution

In [0]:
%python

from pyspark.sql import functions as F

# Read the source table
df = spark.table("emp")
display(df)

# Self join: the same DataFrame plays two roles, so give each role its own alias.
df_emp = df.alias("e")  # employee side
df_mgr = df.alias("m")  # manager side

# Always refer to columns through the alias-qualified names ("e.<col>" / "m.<col>").
# Column objects such as df_emp.salary and df_mgr.salary both originate from the
# same underlying DataFrame, so they may not reliably tell the two sides apart
# in a self join; the qualified names are unambiguous.
df_joined = (
    df_emp.join(df_mgr, F.col("e.manager_id") == F.col("m.emp_id"), "inner")
    # Filter before projecting, while both aliases are still in scope.
    .where(F.col("e.salary") > F.col("m.salary"))
    # Rename the outputs so the result has no duplicate column names and
    # matches the column names produced by the SQL solution.
    .select(
        F.col("e.emp_id").alias("employee_id"),
        F.col("e.emp_name").alias("employee_name"),
        F.col("e.salary").alias("employee_salary"),
        F.col("m.emp_name").alias("manager_name"),
        F.col("m.salary").alias("manager_salary"),
    )
)
display(df_joined)

## Learnings
- Learnt how to perform a self join in PySpark
- Learnt how to use the `where` condition in PySpark