In [0]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window 

In [0]:
spark = SparkSession.builder.appName("Revision").getOrCreate()

In [0]:
%sql
CREATE TABLE com_worker ( worker_id BIGINT , department VARCHAR(25), first_name VARCHAR(25), last_name VARCHAR(25), joining_date DATE, salary BIGINT);
INSERT INTO com_worker (worker_id, department, first_name, last_name, joining_date, salary) VALUES  (1, 'HR', 'John', 'Doe', '2020-01-15', 50000), (2, 'IT', 'Jane', 'Smith', '2019-03-10', 60000), (3, 'Finance', 'Emily', 'Jones', '2021-06-20', 75000), (4, 'Sales', 'Michael', 'Brown', '2018-09-05', 60000), (5, 'Marketing', 'Chris', 'Johnson', '2022-04-12', 70000), (6, 'IT', 'David', 'Wilson', '2020-11-01', 80000), (7, 'Finance', 'Sarah', 'Taylor', '2017-05-25', 45000), (8, 'HR', 'James', 'Anderson', '2023-01-09', 65000), (9, 'Sales', 'Anna', 'Thomas', '2020-02-18', 55000), (10, 'Marketing', 'Robert', 'Jackson', '2021-07-14', 60000);


num_affected_rows,num_inserted_rows
10,10


You have been asked to find the fifth highest salary without using TOP or LIMIT. Note: Duplicate salaries should not be removed.

In [0]:
%sql
with ranked_data as 
(select salary, dense_rank() over (order by salary desc) as rank
from com_worker
)
select salary, rank
from ranked_data
where rank = 5;

salary,rank
60000,5
60000,5
60000,5


In [0]:
%sql
CREATE TABLE sf_exchange_rate ( date DATE, exchange_rate FLOAT, source_currency VARCHAR(10), target_currency VARCHAR(10));
INSERT INTO sf_exchange_rate (date, exchange_rate, source_currency, target_currency) VALUES ('2020-01-15', 1.1, 'EUR', 'USD'), ('2020-01-15', 1.3, 'GBP', 'USD'), ('2020-02-05', 1.2, 'EUR', 'USD'), ('2020-02-05', 1.35, 'GBP', 'USD'), ('2020-03-25', 1.15, 'EUR', 'USD'), ('2020-03-25', 1.4, 'GBP', 'USD'), ('2020-04-15', 1.2, 'EUR', 'USD'), ('2020-04-15', 1.45, 'GBP', 'USD'), ('2020-05-10', 1.1, 'EUR', 'USD'), ('2020-05-10', 1.3, 'GBP', 'USD'), ('2020-06-05', 1.05, 'EUR', 'USD'), ('2020-06-05', 1.25, 'GBP', 'USD');
CREATE TABLE sf_sales_amount ( currency VARCHAR(10), sales_amount BIGINT, sales_date DATE);
INSERT INTO sf_sales_amount (currency, sales_amount, sales_date) VALUES ('USD', 1000, '2020-01-15'), ('EUR', 2000, '2020-01-20'), ('GBP', 1500, '2020-02-05'), ('USD', 2500, '2020-02-10'), ('EUR', 1800, '2020-03-25'), ('GBP', 2200, '2020-03-30'), ('USD', 3000, '2020-04-15'), ('EUR', 1700, '2020-04-20'), ('GBP', 2000, '2020-05-10'), ('USD', 3500, '2020-05-25'), ('EUR', 1900, '2020-06-05'), ('GBP', 2100, '2020-06-10');


num_affected_rows,num_inserted_rows
12,12


In [0]:
%sql
select * from sf_exchange_rate

date,exchange_rate,source_currency,target_currency
2020-01-15,1.1,EUR,USD
2020-01-15,1.3,GBP,USD
2020-02-05,1.2,EUR,USD
2020-02-05,1.35,GBP,USD
2020-03-25,1.15,EUR,USD
2020-03-25,1.4,GBP,USD
2020-04-15,1.2,EUR,USD
2020-04-15,1.45,GBP,USD
2020-05-10,1.1,EUR,USD
2020-05-10,1.3,GBP,USD


In [0]:
%sql
select * from sf_sales_amount

currency,sales_amount,sales_date
USD,1000,2020-01-15
EUR,2000,2020-01-20
GBP,1500,2020-02-05
USD,2500,2020-02-10
EUR,1800,2020-03-25
GBP,2200,2020-03-30
USD,3000,2020-04-15
EUR,1700,2020-04-20
GBP,2000,2020-05-10
USD,3500,2020-05-25


You work for a multinational company that wants to calculate total sales across all their countries they do business in.
You have 2 tables, one is a record of sales for all countries and currencies the company deals with, and the other holds currency exchange rate information. Calculate the total sales, per quarter, for the first 2 quarters in 2020, and report the sales in USD currency.

In [0]:
%sql
select QUARTER (sales.sales_date)as QuareterSales, 
round(sum(sales.sales_amount* rate.exchange_rate),2) as TotalSales
from sf_exchange_rate as rate
join sf_sales_amount as sales
on rate.source_currency = sales.currency
and rate.target_currency= "USD"
and sales.sales_date = rate.date
where sales.sales_date >= "2020-01-01" and sales.sales_date <= "2020-06-30"
group by QUARTER(sales.sales_date)


QuareterSales,TotalSales
1,4095.0
2,4595.0


In [0]:
%sql

DROP TABLE IF EXISTS workers;
DROP TABLE IF EXISTS titles;

CREATE TABLE workers (
    department STRING,
    first_name STRING,
    joining_date DATE,
    last_name STRING,
    salary BIGINT,
    worker_id BIGINT
);
INSERT INTO workers (department, first_name, joining_date, last_name, salary, worker_id)
VALUES  
    ('HR', 'Alice', '2020-01-15', 'Smith', 60000, 1), 
    ('Engineering', 'Bob', '2019-03-10', 'Johnson', 80000, 2), 
    ('Sales', 'Charlie', '2021-07-01', 'Brown', 50000, 3), 
    ('Engineering', 'David', '2018-12-20', 'Wilson', 90000, 4), 
    ('Marketing', 'Emma', '2020-06-30', 'Taylor', 70000, 5);

CREATE TABLE titles (
    affected_from DATE,
    worker_ref_id BIGINT,
    worker_title STRING
);
INSERT INTO titles (affected_from, worker_ref_id, worker_title)
VALUES  
    ('2020-01-15', 1, 'HR Manager'), 
    ('2019-03-10', 2, 'Software Engineer'), 
    ('2021-07-01', 3, 'Sales Representative'), 
    ('2018-12-20', 4, 'Engineering Manager'), 
    ('2020-06-30', 5, 'Marketing Specialist'), 
    ('2022-01-01', 5, 'Marketing Manager');


num_affected_rows,num_inserted_rows
6,6


In [0]:
%sql
select * from workers


department,first_name,joining_date,last_name,salary,worker_id
HR,Alice,2020-01-15,Smith,60000,1
Engineering,Bob,2019-03-10,Johnson,80000,2
Sales,Charlie,2021-07-01,Brown,50000,3
Engineering,David,2018-12-20,Wilson,90000,4
Marketing,Emma,2020-06-30,Taylor,70000,5


In [0]:
%sql
select *  from titles

affected_from,worker_ref_id,worker_title
2020-01-15,1,HR Manager
2019-03-10,2,Software Engineer
2021-07-01,3,Sales Representative
2018-12-20,4,Engineering Manager
2020-06-30,5,Marketing Specialist
2022-01-01,5,Marketing Manager


Find all employees who have or had a job title that includes manager.
Output the first name along with the corresponding title.

In [0]:
%sql
select w.first_name , t.worker_title
from workers w
join titles t
on w.worker_id = t.worker_ref_id
where lower(t.worker_title) like "%manager%"


first_name,worker_title
Alice,HR Manager
David,Engineering Manager
Emma,Marketing Manager
