In [1]:
from pyspark.sql import SparkSession


In [2]:
# Create (or reuse) the Spark session shared by every cell in this notebook.
# `spark.jars` points at postgresql-42.2.20.jar (a relative path), presumably the
# PostgreSQL JDBC driver needed by the `jdbc` reads further down.
spark = SparkSession.builder.appName("Working_With_Multiple_Tables").config(
    "spark.jars", "postgresql-42.2.20.jar"
).getOrCreate()

In [3]:
def connect_database_to_read_file(database, table, user, password, name_file, session=None):
    """Load a PostgreSQL table over JDBC and register it as a temporary view.

    Parameters
    ----------
    database : str
        Database name; appended to ``jdbc:postgresql://localhost:5432/``.
    table : str
        Value passed as the JDBC ``dbtable`` option.
    user, password : str
        Credentials passed as JDBC options.
    name_file : str
        Name of the temporary view to create or replace.
    session : optional
        Spark session used for the read. Defaults to the notebook-level ``spark``.

    Returns
    -------
    The loaded DataFrame, after it has been registered under ``name_file``.
    """
    active_session = spark if session is None else session
    data = (
        active_session.read.format("jdbc")
        .option("url", "jdbc:postgresql://localhost:5432/{}".format(database))
        .option("dbtable", table)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .load()
    )
    # createOrReplaceTempView returns None, so returning its result (as before)
    # left every caller's variable set to None. Register the view first, then
    # return the DataFrame itself.
    data.createOrReplaceTempView(name_file)
    return data


In [4]:
# Register table `emp` of database `sqla` as the temp view `emp` used by the
# spark.sql queries in this notebook.
# NOTE(review): the credentials are hard-coded here; consider reading them from
# the environment instead.
emp_data = connect_database_to_read_file(
    database="sqla",
    table="emp",
    user="postgres",
    password="admin123",
    name_file="emp",
)


In [5]:
# Register table `t1` of database `sqla` as the temp view `t1`.
t1_data = connect_database_to_read_file(
    database="sqla",
    table="t1",
    user="postgres",
    password="admin123",
    name_file="t1",
)

In [6]:
# Register table `dept` of database `sqla` as the temp view `dept`.
dept_data = connect_database_to_read_file(
    database="sqla",
    table="dept",
    user="postgres",
    password="admin123",
    name_file="dept",
)

In [7]:
# Display every row of the `dept` temp view to see the department data the
# later queries work with.
spark.sql(
    """
    select *
    from dept
    """
).show()

+------+----------+--------+
|deptno|     dname|     loc|
+------+----------+--------+
|    10|ACCOUNTING|NEW YORK|
|    20|  RESEARCH|  DALLAS|
|    30|     SALES| CHICAGO|
|    40|OPERATIONS|  BOSTON|
+------+----------+--------+



In [8]:
# Stack rows from several tables with UNION ALL: the names of department-10
# employees, a separator row, then every department name. Each branch returns
# two columns (a name and a deptno) so the results line up.
# The separator is selected from `t1`; this yields one separator per row of t1
# (the output shows one) — presumably t1 is a single-row helper table, TODO confirm.
spark.sql(
    """
    select ename as ename_and_dname, deptno
    from emp
    where deptno = 10
    union all
    select '----------', null
    from t1
    union all 
    select dname, deptno
    from dept
    """
).show()

+---------------+------+
|ename_and_dname|deptno|
+---------------+------+
|          CLARK|    10|
|           KING|    10|
|         MILLER|    10|
|     ----------|  null|
|     ACCOUNTING|    10|
|       RESEARCH|    20|
|          SALES|    30|
|     OPERATIONS|    40|
+---------------+------+



In [9]:
# UNION vs. UNION ALL: plain UNION removes duplicate rows, so each deptno found
# in dept or emp is listed once — including the null deptno that comes from emp.
spark.sql(
    """
    select deptno
    from dept
    union  
    select deptno
    from emp
    """
).show()

+------+
|deptno|
+------+
|  null|
|    10|
|    30|
|    20|
|    40|
+------+



In [10]:
# Problem
# You want to return rows from multiple tables by joining on a known common
# column, or on columns that share common values.

# For example: display the names of all employees in department 10 together with
# the location of each employee's department. That data is stored in two separate
# tables (emp and dept).
# This cell only lists the full `emp` table; the join itself is in the next cell.
spark.sql(
    """
    select *
    from emp
    """
).show()

+-----+--------+---------+----+----------+-------+-------+------+
|empno|   ename|      job| mgr|  hiredate|    sal|   comm|deptno|
+-----+--------+---------+----+----------+-------+-------+------+
| 7499|   ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|
| 7521|    WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|
| 7654|  MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|
| 7698|   BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|    30|
| 7782|   CLARK|  MANAGER|7839|1981-06-09|2450.00|   null|    10|
| 7839|    KING|PRESIDENT|null|1981-11-17|5000.00|   null|    10|
| 7844|  TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|
| 7900|   JAMES|    CLERK|7698|1981-12-03| 950.00|   null|    30|
| 7934|  MILLER|    CLERK|7782|1982-01-23|1300.00|   null|    10|
|    1|Jonathan|   Editor|null|      null|   null|   null|  null|
| 7369|   SMITH|    CLERK|7902|1980-12-17| 880.00|   null|    20|
| 7566|   JONES|  MANAGER|7839|1981-04-02|3272.50|   null|    20|
| 7788|   

In [11]:
# Inner (equi-)join written in comma style: the WHERE clause carries both the
# join predicate (d.deptno = e.deptno) and the department filter (e.deptno = 10).
spark.sql(
    """
    select e.ename, d.loc
    from dept as d, emp as e
    where e.deptno = 10 and d.deptno = e.deptno 
    """
).show()

# For comparison (left commented out): the same query WITHOUT the join predicate.
# It pairs each department-10 employee with every row of dept, i.e. a Cartesian
# product, which is why the join condition above is required.
# spark.sql(
#     """
#     select e.ename, d.loc, e.deptno as emp_deptno, d.deptno as dept_detno
#     from emp as e, dept as d
#     where e.deptno = 10
#     """
# ).show()

+------+--------+
| ename|     loc|
+------+--------+
| CLARK|NEW YORK|
|  KING|NEW YORK|
|MILLER|NEW YORK|
+------+--------+



In [12]:
# Build the "view V" for the multi-condition join: name, job and salary of every
# clerk in emp.
# The `emp` temp view registered earlier in this notebook already wraps the JDBC
# read of the emp table, so reuse it rather than repeating the connection options
# and credentials inline.
data = spark.table("emp")

job_clerk = data.select("ename", "job", "sal").filter("job == 'CLERK'")
job_clerk.createOrReplaceTempView("job_clerk")


In [13]:
# Join the job_clerk view back to emp using several join conditions at once
# (job, ename and sal must all match) to pick up each clerk's empno and deptno.
spark.sql(
    """
    select e.empno, jc.ename, jc.job, jc.sal, e.deptno
    from job_clerk as jc, emp as e
    where jc.job = e.job and jc.ename = e.ename and jc.sal = e.sal
    
    """
).show()

+-----+------+-----+-------+------+
|empno| ename|  job|    sal|deptno|
+-----+------+-----+-------+------+
| 7876| ADAMS|CLERK|1210.00|    20|
| 7900| JAMES|CLERK| 950.00|    30|
| 7369| SMITH|CLERK| 880.00|    20|
| 7934|MILLER|CLERK|1300.00|    10|
+-----+------+-----+-------+------+



In [14]:
# Retrieving values from one table that do not exist in another
# Problem
# You want to find which departments (if any) in table DEPT do not exist in
# table EMP. In the example data, DEPTNO 40 from DEPT does not appear in EMP.
# EXCEPT returns the deptno values of dept minus those that occur in emp.
spark.sql(
    """
    select deptno
    from dept
    except
    select deptno
    from emp
    """
).show()

+------+
|deptno|
+------+
|    40|
+------+



In [15]:
# Problem
# You want to find rows in one table that have no match in another table,
# for two tables that share a common key.
# How the query works:
#   from ... left outer join : keep every dept row, matched or not
#   on                       : match dept to emp by deptno
#   where e.deptno is null   : keep only the dept rows that found no employee
# The result has two columns named deptno (d.deptno from d.* and e.deptno);
# the second is always null here.
spark.sql(
    """
    select d.*, e.deptno
    from dept d left outer join emp e
    on (d.deptno = e.deptno)
    where e.deptno is null
    """
).show()

+------+----------+------+------+
|deptno|     dname|   loc|deptno|
+------+----------+------+------+
|    40|OPERATIONS|BOSTON|  null|
+------+----------+------+------+



In [16]:
# Problem
# Return all employees, the location of the department in which they work,
# and the date they received a bonus.
# Step 1: register table `emp_bonus` as a temp view and list its rows.
emp_bonus = connect_database_to_read_file(
    database="sqla",
    table="emp_bonus",
    user="postgres",
    password="admin123",
    name_file="emp_bonus",
)
spark.sql(
    """
    select *
    from emp_bonus
    
    """
).show()

+-----+----------+----+
|empno|  received|type|
+-----+----------+----+
| 7934|2005-03-17|   1|
| 7934|2005-02-15|   2|
| 7839|2005-02-15|   3|
| 7782|2005-02-15|   1|
+-----+----------+----+



In [17]:
# Add the bonus date to the emp/dept join without losing employees: the LEFT JOIN
# to emp_bonus keeps employees that have no bonus (received is null) and repeats
# an employee once per bonus row (MILLER appears twice).
# `order by 2` sorts by the second selected column, d.loc.
# NOTE(review): emp is INNER-joined to dept, so an employee whose deptno is null
# (the emp listing contains one) is not returned — confirm this matches the
# "all employees" wording of the problem statement.
spark.sql(
    """
    select e.ename, d.loc, eb.received
    from emp e join dept d
    on (e.deptno = d.deptno)
    left join emp_bonus eb
    on(e.empno = eb.empno)
    order by 2
    """
).show()

+------+--------+----------+
| ename|     loc|  received|
+------+--------+----------+
|TURNER| CHICAGO|      null|
| ALLEN| CHICAGO|      null|
|MARTIN| CHICAGO|      null|
| JAMES| CHICAGO|      null|
| BLAKE| CHICAGO|      null|
|  WARD| CHICAGO|      null|
| JONES|  DALLAS|      null|
|  FORD|  DALLAS|      null|
| ADAMS|  DALLAS|      null|
| SMITH|  DALLAS|      null|
| SCOTT|  DALLAS|      null|
|  KING|NEW YORK|2005-02-15|
|MILLER|NEW YORK|2005-03-17|
|MILLER|NEW YORK|2005-02-15|
| CLARK|NEW YORK|2005-02-15|
+------+--------+----------+



In [18]:
# Rebuild the job_clerk view: name, job and salary of every clerk in emp.
# The `emp` temp view registered earlier in this notebook already wraps the JDBC
# read of the emp table, so reuse it rather than repeating the connection options
# and credentials inline.
# NOTE(review): this repeats the job_clerk setup from an earlier cell, and the
# query in the next cell reads only emp and emp_bonus.
data = spark.table("emp")

job_clerk = data.select("ename", "job", "sal").filter("job == 'CLERK'")
job_clerk.createOrReplaceTempView("job_clerk")


In [29]:
# Bonus amount for each department-10 employee, one row per bonus record:
# type 1 -> 10% of salary, type 2 -> 20%, any other recorded type -> 30%.
# Because emp is LEFT OUTER joined to emp_bonus, an employee with no bonus row
# has eb.type = null. Without the explicit null branch such a row would fall into
# `else` and be credited a 30% bonus, so it is mapped to 0 instead.
# empno is taken from emp (e.empno): eb.empno is null for employees with no bonus.
spark.sql(
    """
    select e.empno, e.ename, e.sal, e.deptno,
    case
        when eb.type is null then 0
        when eb.type = 1 then 0.1 * e.sal
        when eb.type = 2 then 0.2 * e.sal
        else 0.3 * e.sal
    end as bonus
    from emp as e left outer join emp_bonus as eb
    on (e.empno = eb.empno)
    where e.deptno = 10
    """
).show()

+-----+------+-------+------+--------+
|empno| ename|    sal|deptno|   bonus|
+-----+------+-------+------+--------+
| 7839|  KING|5000.00|    10|1500.000|
| 7934|MILLER|1300.00|    10| 130.000|
| 7934|MILLER|1300.00|    10| 260.000|
| 7782| CLARK|2450.00|    10| 245.000|
+-----+------+-------+------+--------+

