In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = (
    SparkSession.builder.appName("Spark_PostgreSQL")
    .config("spark.jars", "postgresql-42.2.20.jar")
    .getOrCreate())


In [5]:
spark = (SparkSession
         .builder
         .appName("Connect PostgreSQL with pySpark")
         .config("spark.jars", "postgresql-42.2.20.jar")
         .getOrCreate())
# Read file
emp_data = (spark
  .read
  .format("jdbc") 
  .option("url", "jdbc:postgresql://localhost:5432/sqla")
  .option("dbtable", "emp")
  .option("user", "postgres")
  .option("password", "admin123").option("driver", "org.postgresql.Driver").load())


In [24]:
emp_data.createOrReplaceTempView("emp")


In [26]:
spark.sql("select * from emp limit 5").show()

+-----+------+--------+----+----------+-------+-------+------+
|empno| ename|     job| mgr|  hiredate|    sal|   comm|deptno|
+-----+------+--------+----+----------+-------+-------+------+
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|
| 7654|MARTIN|SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|
| 7698| BLAKE| MANAGER|7839|1981-05-01|2850.00|   null|    30|
| 7782| CLARK| MANAGER|7839|1981-06-09|2450.00|   null|    10|
+-----+------+--------+----+----------+-------+-------+------+



In [28]:
spark.sql(
    """
    select *
    from emp 
    where deptno = 10
    """
).show()

+-----+------+---------+----+----------+-------+----+------+
|empno| ename|      job| mgr|  hiredate|    sal|comm|deptno|
+-----+------+---------+----+----------+-------+----+------+
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.00|null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.00|null|    10|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.00|null|    10|
+-----+------+---------+----+----------+-------+----+------+



In [31]:
# Use the WHERE clause along with the OR and AND clauses. For example, if you would like to find all 
# the employees in department 10, along with any employees who earn a commission, along with any employees 
# in department 20 who earn at most $2000:
spark.sql(
    """
    select * 
    from emp
    where deptno = 10 
    or comm is not null 
    or sal <= 2000 and deptno = 20  
    """
).show()

+-----+------+---------+----+----------+-------+-------+------+
|empno| ename|      job| mgr|  hiredate|    sal|   comm|deptno|
+-----+------+---------+----+----------+-------+-------+------+
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450.00|   null|    10|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000.00|   null|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300.00|   null|    10|
| 7369| SMITH|    CLERK|7902|1980-12-17| 880.00|   null|    20|
| 7876| ADAMS|    CLERK|7788|1983-01-12|1210.00|   null|    20|
+-----+------+---------+----+----------+-------+-------+------+



In [34]:
# You have a table and want to see values for specific columns rather than for all the columns.
spark.sql(
    """
    select ename, deptno, sal
    from emp
    """
).show(n = 4)

+------+------+-------+
| ename|deptno|    sal|
+------+------+-------+
| ALLEN|    30|1600.00|
|  WARD|    30|1250.00|
|MARTIN|    30|1250.00|
| BLAKE|    30|2850.00|
+------+------+-------+
only showing top 4 rows



In [38]:
# To change the names of your query results use the AS keyword in the form: original_name AS new_name.
# Some databases do not require AS, but all accept it:
spark.sql(
    """
    select sal as salary, comm as commission
    from emp 
    """
).show(n = 5)

+-------+----------+
| salary|commission|
+-------+----------+
|1600.00|    300.00|
|1250.00|    500.00|
|1250.00|   1400.00|
|2850.00|      null|
|2450.00|      null|
+-------+----------+
only showing top 5 rows



In [45]:
# You have used aliases to provide more meaningful column names for your result set and would like to 
# exclude some of the rows using the WHERE clause

spark.sql(
    """
    select * 
    from(
    select sal as salary, comm as commission
    from emp) x
    where salary < 5000
    """
).show(n = 5)

+-------+----------+
| salary|commission|
+-------+----------+
|1600.00|    300.00|
|1250.00|    500.00|
|1250.00|   1400.00|
|2850.00|      null|
|2450.00|      null|
+-------+----------+
only showing top 5 rows



In [48]:
# You want to return values in multiple columns as one column. For example, 
# you would like to produce this result set from a query against the EMP table:

spark.sql(
    """
    select ename || 'WORK AS A' || job
    from emp
    where deptno = 10
    """
).show()

+-------------------------------------+
|concat(concat(ename, WORK AS A), job)|
+-------------------------------------+
|                 CLARKWORK AS AMAN...|
|                 KINGWORK AS APRES...|
|                 MILLERWORK AS ACLERK|
+-------------------------------------+



In [51]:
# if an employee is paid $2000 or less, a message of “UNDERPAID” is returned
#  if an employee is paid $4000 or more, a message of “OVERPAID” is returned, 
spark.sql(
    """
    select ename, sal, 
    case 
        when sal <= 2000 then 'UNDERPAID'
        when sal >= 4000 then 'OVERPAID'
    else 'OK'
    end as STATUS
    from emp
    """
).show()

+--------+-------+---------+
|   ename|    sal|   STATUS|
+--------+-------+---------+
|   ALLEN|1600.00|UNDERPAID|
|    WARD|1250.00|UNDERPAID|
|  MARTIN|1250.00|UNDERPAID|
|   BLAKE|2850.00|       OK|
|   CLARK|2450.00|       OK|
|    KING|5000.00| OVERPAID|
|  TURNER|1500.00|UNDERPAID|
|   JAMES| 950.00|UNDERPAID|
|  MILLER|1300.00|UNDERPAID|
|Jonathan|   null|       OK|
|   SMITH| 880.00|UNDERPAID|
|   JONES|3272.50|       OK|
|   SCOTT|3300.00|       OK|
|   ADAMS|1210.00|UNDERPAID|
|    FORD|3300.00|       OK|
+--------+-------+---------+



In [53]:
# You have rows that contain nulls and would like to return non-null values in place of those nulls.
spark.sql(
    """
    select coalesce(comm, 0)
    from emp
    """
).show()

+---------------------------------------------------------------+
|coalesce(CAST(comm AS DECIMAL(12,2)), CAST(0 AS DECIMAL(12,2)))|
+---------------------------------------------------------------+
|                                                         300.00|
|                                                         500.00|
|                                                        1400.00|
|                                                           0.00|
|                                                           0.00|
|                                                           0.00|
|                                                           0.00|
|                                                           0.00|
|                                                           0.00|
|                                                           0.00|
|                                                           0.00|
|                                                           0.00|
|         

In [55]:
# Use the LIKE operator in conjunction with the SQL wildcard operator (”%”):
spark.sql(
    """
    select ename, job
    from emp 
    where deptno in (10, 20)
    and (ename like '%I%' or job like "%ER")
    """
).show()

+------+---------+
| ename|      job|
+------+---------+
| CLARK|  MANAGER|
|  KING|PRESIDENT|
|MILLER|    CLERK|
| SMITH|    CLERK|
| JONES|  MANAGER|
+------+---------+

