In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = (
    SparkSession.builder.appName("Working with Number")
    .config("spark.jars", "postgresql-42.2.20.jar")
    .getOrCreate()
)

In [3]:
def connect_database_to_read_file(database, table, user, password, name_file):
    data = (
        spark.read.format("jdbc")
        .option("url", "jdbc:postgresql://localhost:5432/{}".format(database))
        .option("dbtable", table)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver").load()
    )
    return data.createOrReplaceTempView(name_file)


In [4]:
emp_data = connect_database_to_read_file("sqla", "emp", "postgres", "admin123", "emp")


In [8]:
#Problem:
# You want to compute the avg value in column, either for all rows in a table or subset of rows
spark.sql(
    """
    select avg(coalesce(sal, 0)) as avg_sal
    from emp
    """
).show()

+-----------+
|    avg_sal|
+-----------+
|2007.500000|
+-----------+



In [9]:
# When searching for the lowest and highest salaries for all employees, simply use the functions MIN and MAX

spark.sql(
    """
    select min(sal) as min_sal, max(sal) as max_sal
    from emp
    """
).show()

+-------+-------+
|min_sal|max_sal|
+-------+-------+
| 880.00|5000.00|
+-------+-------+



In [12]:
# When searching for the lowest and highest salaries for each department, 
# use the functions MIN and MAX with the GROUP BY 
spark.sql(
    """
    select coalesce(deptno, 0), min(sal) as min_sal, max(sal) as max_sal
    from emp
    group by deptno
    """
).show()

+-----------------------------------------------------------------+-------+-------+
|coalesce(CAST(deptno AS DECIMAL(10,0)), CAST(0 AS DECIMAL(10,0)))|min_sal|max_sal|
+-----------------------------------------------------------------+-------+-------+
|                                                                0|   null|   null|
|                                                               10|1300.00|5000.00|
|                                                               30| 950.00|2850.00|
|                                                               20| 880.00|3300.00|
+-----------------------------------------------------------------+-------+-------+



In [18]:
# You want to compute the sum of all values, such as all employee salaries, in a column.

spark.sql(
    """
    select deptno, sum(sal) as total_for_dept
    from emp
    group by deptno
    """
).show()

+------+--------------+
|deptno|total_for_dept|
+------+--------------+
|  null|          null|
|    10|       8750.00|
|    30|       9400.00|
|    20|      11962.50|
+------+--------------+



In [20]:
# Counting Rows in a Table
# You want to count the number of rows in a table, or you wish to count the number of values in a column
spark.sql(
    """
    select count(*)
    from emp
    """
).show()

+--------+
|count(1)|
+--------+
|      15|
+--------+



In [22]:
spark.sql(
    """
    select deptno, count(*)
    from emp
    group by deptno
    """
).show()

+------+--------+
|deptno|count(1)|
+------+--------+
|  null|       1|
|    10|       3|
|    30|       6|
|    20|       5|
+------+--------+



In [24]:
spark.sql(
    """
    select count(*), count(deptno), count(comm), count('hello')
    from emp
    """
).show()

+--------+-------------+-----------+------------+
|count(1)|count(deptno)|count(comm)|count(hello)|
+--------+-------------+-----------+------------+
|      15|           14|          4|          15|
+--------+-------------+-----------+------------+



In [26]:
spark.sql(
    """
    select count(comm)
    from emp
    """
).show()

+-----------+
|count(comm)|
+-----------+
|          4|
+-----------+



In [35]:
# Solution
# As an example, the following solutions show how to compute a running total of salaries for all employees
spark.sql(
    """
    select ename,sal, sum(coalesce(sal,0)) over (order by sal) as running_total
    from emp
    order by 2
    """
).show()

+--------+-------+-------------+
|   ename|    sal|running_total|
+--------+-------+-------------+
|Jonathan|   null|         0.00|
|   SMITH| 880.00|       880.00|
|   JAMES| 950.00|      1830.00|
|   ADAMS|1210.00|      3040.00|
|    WARD|1250.00|      5540.00|
|  MARTIN|1250.00|      5540.00|
|  MILLER|1300.00|      6840.00|
|  TURNER|1500.00|      8340.00|
|   ALLEN|1600.00|      9940.00|
|   CLARK|2450.00|     12390.00|
|   BLAKE|2850.00|     15240.00|
|   JONES|3272.50|     18512.50|
|   SCOTT|3300.00|     25112.50|
|    FORD|3300.00|     25112.50|
|    KING|5000.00|     30112.50|
+--------+-------+-------------+



In [65]:
# Problem
# You want to compute a running product on a numeric column.
spark.sql(
    """
    select sal
    from (
        select sal as sal, count(*) as count
        from emp 
        where deptno = 20
        group by sal
        having count(*)
        order by count desc
    ) x
    limit 1
    """ 
).show()

+-------+
|    sal|
+-------+
|3300.00|
+-------+



In [67]:
# Problem 
# Determining the Percentage of a Total
# you want to determine what percentage of all salaries are the salaries in DEPTNO 10
spark.sql(
    """
    select ((sum(case when deptno = 10 then sal end) / sum(sal))*100) as pct
    from emp
    """
).show()

+--------------------+
|                 pct|
+--------------------+
|29.05770029057700291|
+--------------------+

