In [3]:
from pyspark.sql import SparkSession


In [4]:
# Connect to PostgreSQL from Python with psycopg2 and read one row as a smoke test
import psycopg2

# NOTE(review): the connection credentials are hardcoded in the notebook; consider
# supplying them from the environment before sharing or committing this file.
conn = psycopg2.connect(host="localhost", port=5432, database="sqla", user="postgres", password="admin123")

try:
    # The cursor context manager closes the cursor on exit, including on error
    with conn.cursor() as cursor:
        # Execute a SQL query using the execute() method
        cursor.execute("select * from emp")

        # Fetch a single row using the fetchone() method
        data = cursor.fetchone()
        print("Connection established to: ", data)
finally:
    # Close the connection even when the query above raises
    conn.close()



Connection established to:  (Decimal('7499'), 'ALLEN', 'SALESMAN', Decimal('7698'), datetime.date(1981, 2, 20), Decimal('1600.00'), Decimal('300.00'), Decimal('30'))


In [9]:
# Build (or reuse) the Spark session, registering the PostgreSQL JDBC driver jar.
spark = (
    SparkSession.builder
    .appName("Connect PostgreSQL with pySpark")
    .config("spark.jars", "postgresql-42.2.20.jar")
    .getOrCreate()
)

# Read the `emp` table from the local `sqla` database over JDBC.
# NOTE(review): the database password is hardcoded here; consider moving it out of the notebook.
jdbcDF1 = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://localhost:5432/sqla",
        dbtable="emp",
        user="postgres",
        password="admin123",
        driver="org.postgresql.Driver",
    )
    .load()
)


In [10]:
# Print the leading rows of the DataFrame loaded from the emp table
jdbcDF1.show()

+-----+--------+---------+----+----------+-------+-------+------+
|empno|   ename|      job| mgr|  hiredate|    sal|   comm|deptno|
+-----+--------+---------+----+----------+-------+-------+------+
| 7499|   ALLEN| SALESMAN|7698|1981-02-20|1600.00| 300.00|    30|
| 7521|    WARD| SALESMAN|7698|1981-02-22|1250.00| 500.00|    30|
| 7654|  MARTIN| SALESMAN|7698|1981-09-28|1250.00|1400.00|    30|
| 7698|   BLAKE|  MANAGER|7839|1981-05-01|2850.00|   null|    30|
| 7782|   CLARK|  MANAGER|7839|1981-06-09|2450.00|   null|    10|
| 7839|    KING|PRESIDENT|null|1981-11-17|5000.00|   null|    10|
| 7844|  TURNER| SALESMAN|7698|1981-09-08|1500.00|   0.00|    30|
| 7900|   JAMES|    CLERK|7698|1981-12-03| 950.00|   null|    30|
| 7934|  MILLER|    CLERK|7782|1982-01-23|1300.00|   null|    10|
|    1|Jonathan|   Editor|null|      null|   null|   null|  null|
| 7369|   SMITH|    CLERK|7902|1980-12-17| 880.00|   null|    20|
| 7566|   JONES|  MANAGER|7839|1981-04-02|3272.50|   null|    20|
| 7788|   

In [5]:
# Write File

# Saving data to a JDBC source using save.
# Template: replace the bracketed placeholders with real connection details before running.
# Writes jdbcDF1, the DataFrame loaded from the PostgreSQL emp table.
# NOTE(review): the Spark session is configured with only the PostgreSQL driver jar;
# the MySQL connector presumably has to be added to spark.jars as well - confirm.
(jdbcDF1
  .write
  .format("jdbc")
  .option("url", "jdbc:mysql://[DBSERVER]:3306/[DATABASE]")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "[TABLENAME]")
  .option("user", "[USERNAME]")
  .option("password", "[PASSWORD]")
  .save())

1

In [6]:
# In Python
from pyspark.sql.types import *

# Single column "celsius" holding an array of integers per row.
schema = StructType().add("celsius", ArrayType(IntegerType()))

# Two rows; each row is a one-element list whose only value is that row's array.
t_list = (
    [[35, 36, 32, 30, 40, 42, 38]],
    [[31, 32, 34, 55, 56]],
)
t_c = spark.createDataFrame(t_list, schema)

# Make the DataFrame queryable from SQL under the name "tC".
t_c.createOrReplaceTempView("tC")

# Show the DataFrame
t_c.show()

+--------------------+
|             celsius|
+--------------------+
|[35, 36, 32, 30, ...|
|[31, 32, 34, 55, 56]|
+--------------------+



In [14]:
# transform(array<T>, function<T, U>): array<U>
# Applies the lambda to every element: here each Celsius value is converted to
# Fahrenheit using integer division (`div`).
celsius_to_fahrenheit_sql = """
SELECT celsius, 
 transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit 
  FROM tC
"""
spark.sql(celsius_to_fahrenheit_sql).show()

+--------------------+--------------------+
|             celsius|          fahrenheit|
+--------------------+--------------------+
|[35, 36, 32, 30, ...|[95, 96, 89, 86, ...|
|[31, 32, 34, 55, 56]|[87, 89, 93, 131,...|
+--------------------+--------------------+



In [15]:
# filter(array<T>, function<T, Boolean>): array<T>
# Keeps only the array elements for which the lambda is true (readings above 38).
high_readings_sql = """
SELECT celsius, 
 filter(celsius, t -> t > 38) as high 
  FROM tC
"""
spark.sql(high_readings_sql).show()

+--------------------+--------+
|             celsius|    high|
+--------------------+--------+
|[35, 36, 32, 30, ...|[40, 42]|
|[31, 32, 34, 55, 56]|[55, 56]|
+--------------------+--------+



In [17]:
# exists(array<T>, function<T, Boolean>): Boolean
# True when at least one array element satisfies the lambda (a reading equal to 38).
threshold_sql = """ select celsius, exists(celsius, t -> t =38) as threshold from tC"""
spark.sql(threshold_sql).show()

+--------------------+---------+
|             celsius|threshold|
+--------------------+---------+
|[35, 36, 32, 30, ...|     true|
|[31, 32, 34, 55, 56]|    false|
+--------------------+---------+



In [25]:
from pyspark.sql.functions import expr

trip_delay_file_path = 'Data/departuredelays.csv'
# airport_file_path = "Data/flights/airport-codes-na.txt"

# Read the CSV using its header row for column names, then cast the delay and
# distance columns to INT in the same pipeline.
departure_delays = (
    spark.read
    .format("csv")
    .options(header=True)
    .load(trip_delay_file_path)
    .withColumn("delay", expr("CAST(delay as INT) as delay"))
    .withColumn("distance", expr("CAST(distance as INT) as distance"))
)

# Register the DataFrame so it can be queried from SQL.
departure_delays.createOrReplaceTempView("departure_delays")

In [26]:
# Delayed SEA -> SFO flights whose date string starts with '01010'.
sea_sfo_delayed = expr("""origin == 'SEA' and destination == 'SFO' and 
    date like '01010%' and delay > 0""")
foo = departure_delays.where(sea_sfo_delayed)
foo.createOrReplaceTempView("foo")

In [27]:
# Preview at most ten rows of the `foo` temp view.
foo_preview = spark.sql("select * from foo LIMIT 10")
foo_preview.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



In [30]:
# union
# Append the rows of foo to departure_delays and expose the result as "bar".
bar = departure_delays.union(foo)
bar.createOrReplaceTempView("bar")

# foo was filtered out of departure_delays, so its rows now appear twice in bar.
bar_matches = bar.where(expr("""origin == 'SEA' AND destination == 'SFO'
AND date LIKE '01010%' AND delay > 0"""))
bar_matches.show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



In [31]:
# Query the `bar` view in SQL for delayed SEA -> SFO flights whose date starts with '01010'.
bar_matches_sql = """
SELECT * 
  FROM bar 
 WHERE origin = 'SEA' 
   AND destination = 'SFO' 
   AND date LIKE '01010%' 
   AND delay > 0
"""
spark.sql(bar_matches_sql).show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



In [None]:
# Join types supported by DataFrame.join: inner, cross, outer, full, full_outer, left,
# left_outer, right, right_outer, left_semi, and left_anti

# Windowing: rank each origin's destinations by total delay, highest first.
# TotalDelays is not a column of the departure_delays view, so it is aggregated in a
# subquery (sum of delay per origin/destination) before the window function is applied.
spark.sql(
    """
    select origin, destination, TotalDelays, dense_rank()
    over(partition by origin order by TotalDelays DESC) as rank
    from (
        select origin, destination, sum(delay) as TotalDelays
        from departure_delays
        group by origin, destination
    ) t
    """
).show()