In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
import ConnectionConfig as cc
cc.setupEnvironment()

### Config stuff

### Connection properties
ConnectionConfig.py (cc) is created and imported to simplify the database connection process.
Consult the file to get more insights.

### Session setup
"spark.driver.extraClassPath" is added. This is needed to include the necessary jars when running the sparkJobs.

In [3]:
from delta import configure_spark_with_delta_pip

builder = SparkSession.builder \
    .appName("DBConnectionTest") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .master("local[4]")
#This one must be added to be able to query a jdbc source
#extra_packages = ["org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2","com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"]
extra_packages = ["org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2","org.postgresql:postgresql:42.7.4"]

builder = configure_spark_with_delta_pip(builder, extra_packages=extra_packages)

spark = builder.getOrCreate()
builder.getOrCreate()
spark.sparkContext.setLogLevel("DEBUG")

### Reading a JDBC table
Read a table from sqlServer connection

#### Using the ConnectionConfig (cc) to make things easy
cc can make a connection url based on a connection profile in config.ini. To do this, first set the name of the connection.

#### Partitioning
As Spark is build to work in parallel reading from the database can also be done in parallel. In this case we define 4 partitions. Spark has to know how to split the data for every partition. Therefore you have to provide a partition column and a lower and upperbound. In this case the  on of the 4 queries that Spark will fire looks like "select * from dbo.sales where Order_ID <= 500 and Order_id > 250"

In [4]:
cc.set_connectionProfile("tutorial_op")
print(cc.create_jdbc())
sales_df = spark.read \
    .format("jdbc") \
    .option("driver" , "org.postgresql.Driver") \
    .option("url", cc.create_jdbc()) \
    .option("dbtable", "sales") \
    .option("user", cc.get_Property("username")) \
    .option("password", cc.get_Property("password")) \
    .option("partitionColumn", "Order_ID") \
    .option("numPartitions", 4) \
    .option("lowerBound", 0) \
    .option("upperBound", 1001) \
    .load()
sales_df.show(1000)

jdbc:postgresql://localhost:5432/tutorial_op?user=postgres&password=DB3DB3DB3&ssl=false
+--------+-------------------+--------------------+----------+----------+
|order_id|         order_date|       customer_name|salesrepid|    amount|
+--------+-------------------+--------------------+----------+----------+
|       1|2010-10-13 00:00:00|  Muhammed MacIntyre|         1| 851804379|
|       2|2012-10-01 00:00:00|        Barry French|         1| 683057055|
|       3|2011-07-10 00:00:00|       Clay Rozendal|         1|1732115679|
|       4|2010-08-28 00:00:00|      Carlos Soltero|         1|1275042249|
|       5|2011-06-17 00:00:00|         Carl Jackso|         1| 694153767|
|       6|2011-03-24 00:00:00|      Monica Federle|         1|1959464599|
|       7|2010-02-26 00:00:00|     Dorothy Badders|         1|1170677605|
|       8|2010-11-23 00:00:00|     Neola Schneider|         1|1588502393|
|       9|2012-06-08 00:00:00|         Carlos Daly|         1|1173163372|
|      10|2012-08-04 00:

In [5]:
spark.stop()