# Read / Write Operations to Postgres using PySpark
___

### Initializing PySpark

In [1]:
import findspark
findspark.init()
import pyspark

### Imports

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

### Postgres Dependancy Jar

In [3]:
postgres_jar = 'F:\\My_Practice\\Database_Connector\\Resources\postgresql-42.2.9.jar'

### Creating Spark Session

In [4]:
spark = (SparkSession.builder
           .config("spark.jars",postgres_jar)
           .appName('postgres')
           .getOrCreate()
        )

### Checking Spark Configurations

In [5]:
SparkConf().getAll()

[('spark.repl.local.jars',
  'file:///F:/My_Practice/Database_Connector/Resources/postgresql-42.2.9.jar'),
 ('spark.app.name', 'oracle'),
 ('spark.jars',
  'file:///F:/My_Practice/Database_Connector/Resources/postgresql-42.2.9.jar'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

### Postgres Server Credentials

In [6]:
driver = "org.postgresql.Driver"
url = "jdbc:postgresql://192.168.2.20:5434/aline"
database = "aline"
dbtable = "sli_attributes"
user = "aline"
password = "Aline-12#"

### Reading File

In [21]:
filedf = (spark.read.format("csv")
                .options(header=True, inferSchema=True, delimiter='\t')
                .load('../Resources/SI_attr.tab')
         )

### Show

In [22]:
filedf.show(5)

+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|     Sales_Item_Id|Sales_Item_Description|  GIC|Product_Lifecycle_State|Purchasing_Group|Average Standard Production Cost|
+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|    IE2:15HP-RFU-7|           7GHZ HP ODU|07599|               OBSOLETE|            null|                             0.0|
|    IE2:15HP-RFU-8|  1500HP RF UNIT, F...|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-112|    1500HP SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-137|     1500P SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
| IE2:15HP-TERM-112|  1500HP 50 OHM TER...|09208|               OBSOLETE|            null|                             0.0|
+-------

### Schema

In [23]:
filedf.printSchema()

root
 |-- Sales_Item_Id: string (nullable = true)
 |-- Sales_Item_Description: string (nullable = true)
 |-- GIC: string (nullable = true)
 |-- Product_Lifecycle_State: string (nullable = true)
 |-- Purchasing_Group: string (nullable = true)
 |-- Average Standard Production Cost: double (nullable = true)



### Reading table from Postgres

In [24]:
tableDF = (spark.read
            .format("jdbc")
            .options(url=url, driver=driver, dbtable=dbtable, user=user, password=password)
            .load()
          )

### Checking Schema of the table

In [25]:
tableDF.printSchema()

root
 |-- average_standard_production_cost: double (nullable = true)
 |-- gic: string (nullable = true)
 |-- sales_item_id: string (nullable = true)
 |-- purchasing_group: string (nullable = true)
 |-- sales_item_description: string (nullable = true)
 |-- product_lifecycle_state: string (nullable = true)



### Changing Dataframe columns same as table columns

In [26]:
filedf = filedf.withColumnRenamed("Average Standard Production Cost","average_standard_production_cost")

### Converting all file dataframe columns to lower case

In [27]:
filedf = filedf.toDF(*[c.lower() for c in filedf.columns])

### Checking Schema after Changes

In [28]:
filedf.printSchema()

root
 |-- sales_item_id: string (nullable = true)
 |-- sales_item_description: string (nullable = true)
 |-- gic: string (nullable = true)
 |-- product_lifecycle_state: string (nullable = true)
 |-- purchasing_group: string (nullable = true)
 |-- average_standard_production_cost: double (nullable = true)



### Checking Table Count before writing

In [29]:
tableDF.count()

0

### Writing to Postgres

In [30]:
(filedf.write
  .format("jdbc")
  .options(url=url, dbtable=dbtable, user=user, password=password, driver=driver)
  .mode('append')
  .save()
)

### Closing Spark Session

In [31]:
spark.stop()