# Read / Write Operations to SAP HANA using PySpark
___

### Initializing PySpark

In [1]:
import findspark
findspark.init()
import pyspark

### Imports

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

### SAP HANA Dependancy Jar

In [3]:
saphana_jar = 'F:\\My_Practice\\Database_Connector\\Resources\\ngdbc-2.4.63.jar'

### Creating Spark Session

In [4]:
spark = (SparkSession.builder
           .config("spark.jars",saphana_jar)
           .appName('sap-hana')
           .getOrCreate()
        )

### Checking Spark Configurations

In [5]:
SparkConf().getAll()

[('spark.app.name', 'sap-hana'),
 ('spark.jars',
  'file:///F:/My_Practice/Database_Connector/Resources/ngdbc-2.4.63.jar'),
 ('spark.master', 'local[*]'),
 ('spark.repl.local.jars',
  'file:///F:/My_Practice/Database_Connector/Resources/ngdbc-2.4.63.jar'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

### SAP HANA Server Credentials

In [6]:
driver = "com.sap.db.jdbc.Driver"
url = "jdbc:sap://192.168.2.23:39017"
dbtable = "Lima.SLI_ATTRIBUTES"
user = "system"
password = "Aline-12#"
#database = "LIMA"

### Reading File

In [7]:
filedf = (spark.read.format("csv")
                .options(header=True, inferSchema=True, delimiter='\t')
                .load('../Resources/SI_attr.tab')
         )

### Show

In [8]:
filedf.show(5)

+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|     Sales_Item_Id|Sales_Item_Description|  GIC|Product_Lifecycle_State|Purchasing_Group|Average Standard Production Cost|
+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|    IE2:15HP-RFU-7|           7GHZ HP ODU|07599|               OBSOLETE|            null|                             0.0|
|    IE2:15HP-RFU-8|  1500HP RF UNIT, F...|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-112|    1500HP SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-137|     1500P SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
| IE2:15HP-TERM-112|  1500HP 50 OHM TER...|09208|               OBSOLETE|            null|                             0.0|
+-------

### Schema

In [9]:
filedf.printSchema()

root
 |-- Sales_Item_Id: string (nullable = true)
 |-- Sales_Item_Description: string (nullable = true)
 |-- GIC: string (nullable = true)
 |-- Product_Lifecycle_State: string (nullable = true)
 |-- Purchasing_Group: string (nullable = true)
 |-- Average Standard Production Cost: double (nullable = true)



### Reading table from SAP HANA

In [10]:
tableDF = (spark.read
            .format("jdbc")
            .options(url=url, driver=driver, dbtable=dbtable, user=user, password=password)
            .load()
          )

### Checking Schema of the table

In [11]:
tableDF.printSchema()

root
 |-- AVERAGE_STANDARD_PRODUCTION_COST: double (nullable = true)
 |-- GIC: string (nullable = true)
 |-- SALES_ITEM_ID: string (nullable = true)
 |-- PURCHASING_GROUP: string (nullable = true)
 |-- SALES_ITEM_DESCRIPTION: string (nullable = true)
 |-- PRODUCT_LIFECYCLE_STATE: string (nullable = true)



### Changing Dataframe columns same as table columns

In [12]:
filedf = filedf.withColumnRenamed("Average Standard Production Cost","AVERAGE_STANDARD_PRODUCTION_COST")

### Converting all file dataframe columns to upper case

In [13]:
filedf = filedf.toDF(*[c.upper() for c in filedf.columns])

### File Dataframe Schema

In [14]:
filedf.printSchema()

root
 |-- SALES_ITEM_ID: string (nullable = true)
 |-- SALES_ITEM_DESCRIPTION: string (nullable = true)
 |-- GIC: string (nullable = true)
 |-- PRODUCT_LIFECYCLE_STATE: string (nullable = true)
 |-- PURCHASING_GROUP: string (nullable = true)
 |-- AVERAGE_STANDARD_PRODUCTION_COST: double (nullable = true)



### Checking Table Count before writing

In [15]:
tableDF.count()

0

### Reducing File Count to 1000

In [16]:
filedf = filedf.limit(1000)

### Writing to SAP HANA

In [17]:
(filedf.write
  .format("jdbc")
  .options(url=url, dbtable=dbtable, user=user, password=password, driver=driver)
  .mode('append')
  .save()
)

### Closing Spark Session

In [18]:
spark.stop()