# Read / Write Operations to Cassandra using PySpark
___

### Initializing PySpark

In [1]:
import findspark
findspark.init()
import pyspark

### Imports

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

### Cassandra Dependancy Jars

In [3]:
spark_cassandra_jar = 'F:\\My_Practice\\Database_Connector\\Resources\\spark-cassandra-connector-2.3.0-s_2.11.jar'
jsr_jar = 'F:\\My_Practice\\Database_Connector\\Resources\\jsr166e-1.1.0.jar'

In [4]:
dependency_jars = spark_cassandra_jar + ', ' + jsr_jar

In [5]:
dependency_jars

'F:\\My_Practice\\Database_Connector\\Resources\\spark-cassandra-connector-2.3.0-s_2.11.jar, F:\\My_Practice\\Database_Connector\\Resources\\jsr166e-1.1.0.jar'

### Creating Spark Session

In [6]:
spark = (SparkSession.builder
           .config("spark.jars",dependency_jars)
           .appName('cassandra')
           .getOrCreate()
        )

### Checking Spark Configurations

In [7]:
SparkConf().getAll()

[('spark.jars',
  'file:///F:/My_Practice/Database_Connector/Resources/spark-cassandra-connector-2.3.0-s_2.11.jar,file:///F:/My_Practice/Database_Connector/Resources/jsr166e-1.1.0.jar'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.repl.local.jars',
  'file:///F:/My_Practice/Database_Connector/Resources/spark-cassandra-connector-2.3.0-s_2.11.jar,file:///F:/My_Practice/Database_Connector/Resources/jsr166e-1.1.0.jar'),
 ('spark.app.name', 'cassandra'),
 ('spark.ui.showConsoleProgress', 'true')]

### Adding Cassandra Configurations

In [14]:
spark.conf.set("spark.cassandra.connection.host","192.168.2.19")
spark.conf.set("spark.cassandra.connection.port","9042")
spark.conf.set("spark.cassandra.connection.ssl.enabled",False)
spark.conf.set("spark.cassandra.auth.username","cassandra")
spark.conf.set("spark.cassandra.auth.password","cassandra")

### Reading File

In [22]:
filedf = (spark.read.format("csv")
                .options(header=True, inferSchema=True, delimiter='\t')
                .load('../Resources/SI_attr.tab')
         )

### Show

In [23]:
filedf.show(5)

+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|     Sales_Item_Id|Sales_Item_Description|  GIC|Product_Lifecycle_State|Purchasing_Group|Average Standard Production Cost|
+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|    IE2:15HP-RFU-7|           7GHZ HP ODU|07599|               OBSOLETE|            null|                             0.0|
|    IE2:15HP-RFU-8|  1500HP RF UNIT, F...|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-112|    1500HP SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-137|     1500P SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
| IE2:15HP-TERM-112|  1500HP 50 OHM TER...|09208|               OBSOLETE|            null|                             0.0|
+-------

### Schema

In [24]:
filedf.printSchema()

root
 |-- Sales_Item_Id: string (nullable = true)
 |-- Sales_Item_Description: string (nullable = true)
 |-- GIC: string (nullable = true)
 |-- Product_Lifecycle_State: string (nullable = true)
 |-- Purchasing_Group: string (nullable = true)
 |-- Average Standard Production Cost: double (nullable = true)



### Reading table from Cassandra

In [25]:
tableDF = (spark.read
              .format("org.apache.spark.sql.cassandra")
              .options(table="si_attrib", keyspace="cloudcdc")
              .load()
          )

### Checking Schema of the table

In [26]:
tableDF.printSchema()

root
 |-- id: long (nullable = true)
 |-- averagestandardproductioncost: string (nullable = true)
 |-- gic: string (nullable = true)
 |-- product_lifecycle_state: string (nullable = true)
 |-- purchasing_group: string (nullable = true)
 |-- sales_item_description: string (nullable = true)
 |-- sales_item_id: string (nullable = true)



### Changing Dataframe columns same as table columns

In [27]:
filedf = filedf.withColumnRenamed("Average Standard Production Cost","averagestandardproductioncost")

### Converting all file dataframe columns to lower case

In [28]:
filedf = filedf.toDF(*[c.lower() for c in filedf.columns])

### Generating sequential id column using zipWithIndex

In [29]:
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, StructType, StructField

In [30]:
schema = StructType(filedf.schema.fields + [StructField('id', IntegerType(), False)])

In [31]:
rddWithId = filedf.rdd.zipWithIndex()

In [32]:
filedf = spark.createDataFrame(rddWithId.map(lambda row: row[0] + Row(row[1] + 1)), schema)

### Show

In [33]:
filedf.show(5)

+------------------+----------------------+-----+-----------------------+----------------+-----------------------------+---+
|     sales_item_id|sales_item_description|  gic|product_lifecycle_state|purchasing_group|averagestandardproductioncost| id|
+------------------+----------------------+-----+-----------------------+----------------+-----------------------------+---+
|    IE2:15HP-RFU-7|           7GHZ HP ODU|07599|               OBSOLETE|            null|                          0.0|  1|
|    IE2:15HP-RFU-8|  1500HP RF UNIT, F...|07599|               OBSOLETE|            null|                          0.0|  2|
|IE2:15HP-SHORT-112|    1500HP SHORT, FGHZ|07599|               OBSOLETE|            null|                          0.0|  3|
|IE2:15HP-SHORT-137|     1500P SHORT, FGHZ|07599|               OBSOLETE|            null|                          0.0|  4|
| IE2:15HP-TERM-112|  1500HP 50 OHM TER...|09208|               OBSOLETE|            null|                          0.0|  5|


### File Dataframe Schema

In [34]:
filedf.printSchema()

root
 |-- sales_item_id: string (nullable = true)
 |-- sales_item_description: string (nullable = true)
 |-- gic: string (nullable = true)
 |-- product_lifecycle_state: string (nullable = true)
 |-- purchasing_group: string (nullable = true)
 |-- averagestandardproductioncost: double (nullable = true)
 |-- id: integer (nullable = false)



### Checking Table Count before writing

In [35]:
tableDF.count()

0

### Reducing File Count to 1000

In [36]:
filedf = filedf.limit(1000)

### Writing to Cassandra

In [39]:
(filedf.write
      .format("org.apache.spark.sql.cassandra")
      .options(table="si_attrib", keyspace="cloudcdc")
      .mode('append')
      .save()
)

### Closing Spark Session

In [40]:
spark.stop()