# Read / Write Operations to Cassandra using Spark with Scala
___

### Importing Packages

In [1]:
import org.apache.spark.sql.{SparkSession,SaveMode}

### Adding Cassandra Dependency Jars

In [2]:
// Fetch the Spark Cassandra connector (Scala 2.11 build, version 2.3.0) from Maven Central.
// The former Bintray spark-packages host has been shut down, so that URL no longer resolves.
%AddJar https://repo1.maven.org/maven2/com/datastax/spark/spark-cassandra-connector_2.11/2.3.0/spark-cassandra-connector_2.11-2.3.0.jar
%AddJar https://repo1.maven.org/maven2/com/twitter/jsr166e/1.1.0/jsr166e-1.1.0.jar

Starting download from https://dl.bintray.com/spark-packages/maven/datastax/spark-cassandra-connector/2.3.0-s_2.11/spark-cassandra-connector-2.3.0-s_2.11.jar
Finished download of spark-cassandra-connector-2.3.0-s_2.11.jar
Starting download from https://repo1.maven.org/maven2/com/twitter/jsr166e/1.1.0/jsr166e-1.1.0.jar
Finished download of jsr166e-1.1.0.jar


### Creating Spark Session

In [3]:
// Obtain the session for the application named "cassandra"; getOrCreate reuses
// an existing session when one is already running.
val spark = SparkSession
  .builder()
  .appName("cassandra")
  .getOrCreate()

spark = org.apache.spark.sql.SparkSession@3f957ed8


org.apache.spark.sql.SparkSession@3f957ed8

### Adding Cassandra Configurations

In [4]:
// Connection and authentication settings for the Cassandra connector, applied
// to the running session one key at a time.
Seq(
  "spark.cassandra.connection.host"        -> "192.168.2.19",
  "spark.cassandra.connection.port"        -> "9042",
  "spark.cassandra.connection.ssl.enabled" -> "false",
  "spark.cassandra.auth.username"          -> "cassandra",
  "spark.cassandra.auth.password"          -> "cassandra"
).foreach { case (key, value) => spark.conf.set(key, value) }

### Verifying Configurations

In [5]:
// Dump every runtime setting of the session to confirm the Cassandra options were applied.
// NOTE(review): the result includes spark.cassandra.auth.password in plain text, so the
// cell output should not be shared when real credentials are in use.
spark.conf.getAll

Map(spark.driver.host -> 0e4cb1730162, spark.cassandra.connection.ssl.enabled -> false, spark.driver.port -> 36216, spark.repl.class.uri -> spark://0e4cb1730162:36216/classes, spark.jars -> file:/opt/conda/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar, spark.repl.class.outputDir -> /tmp/spark-2eef64e5-59dc-4d5d-8343-4cb8fdae2ee7/repl-926b92e2-2813-4b85-a451-909c636f5e66, spark.app.name -> cassandra, spark.cassandra.auth.username -> cassandra, spark.executor.id -> driver, spark.cassandra.connection.port -> 9042, spark.driver.extraJavaOptions -> -Dlog4j.logLevel=info, spark.submit.deployMode -> client, spark.cassandra.auth.password -> cassandra, spark.master -> local[*], spark.app.id -> local-1581521294055, spark.cassandra.conn...

### Reading File

In [7]:
// Load the tab-delimited file, taking column names from the header row and
// letting Spark infer the column types. Declared as var because later cells
// reassign it after each transformation.
var filedf = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .option("delimiter", "\t")
  .load("../Resources/SI_attr.tab")

filedf = [Sales_Item_Id: string, Sales_Item_Description: string ... 4 more fields]


[Sales_Item_Id: string, Sales_Item_Description: string ... 4 more fields]

### Show

In [8]:
// Preview the first five rows of the file DataFrame.
filedf.show(numRows = 5)

+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|     Sales_Item_Id|Sales_Item_Description|  GIC|Product_Lifecycle_State|Purchasing_Group|Average Standard Production Cost|
+------------------+----------------------+-----+-----------------------+----------------+--------------------------------+
|    IE2:15HP-RFU-7|           7GHZ HP ODU|07599|               OBSOLETE|            null|                             0.0|
|    IE2:15HP-RFU-8|  1500HP RF UNIT, F...|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-112|    1500HP SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
|IE2:15HP-SHORT-137|     1500P SHORT, FGHZ|07599|               OBSOLETE|            null|                             0.0|
| IE2:15HP-TERM-112|  1500HP 50 OHM TER...|09208|               OBSOLETE|            null|                             0.0|
+-------

### Schema

In [9]:
// Print the column names and inferred types of the file DataFrame.
filedf.printSchema()

root
 |-- Sales_Item_Id: string (nullable = true)
 |-- Sales_Item_Description: string (nullable = true)
 |-- GIC: string (nullable = true)
 |-- Product_Lifecycle_State: string (nullable = true)
 |-- Purchasing_Group: string (nullable = true)
 |-- Average Standard Production Cost: double (nullable = true)



### Reading table from Cassandra

In [10]:
// Expose the Cassandra table cloudcdc.si_attrib as a DataFrame through the
// Cassandra data source.
val tableDF = spark.read
  .format("org.apache.spark.sql.cassandra")
  .option("keyspace", "cloudcdc")
  .option("table", "si_attrib")
  .load()

tableDF = [id: bigint, averagestandardproductioncost: string ... 5 more fields]


[id: bigint, averagestandardproductioncost: string ... 5 more fields]

### Checking Schema of the table

In [11]:
// Print the column names and types Spark reports for the Cassandra table.
tableDF.printSchema()

root
 |-- id: long (nullable = true)
 |-- averagestandardproductioncost: string (nullable = true)
 |-- gic: string (nullable = true)
 |-- product_lifecycle_state: string (nullable = true)
 |-- purchasing_group: string (nullable = true)
 |-- sales_item_description: string (nullable = true)
 |-- sales_item_id: string (nullable = true)



### Renaming DataFrame columns to match the table columns

In [12]:
// Drop the spaces from the cost column's name so it matches the table's
// averagestandardproductioncost column.
filedf = filedf.withColumnRenamed(
  existingName = "Average Standard Production Cost",
  newName = "averagestandardproductioncost"
)

filedf = [Sales_Item_Id: string, Sales_Item_Description: string ... 4 more fields]


[Sales_Item_Id: string, Sales_Item_Description: string ... 4 more fields]

### Converting all file dataframe columns to lower case

In [13]:
// Lower-case every column name so the names line up with the Cassandra table's columns.
// Locale.ROOT keeps the result independent of the JVM default locale (under a Turkish
// locale, for example, an upper-case "I" would otherwise become a dotless i and the
// names would no longer match).
filedf = filedf.toDF(filedf.columns.map(_.toLowerCase(java.util.Locale.ROOT)): _*)

filedf = [sales_item_id: string, sales_item_description: string ... 4 more fields]


[sales_item_id: string, sales_item_description: string ... 4 more fields]

### Generating sequential id column using Window Functions

In [14]:
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

In [15]:
// Seed an "id" column holding the constant 1 on every row; a later cell
// overwrites it with sequential row numbers.
filedf = filedf.withColumn(colName = "id", col = lit(1))

filedf = [sales_item_id: string, sales_item_description: string ... 5 more fields]


[sales_item_id: string, sales_item_description: string ... 5 more fields]

In [16]:
// Window used to number the rows. Ordering by the "id" column would be meaningless here
// because it holds the same constant on every row, leaving the row order (and therefore
// the generated ids) undefined. Ordering by monotonically_increasing_id() follows the
// order in which the rows were read instead.
// NOTE(review): the window has no partitionBy, so Spark is expected to process all rows
// in a single partition - acceptable for this file size, confirm for larger inputs.
val w = Window.orderBy(monotonically_increasing_id())

w = org.apache.spark.sql.expressions.WindowSpec@5afdf4b3


org.apache.spark.sql.expressions.WindowSpec@5afdf4b3

In [17]:
// Overwrite the "id" column with the row number computed over window w.
filedf = filedf.withColumn("id", row_number().over(w))

filedf = [sales_item_id: string, sales_item_description: string ... 5 more fields]


[sales_item_id: string, sales_item_description: string ... 5 more fields]

### Show

In [18]:
// Preview the first five rows, now including the generated id column.
filedf.show(numRows = 5)

+------------------+----------------------+-----+-----------------------+----------------+-----------------------------+---+
|     sales_item_id|sales_item_description|  gic|product_lifecycle_state|purchasing_group|averagestandardproductioncost| id|
+------------------+----------------------+-----+-----------------------+----------------+-----------------------------+---+
|    IE2:15HP-RFU-7|           7GHZ HP ODU|07599|               OBSOLETE|            null|                          0.0|  1|
|    IE2:15HP-RFU-8|  1500HP RF UNIT, F...|07599|               OBSOLETE|            null|                          0.0|  2|
|IE2:15HP-SHORT-112|    1500HP SHORT, FGHZ|07599|               OBSOLETE|            null|                          0.0|  3|
|IE2:15HP-SHORT-137|     1500P SHORT, FGHZ|07599|               OBSOLETE|            null|                          0.0|  4|
| IE2:15HP-TERM-112|  1500HP 50 OHM TER...|09208|               OBSOLETE|            null|                          0.0|  5|


### File Dataframe Schema

In [19]:
// Print the final schema of the file DataFrame before it is written out.
filedf.printSchema()

root
 |-- sales_item_id: string (nullable = true)
 |-- sales_item_description: string (nullable = true)
 |-- gic: string (nullable = true)
 |-- product_lifecycle_state: string (nullable = true)
 |-- purchasing_group: string (nullable = true)
 |-- averagestandardproductioncost: double (nullable = true)
 |-- id: integer (nullable = true)



### Checking Table Count before writing

In [20]:
// Count the rows currently in the Cassandra table, as a baseline before the write.
tableDF.count()

0

### Writing to Cassandra

In [21]:
// Append the rows of the file DataFrame to the Cassandra table cloudcdc.si_attrib.
filedf.write
  .format("org.apache.spark.sql.cassandra")
  .option("keyspace", "cloudcdc")
  .option("table", "si_attrib")
  .mode(SaveMode.Append)
  .save()

### Checking Table Count after writing

In [22]:
// Re-declare the DataFrame over cloudcdc.si_attrib so the row count can be
// checked after the write.
val tableDF = spark.read
  .format("org.apache.spark.sql.cassandra")
  .option("keyspace", "cloudcdc")
  .option("table", "si_attrib")
  .load()

tableDF = [id: bigint, averagestandardproductioncost: string ... 5 more fields]


[id: bigint, averagestandardproductioncost: string ... 5 more fields]

In [23]:
// Count the rows in the Cassandra table again to confirm the write added data.
tableDF.count()

361560

### Closing Spark Session

In [24]:
// Shut down the Spark session now that the notebook is finished with it.
spark.stop()