### Imports

In [1]:
import org.apache.spark.sql.{SparkSession,Dataset}

Intitializing Scala interpreter ...

Spark Web UI available at http://Varun-CK:4040
SparkContext available as 'sc' (version = 2.3.0, master = local[*], app id = local-1579452082591)
SparkSession available as 'spark'


import org.apache.spark.sql.{SparkSession, Dataset}


### Creating Spark Session

In [2]:
val spark = SparkSession.builder.getOrCreate()

2020-01-19 22:11:37 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5f1ca5fc


### Reading Data

In [3]:
// Load the question/tag pairs from CSV. The first line of the file is treated as a
// header, column types are inferred, and the two columns are named "id" and "tag".
val dfTags = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("..\\Resources\\question_tags_10K.csv")
  .toDF("id", "tag")

dfTags: org.apache.spark.sql.DataFrame = [id: int, tag: string]


In [4]:
dfTags.show(5)

+---+---------------+
| id|            tag|
+---+---------------+
|  1|           data|
|  4|             c#|
|  4|       winforms|
|  4|type-conversion|
|  4|        decimal|
+---+---------------+
only showing top 5 rows



In [5]:
// Load the questions CSV and assign the seven column names used by the rest of the notebook.
// NOTE(review): "header" is false, so the file's own header line is loaded as the first data
// row (it is visible in the show() output of this DataFrame) and the columns come back as
// strings. Later cells rely on that (toQuestion reads its fields with getString), so switching
// to header=true would require updating them as well.
// NOTE(review): the "dateFormat" pattern has no 'T' separator or zone designator, unlike the
// values in the file (e.g. 2008-07-31T21:26:37Z); it appears to have no effect here - confirm.
val dfQuestionsCSV = spark
    .read
    .option("header", false)
    .option("inferSchema", true)
    .option("dateFormat","yyyy-MM-dd HH:mm:ss")
    .csv("..\\resources\\questions_10K.csv")
    .toDF("id", "creation_date", "closed_date", "deletion_date", "score", "owner_userid", "answer_count")

dfQuestionsCSV: org.apache.spark.sql.DataFrame = [id: string, creation_date: string ... 5 more fields]


In [6]:
dfQuestionsCSV.show(5)

+---+--------------------+--------------------+--------------------+-----+------------+------------+
| id|       creation_date|         closed_date|       deletion_date|score|owner_userid|answer_count|
+---+--------------------+--------------------+--------------------+-----+------------+------------+
| Id|        CreationDate|          ClosedDate|        DeletionDate|Score| OwnerUserId| AnswerCount|
|  1|2008-07-31T21:26:37Z|                  NA|2011-03-28T00:53:47Z|    1|          NA|           0|
|  4|2008-07-31T21:42:52Z|                  NA|                  NA|  472|           8|          13|
|  6|2008-07-31T22:08:08Z|                  NA|                  NA|  210|           9|           5|
|  8|2008-07-31T23:33:19Z|2013-06-03T04:00:25Z|2015-02-11T08:26:40Z|   42|          NA|           8|
+---+--------------------+--------------------+--------------------+-----+------------+------------+
only showing top 5 rows



In [7]:
// Keep the questions whose score is strictly between 400 and 410, attach their tags by
// joining with dfTags on "id", and project the four columns used by the later cells.
// select(...) already returns a DataFrame, so no trailing toDF() is needed.
val dfQuestions = dfQuestionsCSV
  .filter("score > 400 and score < 410")
  .join(dfTags, "id")
  .select("owner_userid", "tag", "creation_date", "score")

dfQuestions: org.apache.spark.sql.DataFrame = [owner_userid: string, tag: string ... 2 more fields]


In [8]:
dfQuestions.show(5)

+------------+---------+--------------------+-----+
|owner_userid|      tag|       creation_date|score|
+------------+---------+--------------------+-----+
|         131|   xdebug|2008-08-03T23:18:21Z|  405|
|         131| phpstorm|2008-08-03T23:18:21Z|  405|
|         131|debugging|2008-08-03T23:18:21Z|  405|
|         131|  eclipse|2008-08-03T23:18:21Z|  405|
|         131|      php|2008-08-03T23:18:21Z|  405|
+------------+---------+--------------------+-----+
only showing top 5 rows



### Convert DataFrame row to Scala case class

In [9]:
/** One (question id, tag) pair; field names and types mirror the columns of `dfTags`. */
case class Tag(id: Int, tag: String)

defined class Tag


In [10]:
import spark.implicits._

import spark.implicits._


In [11]:
val dfTagsOfTag: Dataset[Tag] = dfTags.as[Tag]

dfTagsOfTag: org.apache.spark.sql.Dataset[Tag] = [id: int, tag: string]


In [12]:
dfTagsOfTag.show(3)

+---+--------+
| id|     tag|
+---+--------+
|  1|    data|
|  4|      c#|
|  4|winforms|
+---+--------+
only showing top 3 rows



In [13]:
// Bring the first five typed rows to the driver, format each one, and print one per line.
dfTagsOfTag
  .take(5)
  .map(t => s"id = ${t.id}, tag=${t.tag}")
  .foreach(println)

id = 1, tag=data
id = 4, tag=c#
id = 4, tag=winforms
id = 4, tag=type-conversion
id = 4, tag=decimal


### Convert DataFrame row to Scala case class using map()

In [14]:
/**
  * Typed form of one row of `dfQuestions` (a question paired with one of its tags).
  *
  * @param owner_userid id of the question's owner
  * @param tag          one tag attached to the question
  * @param creationDate when the question was created
  * @param score        the question's score
  */
case class Question(
  owner_userid: Int,
  tag: String,
  creationDate: java.sql.Timestamp,
  score: Int
)

defined class Question


##### Create a function that parses each field of a row into a `Question`

In [15]:
/**
  * Converts one row of `dfQuestions` into a [[Question]].
  *
  * The row is read positionally as four string fields: owner_userid, tag,
  * creation_date (ISO-8601 with a zone or offset, e.g. `2008-08-03T23:18:21Z`) and score.
  *
  * @param row a row whose first four columns are strings laid out as described above
  * @return the typed `Question`; a missing owner (`"NA"` or a null cell) is stored as `-1`
  * @throws NumberFormatException if owner_userid (other than `"NA"`) or score is not an integer
  * @throws java.time.format.DateTimeParseException if creation_date cannot be parsed
  */
def toQuestion(row: org.apache.spark.sql.Row): Question = {
  import java.sql.Timestamp
  import java.time.ZonedDateTime

  // "NA" marks a missing owner in the source data; a null cell is treated the same way.
  def parseOwnerId(raw: String): Option[Int] =
    Option(raw).filter(_ != "NA").map(_.toInt)

  // Convert through Instant so the zone designator in the text is honoured. Going through
  // LocalDateTime would discard it and re-interpret the wall-clock time in the JVM's
  // default time zone, shifting the stored instant on any non-UTC machine.
  def parseTimestamp(raw: String): Timestamp =
    Timestamp.from(ZonedDateTime.parse(raw).toInstant)

  Question(
    owner_userid = parseOwnerId(row.getString(0)).getOrElse(-1),
    tag = row.getString(1),
    creationDate = parseTimestamp(row.getString(2)),
    score = row.getString(3).toInt
  )
}

toQuestion: (row: org.apache.spark.sql.Row)Question


##### Now let's convert each row into a Question case class

In [16]:
import spark.implicits._

import spark.implicits._


In [17]:
// Turn every untyped row into a Question, yielding a typed Dataset.
val dfOfQuestion: Dataset[Question] =
  dfQuestions.map(toQuestion(_))

dfOfQuestion: org.apache.spark.sql.Dataset[Question] = [owner_userid: int, tag: string ... 2 more fields]


### Create DataFrame from collection

In [19]:
// Hand-written (id, tag) pairs used to build a small DataFrame from a local collection.
val seqTags = Seq(
  (1, "so_java"),
  (1, "so_jsp"),
  (2, "so_golang"),
  (3, "so_scala"),
  (3, "so_akka")
)

seqTags: Seq[(Int, String)] = List((1,so_java), (1,so_jsp), (2,so_golang), (3,so_scala), (3,so_akka))


In [20]:
import spark.implicits._

import spark.implicits._


In [21]:
val dfMoreTags = seqTags.toDF("id", "tag")

dfMoreTags: org.apache.spark.sql.DataFrame = [id: int, tag: string]


In [22]:
dfMoreTags.show()

+---+---------+
| id|      tag|
+---+---------+
|  1|  so_java|
|  1|   so_jsp|
|  2|so_golang|
|  3| so_scala|
|  3|  so_akka|
+---+---------+



### DataFrame Union

In [23]:
dfTags.show(2)

+---+----+
| id| tag|
+---+----+
|  1|data|
|  4|  c#|
+---+----+
only showing top 2 rows



In [24]:
dfMoreTags.show(2)

+---+-------+
| id|    tag|
+---+-------+
|  1|so_java|
|  1| so_jsp|
+---+-------+
only showing top 2 rows



In [26]:
// Stack the two tag DataFrames, then keep only the rows whose id is 1 or 3.
val dfUnionOfTags = dfTags
  .union(dfMoreTags)
  .filter("id in (1,3)")

dfUnionOfTags: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, tag: string]


In [27]:
dfUnionOfTags.show(5)

+---+--------+
| id|     tag|
+---+--------+
|  1|    data|
|  1| so_java|
|  1|  so_jsp|
|  3|so_scala|
|  3| so_akka|
+---+--------+



### DataFrame Intersection

In [28]:
val dfIntersectionTags = dfMoreTags.intersect(dfUnionOfTags)

dfIntersectionTags: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, tag: string]


In [29]:
dfIntersectionTags.show(5)

+---+--------+
| id|     tag|
+---+--------+
|  3|so_scala|
|  3| so_akka|
|  1| so_java|
|  1|  so_jsp|
+---+--------+



### Append column to DataFrame using withColumn()

In [30]:
// Split each tag on "_" into a temporary array column, then project its first two elements
// as "so_prefix" and "so_tag". "tmp" is not part of the select, so it is already absent from
// the result and no explicit drop is needed.
val dfSplitColumn = dfMoreTags
  .withColumn("tmp", split($"tag", "_"))
  .select(
    $"id",
    $"tag",
    $"tmp".getItem(0).as("so_prefix"),
    $"tmp".getItem(1).as("so_tag")
  )

dfSplitColumn: org.apache.spark.sql.DataFrame = [id: int, tag: string ... 2 more fields]


In [31]:
dfSplitColumn.show(5)

+---+---------+---------+------+
| id|      tag|so_prefix|so_tag|
+---+---------+---------+------+
|  1|  so_java|       so|  java|
|  1|   so_jsp|       so|   jsp|
|  2|so_golang|       so|golang|
|  3| so_scala|       so| scala|
|  3|  so_akka|       so|  akka|
+---+---------+---------+------+



### Closing Spark Session

In [32]:
spark.stop()