### Imports

In [1]:
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.SparkSession


### Creating Spark Session

In [2]:
// Obtain a SparkSession: reuse the one that already exists, otherwise create a new one.
val spark = SparkSession.builder().getOrCreate()

2020-01-19 15:53:44 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@453acf51


### Create a DataFrame from reading a CSV file

In [3]:
// Load the question-tags CSV into a DataFrame.
//   header      = true -> the first line of the file supplies the column names
//   inferSchema = true -> column types are inferred from the data
// Forward slashes keep the relative path usable on Linux/macOS as well as on Windows.
val dfTags = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("../Resources/question_tags_10K.csv")

dfTags: org.apache.spark.sql.DataFrame = [Id: int, Tag: string]


### Show

In [4]:
// Print the first 5 rows of dfTags as a formatted table.
dfTags.show(5)

+---+---------------+
| Id|            Tag|
+---+---------------+
|  1|           data|
|  4|             c#|
|  4|       winforms|
|  4|type-conversion|
|  4|        decimal|
+---+---------------+
only showing top 5 rows



### Print DataFrame schema

In [5]:
// Print the column names, types and nullability of dfTags as a tree.
// printSchema only produces output (side effect), so it is called with explicit parentheses.
dfTags.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Tag: string (nullable = true)



### Query dataframe: select columns from a dataframe

In [6]:
// Project two columns using the $"name" column syntax and show the first 5 rows.
// NOTE(review): $"..." needs `spark.implicits._` in scope — presumably pre-imported by the
// notebook kernel, since no such import appears in this notebook; confirm.
dfTags.select($"id",$"tag").show(5)

+---+---------------+
| id|            tag|
+---+---------------+
|  1|           data|
|  4|             c#|
|  4|       winforms|
|  4|type-conversion|
|  4|        decimal|
+---+---------------+
only showing top 5 rows



### DataFrame Query: filter by column value of a dataframe

In [7]:
// SQL syntax: the predicate is written as a SQL expression string.
dfTags
  .filter("tag == 'php'")
  .show(5)

+---+---+
| Id|Tag|
+---+---+
| 23|php|
| 42|php|
| 85|php|
|126|php|
|146|php|
+---+---+
only showing top 5 rows



Or, equivalently, using a Column expression instead of a SQL string:

In [8]:
// Scala syntax: the predicate is built as a Column expression; === is Column equality.
dfTags.filter($"tag" === "php").show(5)

+---+---+
| Id|Tag|
+---+---+
| 23|php|
| 42|php|
| 85|php|
|126|php|
|146|php|
+---+---+
only showing top 5 rows



### DataFrame Query: count rows of a dataframe

In [9]:
// Count the rows tagged 'php' and print the total.
// The block keeps the intermediate value local to this cell.
{
  val phpTagCount = dfTags.filter("tag == 'php'").count()
  println(s"Number of php tags = ${phpTagCount}")
}

Number of php tags = 133


### DataFrame Query: SQL like query

In [10]:
// SQL LIKE pattern: keep tags that start with "s" (% matches any sequence of characters).
dfTags.filter("tag like 's%'").show(5)

+---+-------------+
| Id|          Tag|
+---+-------------+
| 25|      sockets|
| 36|          sql|
| 36|   sql-server|
| 40| structuremap|
| 48|submit-button|
+---+-------------+
only showing top 5 rows



### DataFrame Query: Multiple filter chaining

In [11]:
// Chained filters are applied one after another, so a row must satisfy both predicates:
// the tag starts with "s" AND the id is 25 or 108.
dfTags
  .filter("tag like 's%'")
  .filter("id == 25 or id == 108")
  .show(5)

+---+-------+
| Id|    Tag|
+---+-------+
| 25|sockets|
|108|    svn|
+---+-------+



### DataFrame Query: SQL IN clause

In [12]:
// SQL IN clause: keep rows whose id is one of the listed values.
dfTags.filter("id in (25,108)").show(5)

+---+---------+
| Id|      Tag|
+---+---------+
| 25|      c++|
| 25|        c|
| 25|  sockets|
| 25|mainframe|
| 25|      zos|
+---+---------+
only showing top 5 rows



### DataFrame Query: SQL Group By

In [14]:
// Group rows by tag and count the rows in each group; the result has columns (tag, count).
dfTags
  .groupBy("tag")
  .count()
  .show(5)

+-----------+-----+
|        tag|count|
+-----------+-----+
|type-safety|    4|
|    jbutton|    1|
|     iframe|    2|
|  svn-hooks|    2|
|  standards|    7|
+-----------+-----+
only showing top 5 rows



### DataFrame Query: SQL Group By with filter

In [17]:
// Count rows per tag, then keep only the tags that occur more than 5 times.
// The filter refers to the "count" column produced by count().
dfTags
  .groupBy("tag")
  .count()
  .filter("count > 5")
  .show(5)

+-------------+-----+
|          tag|count|
+-------------+-----+
|    standards|    7|
|     keyboard|    8|
|          rss|   12|
|documentation|   15|
|      session|    6|
+-------------+-----+
only showing top 5 rows



### DataFrame Query: SQL order by (Ascending)

In [18]:
// Count rows per tag, keep tags occurring more than 5 times, and sort by tag name.
// orderBy with a plain column name sorts in ascending order.
dfTags
  .groupBy("tag")
  .count()
  .filter("count > 5")
  .orderBy("tag")
  .show(5)

+--------------+-----+
|           tag|count|
+--------------+-----+
|          .net|  351|
|      .net-2.0|   14|
|      .net-3.5|   30|
|         64bit|    7|
|actionscript-3|   22|
+--------------+-----+
only showing top 5 rows



### DataFrame Query: SQL order by (Descending)

In [19]:
import org.apache.spark.sql.functions

import org.apache.spark.sql.functions


In [22]:
// Count rows per tag, keep tags occurring more than 5 times, and sort by tag name descending.
// functions.desc("tag") resolves "tag" against the aggregated result itself rather than
// reaching back to a column reference taken from the original dfTags DataFrame.
dfTags
  .groupBy("tag")
  .count()
  .filter("count > 5")
  .orderBy(functions.desc("tag"))
  .show(5)

+-----+-----+
|  tag|count|
+-----+-----+
|  xss|    6|
| xslt|    7|
|  xml|   52|
|xhtml|    8|
|  wpf|   23|
+-----+-----+
only showing top 5 rows



### DataFrame Query: Cast columns to specific data type (first, load the questions CSV)

In [24]:
// Load the questions CSV, taking column names from the header row and inferring column types.
// Forward slashes keep the relative path usable on Linux/macOS as well as on Windows.
// NOTE(review): "dateFormat" applies to date-typed columns, while timestamp parsing is governed
// by "timestampFormat" — confirm this option has the intended effect on CreationDate.
val dfQuestionsCSV = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("dateFormat", "yyyy-MM-dd HH:mm:ss")
  .csv("../Resources/questions_10K.csv")

dfQuestionsCSV: org.apache.spark.sql.DataFrame = [Id: int, CreationDate: timestamp ... 5 more fields]


In [25]:
// Preview the first 3 rows of the raw questions data.
dfQuestionsCSV.show(3)

+---+-------------------+----------+--------------------+-----+-----------+-----------+
| Id|       CreationDate|ClosedDate|        DeletionDate|Score|OwnerUserId|AnswerCount|
+---+-------------------+----------+--------------------+-----+-----------+-----------+
|  1|2008-08-01 02:56:37|        NA|2011-03-28T00:53:47Z|    1|         NA|          0|
|  4|2008-08-01 03:12:52|        NA|                  NA|  472|          8|         13|
|  6|2008-08-01 03:38:08|        NA|                  NA|  210|          9|          5|
+---+-------------------+----------+--------------------+-----+-----------+-----------+
only showing top 3 rows



### Create a DataFrame with explicit column names

In [26]:
// Re-load the questions CSV and rename all seven columns positionally with toDF(...),
// replacing the header names with snake_case ones.
// Forward slashes keep the relative path usable on Linux/macOS as well as on Windows.
// NOTE(review): "dateFormat" applies to date-typed columns, while timestamp parsing is governed
// by "timestampFormat" — confirm this option has the intended effect on the creation date.
val dfQuestionsCSV = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("dateFormat", "yyyy-MM-dd HH:mm:ss")
  .csv("../Resources/questions_10K.csv")
  .toDF("id", "creation_date", "closed_date", "deletion_date", "score", "owner_userid", "answer_count")

dfQuestionsCSV: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [27]:
// Preview the first 3 rows to confirm the renamed columns.
dfQuestionsCSV.show(3)

+---+-------------------+-----------+--------------------+-----+------------+------------+
| id|      creation_date|closed_date|       deletion_date|score|owner_userid|answer_count|
+---+-------------------+-----------+--------------------+-----+------------+------------+
|  1|2008-08-01 02:56:37|         NA|2011-03-28T00:53:47Z|    1|          NA|           0|
|  4|2008-08-01 03:12:52|         NA|                  NA|  472|           8|          13|
|  6|2008-08-01 03:38:08|         NA|                  NA|  210|           9|           5|
+---+-------------------+-----------+--------------------+-----+------------+------------+
only showing top 3 rows



### Checking Existing Schema

In [28]:
// Print the inferred schema before any casting is applied.
// printSchema only produces output (side effect), so it is called with explicit parentheses.
dfQuestionsCSV.printSchema()

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: string (nullable = true)
 |-- deletion_date: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: string (nullable = true)
 |-- answer_count: string (nullable = true)



### Casting

In [29]:
// Build dfQuestions by casting every column of dfQuestionsCSV to an explicit type.
// Each entry pairs a column name with the type name it is cast to; the order of the
// entries is the column order of the result.
val dfQuestions = {
  val targetTypes = Seq(
    "id"            -> "integer",
    "creation_date" -> "timestamp",
    "closed_date"   -> "timestamp",
    "deletion_date" -> "date",
    "score"         -> "integer",
    "owner_userid"  -> "integer",
    "answer_count"  -> "integer"
  )
  val castColumns = targetTypes.map { case (columnName, typeName) =>
    dfQuestionsCSV.col(columnName).cast(typeName)
  }
  dfQuestionsCSV.select(castColumns: _*)
}

dfQuestions: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [30]:
// Preview the first 3 rows after casting.
dfQuestions.show(3)

+---+-------------------+-----------+-------------+-----+------------+------------+
| id|      creation_date|closed_date|deletion_date|score|owner_userid|answer_count|
+---+-------------------+-----------+-------------+-----+------------+------------+
|  1|2008-08-01 02:56:37|       null|   2011-03-28|    1|        null|           0|
|  4|2008-08-01 03:12:52|       null|         null|  472|           8|          13|
|  6|2008-08-01 03:38:08|       null|         null|  210|           9|           5|
+---+-------------------+-----------+-------------+-----+------------+------------+
only showing top 3 rows



### Checking new schema

In [31]:
// Print the schema after casting to verify the new column types.
// printSchema only produces output (side effect), so it is called with explicit parentheses.
dfQuestions.printSchema()

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: timestamp (nullable = true)
 |-- deletion_date: date (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: integer (nullable = true)
 |-- answer_count: integer (nullable = true)



### DataFrame Query: Operate on a sliced dataframe

In [34]:
// Keep only the questions whose score lies strictly between 400 and 410.
// filter on a DataFrame already yields a DataFrame, so no toDF() conversion is needed.
val dfQuestionsSubset = dfQuestions.filter("score > 400 and score < 410")

dfQuestionsSubset: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [35]:
// Preview the first 3 rows of the filtered subset.
dfQuestionsSubset.show(3)

+----+-------------------+-------------------+-------------+-----+------------+------------+
|  id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count|
+----+-------------------+-------------------+-------------+-----+------------+------------+
| 888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|
|1939|2008-08-05 11:09:36|2012-06-05 18:43:38|   2012-12-18|  408|        null|          48|
|3881|2008-08-07 00:56:30|2016-09-23 19:04:31|         null|  402|         122|          27|
+----+-------------------+-------------------+-------------+-----+------------+------------+
only showing top 3 rows



### DataFrame Query: Join

In [36]:
// Join the subset with dfTags on the shared "id" column name and show the first 5 rows.
dfQuestionsSubset.join(dfTags, "id").show(5)

+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
| id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count|      Tag|
+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|   xdebug|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30| phpstorm|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|debugging|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|  eclipse|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|      php|
+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
only showing top 5 rows



### DataFrame Query: Join and select columns

In [37]:
// Join on "id", then keep only the four columns of interest from the joined result.
dfQuestionsSubset
  .join(dfTags, "id")
  .select("owner_userid", "tag", "creation_date", "score")
  .show(5)

+------------+---------+-------------------+-----+
|owner_userid|      tag|      creation_date|score|
+------------+---------+-------------------+-----+
|         131|      php|2008-08-04 04:48:21|  405|
|         131|  eclipse|2008-08-04 04:48:21|  405|
|         131|debugging|2008-08-04 04:48:21|  405|
|         131| phpstorm|2008-08-04 04:48:21|  405|
|         131|   xdebug|2008-08-04 04:48:21|  405|
+------------+---------+-------------------+-----+
only showing top 5 rows



### DataFrame Query: Join on explicit columns

In [38]:
// Join using an explicit equality condition between the two id columns,
// instead of naming a shared column.
dfQuestionsSubset
  .join(dfTags, dfTags("id") === dfQuestionsSubset("id"))
  .show(5)

+---+-------------------+-------------------+-------------+-----+------------+------------+---+---------+
| id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count| Id|      Tag|
+---+-------------------+-------------------+-------------+-----+------------+------------+---+---------+
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|888|   xdebug|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|888| phpstorm|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|888|debugging|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|888|  eclipse|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|888|      php|
+---+-------------------+-------------------+-------------+-----+------------+------------+---+---------+
only showing top 5 rows



### DataFrame Query: Inner Join

In [39]:
// Inner join on "id", with the join type spelled out explicitly.
dfQuestionsSubset.join(dfTags, Seq("id"), "inner").show(5)

+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
| id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count|      Tag|
+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|   xdebug|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30| phpstorm|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|debugging|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|  eclipse|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|      php|
+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
only showing top 5 rows



### DataFrame Query: Left Outer Join

In [40]:
// Left outer join on "id": every row of dfQuestionsSubset is kept, matched or not.
dfQuestionsSubset.join(dfTags, Seq("id"), "left_outer").show(5)

+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
| id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count|      Tag|
+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|   xdebug|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30| phpstorm|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|debugging|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|  eclipse|
|888|2008-08-04 04:48:21|2016-08-04 14:52:00|         null|  405|         131|          30|      php|
+---+-------------------+-------------------+-------------+-----+------------+------------+---------+
only showing top 5 rows



### DataFrame Query: Right Outer Join

In [41]:
// Right outer join on "id": every row of dfTags is kept; the question columns are null
// for tags whose id has no match in dfQuestionsSubset.
dfQuestionsSubset.join(dfTags, Seq("id"), "right_outer").show(5)

+---+-------------+-----------+-------------+-----+------------+------------+---------------+
| Id|creation_date|closed_date|deletion_date|score|owner_userid|answer_count|            Tag|
+---+-------------+-----------+-------------+-----+------------+------------+---------------+
|  1|         null|       null|         null| null|        null|        null|           data|
|  4|         null|       null|         null| null|        null|        null|             c#|
|  4|         null|       null|         null| null|        null|        null|       winforms|
|  4|         null|       null|         null| null|        null|        null|type-conversion|
|  4|         null|       null|         null| null|        null|        null|        decimal|
+---+-------------+-----------+-------------+-----+------------+------------+---------------+
only showing top 5 rows



### DataFrame Query: Distinct

In [42]:
// Project the tag column and drop duplicate values, then show the first 5 rows.
dfTags
  .select("tag")
  .distinct()
  .show(5)

+-----------+
|        tag|
+-----------+
|type-safety|
|    jbutton|
|     iframe|
|  svn-hooks|
|  standards|
+-----------+
only showing top 5 rows



### Closing Spark Session

In [43]:
// Stop the Spark session now that the notebook is finished with it.
spark.stop()