// http://allaboutscala.com/big-data/spark/

# DataFrame introduction

## Create a DataFrame from reading a CSV file

In [1]:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

Intitializing Scala interpreter ...

Spark Web UI available at http://172.16.8.92:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1587535826775)
SparkSession available as 'spark'


import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}


In [9]:
val path_data = "/Users/soda/Documents/GitHub/joutatsu-no-kotsu/resources/question_tags_10K.csv"
val dfTags = spark
    .read
    .option("header","true")
    .option("inferSchema","true")
    .csv(path_data)
    .toDF("id","tag")

path_data: String = /Users/soda/Documents/GitHub/joutatsu-no-kotsu/resources/question_tags_10K.csv
dfTags: org.apache.spark.sql.DataFrame = [id: int, tag: string]


In [10]:
dfTags.show(10)

+---+-------------------+
| id|                tag|
+---+-------------------+
|  1|               data|
|  4|                 c#|
|  4|           winforms|
|  4|    type-conversion|
|  4|            decimal|
|  4|            opacity|
|  6|               html|
|  6|                css|
|  6|               css3|
|  6|internet-explorer-7|
+---+-------------------+
only showing top 10 rows



In [11]:
dfTags.printSchema()

root
 |-- id: integer (nullable = true)
 |-- tag: string (nullable = true)



In [13]:
// DataFrame Query: select columns from a dataframe
dfTags.select("tag").show(10)

+-------------------+
|                tag|
+-------------------+
|               data|
|                 c#|
|           winforms|
|    type-conversion|
|            decimal|
|            opacity|
|               html|
|                css|
|               css3|
|internet-explorer-7|
+-------------------+
only showing top 10 rows



In [15]:
// DataFrame Query: filter by column value of a dataframe
dfTags.filter("tag=='php'").show(5)

+---+---+
| id|tag|
+---+---+
| 23|php|
| 42|php|
| 85|php|
|126|php|
|146|php|
+---+---+
only showing top 5 rows



In [16]:
// DataFrame Query: count rows of a dataframe
println(s"Number of php tags = ${dfTags.filter("tag == 'php'").count()}")

Number of php tags = 133


In [17]:
// DataFrame Query: SQL like query
dfTags.filter("tag like 's%'").show(5)

+---+-------------+
| id|          tag|
+---+-------------+
| 25|      sockets|
| 36|          sql|
| 36|   sql-server|
| 40| structuremap|
| 48|submit-button|
+---+-------------+
only showing top 5 rows



In [18]:
// DataFrame Query: Multiple filter chaining
dfTags
    .filter("tag like 's%'")
    .filter("id == 25 or id == 108")
    .show(5)

+---+-------+
| id|    tag|
+---+-------+
| 25|sockets|
|108|    svn|
+---+-------+



In [19]:
// DataFrame Query: SQL IN clause
dfTags.filter("id in (25,108)").show(5)

+---+---------+
| id|      tag|
+---+---------+
| 25|      c++|
| 25|        c|
| 25|  sockets|
| 25|mainframe|
| 25|      zos|
+---+---------+
only showing top 5 rows



In [20]:
// DataFrame Query: SQL Group By
println("Group by tag value")
dfTags.groupBy("tag").count().show(10)

Group by tag value
+--------------------+-----+
|                 tag|count|
+--------------------+-----+
|         type-safety|    4|
|             jbutton|    1|
|              iframe|    2|
|           svn-hooks|    2|
|           standards|    7|
|knowledge-management|    2|
|            trayicon|    1|
|           arguments|    1|
|                 zfs|    1|
|              import|    3|
+--------------------+-----+
only showing top 10 rows



In [21]:
// DataFrame Query: SQL Group By with filter
dfTags.groupBy("tag").count().filter("count > 5").show(5)

+-------------+-----+
|          tag|count|
+-------------+-----+
|    standards|    7|
|     keyboard|    8|
|          rss|   12|
|documentation|   15|
|      session|    6|
+-------------+-----+
only showing top 5 rows



In [22]:
// DataFrame Query: SQL order by
dfTags.groupBy("tag").count().filter("count >5")
    .orderBy("tag").show(2)

+--------+-----+
|     tag|count|
+--------+-----+
|    .net|  351|
|.net-2.0|   14|
+--------+-----+
only showing top 2 rows



In [23]:
// DataFrame Query: Cast columns to specific data type
val dfQuestionCSV = spark
    .read
    .option("header","true")
    .option("inferSchema","true")
    .option("dateFormat","yyyy-MM-dd HH:mm:ss")
    .csv("/Users/soda/Documents/GitHub/joutatsu-no-kotsu/resources/questions_10K.csv")
    .toDF("id","creation_date","closed_date", "deletion_date", 
          "score", "owner_userid", "answer_count")

dfQuestionCSV: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [24]:
dfQuestionCSV.printSchema

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: string (nullable = true)
 |-- deletion_date: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: string (nullable = true)
 |-- answer_count: string (nullable = true)



In [25]:
dfQuestionCSV.show(2)

+---+-------------------+-----------+--------------------+-----+------------+------------+
| id|      creation_date|closed_date|       deletion_date|score|owner_userid|answer_count|
+---+-------------------+-----------+--------------------+-----+------------+------------+
|  1|2008-08-01 05:26:37|         NA|2011-03-28T00:53:47Z|    1|          NA|           0|
|  4|2008-08-01 05:42:52|         NA|                  NA|  472|           8|          13|
+---+-------------------+-----------+--------------------+-----+------------+------------+
only showing top 2 rows



In [26]:
val dfQuestions = dfQuestionCSV.select(
    dfQuestionCSV.col("id").cast("integer"),
    dfQuestionCSV.col("creation_date").cast("timestamp"),
    dfQuestionCSV.col("closed_date").cast("timestamp"),
    dfQuestionCSV.col("deletion_date").cast("date"),
    dfQuestionCSV.col("score").cast("integer"),
    dfQuestionCSV.col("owner_userid").cast("integer"),
    dfQuestionCSV.col("answer_count").cast("integer")
)

dfQuestions: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [28]:
dfQuestions.printSchema

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: timestamp (nullable = true)
 |-- deletion_date: date (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: integer (nullable = true)
 |-- answer_count: integer (nullable = true)



In [29]:
dfQuestions.show(3)

+---+-------------------+-----------+-------------+-----+------------+------------+
| id|      creation_date|closed_date|deletion_date|score|owner_userid|answer_count|
+---+-------------------+-----------+-------------+-----+------------+------------+
|  1|2008-08-01 05:26:37|       null|   2011-03-28|    1|        null|           0|
|  4|2008-08-01 05:42:52|       null|         null|  472|           8|          13|
|  6|2008-08-01 06:08:08|       null|         null|  210|           9|           5|
+---+-------------------+-----------+-------------+-----+------------+------------+
only showing top 3 rows



In [30]:
// DataFrame Query: Operate on a sliced dataframe
val dfQuestionsSubset = dfQuestions.filter("score > 400 and score < 410").toDF()
dfQuestionsSubset.show(2)

+----+-------------------+-------------------+-------------+-----+------------+------------+
|  id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count|
+----+-------------------+-------------------+-------------+-----+------------+------------+
| 888|2008-08-04 07:18:21|2016-08-04 17:22:00|         null|  405|         131|          30|
|1939|2008-08-05 13:39:36|2012-06-05 21:13:38|   2012-12-18|  408|        null|          48|
+----+-------------------+-------------------+-------------+-----+------------+------------+
only showing top 2 rows



dfQuestionsSubset: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [32]:
// DataFrame Query: Join
dfQuestionsSubset.join(dfTags,"id")
    .select("owner_userid","tag","creation_date","score")
    .show(5)

+------------+---------+-------------------+-----+
|owner_userid|      tag|      creation_date|score|
+------------+---------+-------------------+-----+
|         131|      php|2008-08-04 07:18:21|  405|
|         131|  eclipse|2008-08-04 07:18:21|  405|
|         131|debugging|2008-08-04 07:18:21|  405|
|         131| phpstorm|2008-08-04 07:18:21|  405|
|         131|   xdebug|2008-08-04 07:18:21|  405|
+------------+---------+-------------------+-----+
only showing top 5 rows



In [34]:
// DataFrame Query: Join on explicit columns
dfQuestionsSubset
    .join(dfTags,dfTags("id")===dfQuestionsSubset("id"))
    .select("owner_userid","tag","creation_date","score")
    .show(3)

+------------+---------+-------------------+-----+
|owner_userid|      tag|      creation_date|score|
+------------+---------+-------------------+-----+
|         131|      php|2008-08-04 07:18:21|  405|
|         131|  eclipse|2008-08-04 07:18:21|  405|
|         131|debugging|2008-08-04 07:18:21|  405|
+------------+---------+-------------------+-----+
only showing top 3 rows



In [35]:
// DataFrame Query: Inner Join
dfQuestionsSubset
    .join(dfTags,Seq("id"),"inner")
    .select("owner_userid","tag","creation_date","score")
    .show(5)

+------------+---------+-------------------+-----+
|owner_userid|      tag|      creation_date|score|
+------------+---------+-------------------+-----+
|         131|      php|2008-08-04 07:18:21|  405|
|         131|  eclipse|2008-08-04 07:18:21|  405|
|         131|debugging|2008-08-04 07:18:21|  405|
|         131| phpstorm|2008-08-04 07:18:21|  405|
|         131|   xdebug|2008-08-04 07:18:21|  405|
+------------+---------+-------------------+-----+
only showing top 5 rows



In [37]:
// DataFrame Query: Left Outer Join
dfQuestionsSubset
    .join(dfTags, Seq("id"), "left_outer")
    .select("owner_userid","tag","creation_date","score")
    .show(5)

+------------+---------+-------------------+-----+
|owner_userid|      tag|      creation_date|score|
+------------+---------+-------------------+-----+
|         131|   xdebug|2008-08-04 07:18:21|  405|
|         131| phpstorm|2008-08-04 07:18:21|  405|
|         131|debugging|2008-08-04 07:18:21|  405|
|         131|  eclipse|2008-08-04 07:18:21|  405|
|         131|      php|2008-08-04 07:18:21|  405|
+------------+---------+-------------------+-----+
only showing top 5 rows



In [38]:
// DataFrame Query: Distinct
dfTags.select("tag").distinct().show(5)

+-----------+
|        tag|
+-----------+
|type-safety|
|    jbutton|
|     iframe|
|  svn-hooks|
|  standards|
+-----------+
only showing top 5 rows



In [39]:
// Register temp table from dataframe
dfTags.createOrReplaceTempView("so_tags")

In [2]:
// trait Context {

//   lazy val sparkConf = new SparkConf()
//     .setAppName("Learn Spark")
//     .setMaster("local[*]")
//     .set("spark.cores.max", "2")

//   lazy val sparkSession = SparkSession
//     .builder()
//     .config(sparkConf)
//     .getOrCreate()
// }

lazy val sparkSession = SparkSession
    .builder()
    .getOrCreate()

sparkSession: org.apache.spark.sql.SparkSession = <lazy>


In [45]:
// List all tables in Spark's catalog
sparkSession.catalog.listTables().show()

+-------+--------+-----------+---------+-----------+
|   name|database|description|tableType|isTemporary|
+-------+--------+-----------+---------+-----------+
|so_tags|    null|       null|TEMPORARY|       true|
+-------+--------+-----------+---------+-----------+



In [47]:
// List all tables in Spark's catalog using Spark SQL
sparkSession.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |  so_tags|       true|
+--------+---------+-----------+



In [48]:
// Select columns
sparkSession
    .sql("select id,tag from so_tags limit 10")
    .show()

+---+-------------------+
| id|                tag|
+---+-------------------+
|  1|               data|
|  4|                 c#|
|  4|           winforms|
|  4|    type-conversion|
|  4|            decimal|
|  4|            opacity|
|  6|               html|
|  6|                css|
|  6|               css3|
|  6|internet-explorer-7|
+---+-------------------+



In [49]:
// Filter by column value
sparkSession.sql("select * from so_tags where tag = 'php'")
    .show(5)

+---+---+
| id|tag|
+---+---+
| 23|php|
| 42|php|
| 85|php|
|126|php|
|146|php|
+---+---+
only showing top 5 rows



In [50]:
// Count number of rows
sparkSession.sql("""
    select count(*) as php_count from so_tags
        where tag = 'php'
                 """.stripMargin).show(10)

+---------+
|php_count|
+---------+
|      133|
+---------+



In [51]:
// SQL like
sparkSession.sql(
    """
    select * from so_tags
    where tag like 's%'
    """.stripMargin
    ).show(10)

+---+-------------+
| id|          tag|
+---+-------------+
| 25|      sockets|
| 36|          sql|
| 36|   sql-server|
| 40| structuremap|
| 48|submit-button|
| 79|          svn|
| 79|    subclipse|
| 85|          sql|
| 90|          svn|
|108|          svn|
+---+-------------+
only showing top 10 rows



In [52]:
// SQL where with and clause
sparkSession.sql(
    """
    select * from so_tags
    where tag like 's%'
    and (id = 25 or id = 108)
    """.stripMargin
    ).show(3)

+---+-------+
| id|    tag|
+---+-------+
| 25|sockets|
|108|    svn|
+---+-------+



In [53]:
// SQL IN clause
sparkSession.sql(
    """
    select * from so_tags
    where id in (25,108)
    """.stripMargin
    ).show(10)

+---+---------+
| id|      tag|
+---+---------+
| 25|      c++|
| 25|        c|
| 25|  sockets|
| 25|mainframe|
| 25|      zos|
|108|  windows|
|108|      svn|
|108|    64bit|
+---+---------+



In [55]:
// SQL Group By
sparkSession.sql(
    """
    select tag,count(*) as count
        from so_tags
        group by tag
    """.stripMargin
    ).show(5)

+-----------+-----+
|        tag|count|
+-----------+-----+
|type-safety|    4|
|    jbutton|    1|
|     iframe|    2|
|  svn-hooks|    2|
|  standards|    7|
+-----------+-----+
only showing top 5 rows



In [56]:
// SQL Group By with having clause
sparkSession.sql(
    """
    select tag,count(*) as count
    from so_tags 
    group by tag
    having count > 5
    """.stripMargin
    ).show(5)

+-------------+-----+
|          tag|count|
+-------------+-----+
|    standards|    7|
|     keyboard|    8|
|          rss|   12|
|documentation|   15|
|      session|    6|
+-------------+-----+
only showing top 5 rows



In [58]:
// SQL Order by
sparkSession.sql(
    """
    select tag, count(*) as count
    from so_tags
    group by tag
    having count >5
    order by count desc
    """.stripMargin
    ).show(5)


+-------------+-----+
|          tag|count|
+-------------+-----+
|stackoverflow|  382|
|           c#|  375|
|         .net|  351|
|      asp.net|  185|
|   sql-server|  167|
+-------------+-----+
only showing top 5 rows



In [62]:
// Typed columns, filter and create temp table
dfQuestionsSubset.createOrReplaceTempView("so_questions")

In [63]:
// SQL Inner Join
// SQL Left Outer Join
// SQL Right Outer Join
sparkSession.sql(
    """
    select t.*,q.*
    from so_questions as q
    inner join so_tags t
    on t.id = q.id
    """.stripMargin
    ).show(5)

+---+---------+---+-------------------+-------------------+-------------+-----+------------+------------+
| id|      tag| id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count|
+---+---------+---+-------------------+-------------------+-------------+-----+------------+------------+
|888|   xdebug|888|2008-08-04 07:18:21|2016-08-04 17:22:00|         null|  405|         131|          30|
|888| phpstorm|888|2008-08-04 07:18:21|2016-08-04 17:22:00|         null|  405|         131|          30|
|888|debugging|888|2008-08-04 07:18:21|2016-08-04 17:22:00|         null|  405|         131|          30|
|888|  eclipse|888|2008-08-04 07:18:21|2016-08-04 17:22:00|         null|  405|         131|          30|
|888|      php|888|2008-08-04 07:18:21|2016-08-04 17:22:00|         null|  405|         131|          30|
+---+---------+---+-------------------+-------------------+-------------+-----+------------+------------+
only showing top 5 rows



In [64]:
// SQL Distinct
sparkSession
    .sql(
        """
        select distinct tag from so_tags
        """.stripMargin
    ).show(10)

+--------------------+
|                 tag|
+--------------------+
|         type-safety|
|             jbutton|
|              iframe|
|           svn-hooks|
|           standards|
|knowledge-management|
|            trayicon|
|           arguments|
|                 zfs|
|              import|
+--------------------+
only showing top 10 rows



In [65]:
// **Register User Defined Function (UDF)**
def prefixStackoverflow(s:String):String = s"so_$s"

sparkSession.udf
    .register("prefix_so",prefixStackoverflow _)

prefixStackoverflow: (s: String)String
res51: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,StringType,Some(List(StringType)))


In [66]:
sparkSession.sql(
    """
    select id, prefix_so(tag) from so_tags
    """.stripMargin
    ).show(5)

+---+------------------+
| id|UDF:prefix_so(tag)|
+---+------------------+
|  1|           so_data|
|  4|             so_c#|
|  4|       so_winforms|
|  4|so_type-conversion|
|  4|        so_decimal|
+---+------------------+
only showing top 5 rows



In [67]:
// DataFrame Statistics Introduction
dfQuestions.show(4)

+---+-------------------+-------------------+-------------+-----+------------+------------+
| id|      creation_date|        closed_date|deletion_date|score|owner_userid|answer_count|
+---+-------------------+-------------------+-------------+-----+------------+------------+
|  1|2008-08-01 05:26:37|               null|   2011-03-28|    1|        null|           0|
|  4|2008-08-01 05:42:52|               null|         null|  472|           8|          13|
|  6|2008-08-01 06:08:08|               null|         null|  210|           9|           5|
|  8|2008-08-01 07:33:19|2013-06-03 12:00:25|   2015-02-11|   42|        null|           8|
+---+-------------------+-------------------+-------------+-----+------------+------------+
only showing top 4 rows



In [68]:
//Average
import org.apache.spark.sql.functions._
dfQuestions.select(avg("score")).show()

+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



import org.apache.spark.sql.functions._


In [69]:
//Max
dfQuestions.select(max("score")).show()

+----------+
|max(score)|
+----------+
|      4443|
+----------+



In [70]:
//Minimum
dfQuestions.select(min("score")).show()

+----------+
|min(score)|
+----------+
|       -27|
+----------+



In [71]:
//Mean
dfQuestions.select(mean("score")).show()

+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



In [72]:
//sum
dfQuestions.select(sum("score")).show()

+----------+
|sum(score)|
+----------+
|    361427|
+----------+



In [73]:
// Group by with statistics
dfQuestions.filter("id > 400 and id < 450")
    .filter("owner_userid is not null")
    .join(dfTags,dfQuestions.col("id").equalTo(dfTags.col("id")))
    .groupBy(dfQuestions.col("owner_userid"))
    .agg(avg("score"),max("answer_count"))
    .show()

+------------+----------+-----------------+
|owner_userid|avg(score)|max(answer_count)|
+------------+----------+-----------------+
|         268|      26.0|                1|
|         136|      57.6|                9|
|         123|      20.0|                3|
+------------+----------+-----------------+



In [74]:
// DataFrame Statistics using describe() method
val dfSummary = dfQuestions.describe()
dfSummary.show()

+-------+-----------------+------------------+-----------------+------------------+
|summary|               id|             score|     owner_userid|      answer_count|
+-------+-----------------+------------------+-----------------+------------------+
|  count|             9999|              9999|             7388|              9922|
|   mean|33929.17081708171| 36.14631463146315|47389.99472116947|6.6232614392259626|
| stddev|19110.09560532429|160.48316753972045|280943.1070344427| 9.069109116851138|
|    min|                1|               -27|                1|                -5|
|    max|            66037|              4443|          3431280|               316|
+-------+-----------------+------------------+-----------------+------------------+



dfSummary: org.apache.spark.sql.DataFrame = [summary: string, id: string ... 3 more fields]


In [75]:
// Correlation
val correlation = dfQuestions.stat.corr("score","answer_count")
println(s"correlation between column score and answer_count = $correlation")

correlation between column score and answer_count = 0.3699847903294707


correlation: Double = 0.3699847903294707


In [76]:
// Covariance
val covariance = dfQuestions.stat.cov("score","answer_count")
println(s"covariance between column score and answer_count = $covariance")

covariance between column score and answer_count = 537.513381444165


covariance: Double = 537.513381444165


In [77]:
// Frequent Items
val dfFrequentScore = dfQuestions.stat.freqItems(Seq("answer_count"))
dfFrequentScore.show()

+----------------------+
|answer_count_freqItems|
+----------------------+
|  [23, 131, 77, 86,...|
+----------------------+



dfFrequentScore: org.apache.spark.sql.DataFrame = [answer_count_freqItems: array<int>]


In [78]:
//Crosstab
val dfScoreByUserid = dfQuestions
    .filter("owner_userid > 0 and owner_userid < 20")
    .stat
    .crosstab("score","owner_userid")
dfScoreByUserid.show(5)

+------------------+---+---+---+---+---+---+---+---+---+---+
|score_owner_userid|  1| 11| 13| 17|  2|  3|  4|  5|  8|  9|
+------------------+---+---+---+---+---+---+---+---+---+---+
|                56|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|
|               472|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|                14|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|
|                20|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|               179|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|
+------------------+---+---+---+---+---+---+---+---+---+---+
only showing top 5 rows



dfScoreByUserid: org.apache.spark.sql.DataFrame = [score_owner_userid: string, 1: bigint ... 9 more fields]


In [80]:
// Stratified sampling using sampleBy
val dfQuestionsByAnswerCount = dfQuestions
    .filter("owner_userid > 0")
    .filter("answer_count in (5,10,20)")

dfQuestionsByAnswerCount.groupBy("answer_count")
    .count()
    .show()

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  811|
|          10|  272|
+------------+-----+



dfQuestionsByAnswerCount: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, creation_date: timestamp ... 5 more fields]


In [81]:
// Create a fraction map where we are only interested:
// - 50% of the rows that have answer_count = 5
// - 10% of the rows that have answer_count = 10
// - 100% of the rows that have answer_count = 20
// Note also that fractions should be in the range [0, 1]

val fractionKeyMap = Map(5->0.5, 10->0.1,20->1.0)

dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count",fractionKeyMap,7L)
    .groupBy("answer_count")
    .count().show()

//randomseed == 7L

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  400|
|          10|   26|
+------------+-----+



fractionKeyMap: scala.collection.immutable.Map[Int,Double] = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)


In [83]:
// Approximate Quantile
val quantiles = dfQuestions
    .stat
    .approxQuantile("score",Array(0,0.5,1),0.25)
println(s"Quantitles segments = ${quantiles.toSeq}")

Quantitles segments = WrappedArray(-27.0, 2.0, 4443.0)


quantiles: Array[Double] = Array(-27.0, 2.0, 4443.0)


In [84]:
// Bloom Filter
val tagsBloomFilter = dfTags.stat.bloomFilter("tag",1000L,0.1)

tagsBloomFilter: org.apache.spark.util.sketch.BloomFilter = org.apache.spark.util.sketch.BloomFilterImpl@809c4023


In [85]:
println(s"bloom filter contains java tag = ${tagsBloomFilter.mightContain("java")}")

bloom filter contains java tag = true


In [86]:
println(s"bloom filter contains some unknown tag = ${tagsBloomFilter.mightContain("unknown tag")}")

bloom filter contains some unknown tag = false


In [87]:
// Count Min Sketch
    // first parameter = the tag column of dataframe dfTags
    // second parameter = 10% precision error factor
    // third parameter = 90% confidence level
    // fourth parameter = 37 as a random seed

val cmsTag = dfTags.stat.countMinSketch("tag", 0.1, 0.9, 37)
val estimatedFrequency = cmsTag.estimateCount("java")
println(s"Estimated frequency for tag java = $estimatedFrequency")

Estimated frequency for tag java = 513


cmsTag: org.apache.spark.util.sketch.CountMinSketch = org.apache.spark.util.sketch.CountMinSketchImpl@431a88ed
estimatedFrequency: Long = 513


In [88]:
// Sampling With Replacement
    // with replacement = true
    // number of rows to sample = 20%
    // a random seed = 37L
val dfTagsSample = dfTags.sample(true,0.2,37L)
println(s"Number of rows in sample dfTagsSample = ${dfTagsSample.count()}")
println(s"Number of rows in dfTags = ${dfTags.count()}")

Number of rows in sample dfTagsSample = 1948
Number of rows in dfTags = 9999


dfTagsSample: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, tag: string]


`DataFrame Operations Introduction`

In [91]:
val dfQuestions = dfQuestionCSV
    .filter("score > 400 and score < 410")
    .join(dfTags,"id")
    .select("owner_userid","tag","creation_date","score")
    .toDF()

dfQuestions: org.apache.spark.sql.DataFrame = [owner_userid: string, tag: string ... 2 more fields]


In [None]:
// dfTags.show(2);dfQuestionCSV.show(2);dfQuestions.show(2)

In [92]:
// Convert DataFrame row to Scala case class
case class Tag(id: Int,tag: String)

defined class Tag


In [93]:
dfTags.show(2)

+---+----+
| id| tag|
+---+----+
|  1|data|
|  4|  c#|
+---+----+
only showing top 2 rows



In [100]:
import spark.implicits._
import org.apache.spark.sql.Dataset
val dfTagsOfTag: Dataset[Tag] = dfTags.as[Tag]

import spark.implicits._
import org.apache.spark.sql.Dataset
dfTagsOfTag: org.apache.spark.sql.Dataset[Tag] = [id: int, tag: string]


In [101]:
dfTagsOfTag
    .take(10)
    .foreach(t => println(s"id = ${t.id}, tag = ${t.tag}"))

id = 1, tag = data
id = 4, tag = c#
id = 4, tag = winforms
id = 4, tag = type-conversion
id = 4, tag = decimal
id = 4, tag = opacity
id = 6, tag = html
id = 6, tag = css
id = 6, tag = css3
id = 6, tag = internet-explorer-7


In [102]:
// DataFrame row to Scala case class using map()
case class Question(owner_userid: Int, tag: String,
                   creationDate: java.sql.Timestamp, score: Int)

defined class Question


In [103]:
dfQuestions.show(2)

+------------+--------+-------------------+-----+
|owner_userid|     tag|      creation_date|score|
+------------+--------+-------------------+-----+
|         131|  xdebug|2008-08-04 07:18:21|  405|
|         131|phpstorm|2008-08-04 07:18:21|  405|
+------------+--------+-------------------+-----+
only showing top 2 rows



In [105]:
def toQuestion(row: org.apache.spark.sql.Row): Question = {
// to normalize our owner_userid data
    val IntOf: String => Option[Int] = _ match {
  case s if s == "NA" => None
  case s => Some(s.toInt)
    }
    
    import java.time._
    val DateOf: String => java.sql.Timestamp = _ match {
      case s => java.sql.Timestamp.valueOf(ZonedDateTime.parse(s).toLocalDateTime)
    }

    Question (
      owner_userid = IntOf(row.getString(0)).getOrElse(-1),
      tag = row.getString(1),
      creationDate = DateOf(row.getString(2)),
      score = row.getString(3).toInt
    )
}

toQuestion: (row: org.apache.spark.sql.Row)Question


In [106]:
import spark.implicits._
val dfOfQuestion: Dataset[Question] = dfQuestions.map(row => toQuestion(row))  

import spark.implicits._
dfOfQuestion: org.apache.spark.sql.Dataset[Question] = [owner_userid: int, tag: string ... 2 more fields]


In [114]:
// Create DataFrame from collection
val segTags = Seq(
    1 -> "so_java",
    1 -> "so_jsp",
    2 -> "so_erlang",
    3 -> "so_scala",
    4 -> "so_akka"
)

import spark.implicits._
val dfMoreTags = segTags.toDF("id","tag")
dfMoreTags.show()

+---+---------+
| id|      tag|
+---+---------+
|  1|  so_java|
|  1|   so_jsp|
|  2|so_erlang|
|  3| so_scala|
|  4|  so_akka|
+---+---------+



segTags: Seq[(Int, String)] = List((1,so_java), (1,so_jsp), (2,so_erlang), (3,so_scala), (4,so_akka))
import spark.implicits._
dfMoreTags: org.apache.spark.sql.DataFrame = [id: int, tag: string]


In [115]:
// DataFrame Union
val dfUnionOfTags = dfTags.union(dfMoreTags)
    .filter("id in (1,3)")
dfUnionOfTags.show()

+---+--------+
| id|     tag|
+---+--------+
|  1|    data|
|  1| so_java|
|  1|  so_jsp|
|  3|so_scala|
+---+--------+



dfUnionOfTags: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, tag: string]


In [116]:
// DataFrame Intersection
val dfIntersectionTags = dfMoreTags.intersect(dfUnionOfTags).show(10)

+---+--------+
| id|     tag|
+---+--------+
|  3|so_scala|
|  1| so_java|
|  1|  so_jsp|
+---+--------+



dfIntersectionTags: Unit = ()


In [117]:
// Append column to DataFrame using withColumn()
import org.apache.spark.sql.functions._
val dfSplitColumn = dfMoreTags
    .withColumn("tmp",split($"tag","_"))
    .select(
    $"id",
    $"tag",
    $"tmp".getItem(0).as("so_prefix"),
    $"tmp".getItem(1).as("so_tag"))
    .drop("tmp")

import org.apache.spark.sql.functions._
dfSplitColumn: org.apache.spark.sql.DataFrame = [id: int, tag: string ... 2 more fields]


In [118]:
dfSplitColumn.show()

+---+---------+---------+------+
| id|      tag|so_prefix|so_tag|
+---+---------+---------+------+
|  1|  so_java|       so|  java|
|  1|   so_jsp|       so|   jsp|
|  2|so_erlang|       so|erlang|
|  3| so_scala|       so| scala|
|  4|  so_akka|       so|  akka|
+---+---------+---------+------+



Spark SQL Introduction

In [3]:
// Create DataFrame from Tuples
val donuts = Seq(("plain donut", 1.50), ("vanilla donut", 2.0), ("glazed donut", 2.50))
val df = sparkSession
 .createDataFrame(donuts)
 .toDF("Donut Name", "Price")

donuts: Seq[(String, Double)] = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double]


In [4]:
df.show()

+-------------+-----+
|   Donut Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+



In [5]:
// Get DataFrame column names
val columnNames: Array[String] = df.columns
columnNames.foreach(name => println(s"$name"))

Donut Name
Price


columnNames: Array[String] = Array(Donut Name, Price)


In [12]:
// DataFrame column names and types
val (columnNames, columnDataTypes) = df.dtypes.unzip
println(s"DataFrame column names = ${columnNames.mkString(",")}")
println(s"DataFrame column data types = ${columnDataTypes.mkString(",")}")

DataFrame column names = Donut Name,Price
DataFrame column data types = StringType,DoubleType


columnNames: Array[String] = Array(Donut Name, Price)
columnDataTypes: Array[String] = Array(StringType, DoubleType)


In [17]:
// Json into DataFrame using explode()
import sparkSession.sqlContext.implicits._
val tagsDF = sparkSession
    .read
    .option("multiline",true)
    .option("inferSchema",true)
    .json("/Users/soda/Downloads/jsonData.json")

import sparkSession.sqlContext.implicits._
tagsDF: org.apache.spark.sql.DataFrame = [stackoverflow: array<struct<tag:struct<author:string,frameworks:array<struct<id:bigint,name:string>>,id:bigint,name:string>>>]


In [18]:
val df = tagsDF.select(explode($"stackoverflow") as "stackoverflow_tags")

df: org.apache.spark.sql.DataFrame = [stackoverflow_tags: struct<tag: struct<author: string, frameworks: array<struct<id:bigint,name:string>> ... 2 more fields>>]


In [20]:
df.select(
    $"stackoverflow_tags.tag.id" as "id",
    $"stackoverflow_tags.tag.author" as "author",
    $"stackoverflow_tags.tag.name" as "tag_name",
    $"stackoverflow_tags.tag.frameworks.id" as "frameworks_id",
    $"stackoverflow_tags.tag.frameworks.name" as "frameworks_name"
  ).show()

+---+--------------+--------+-------------+--------------------+
| id|        author|tag_name|frameworks_id|     frameworks_name|
+---+--------------+--------+-------------+--------------------+
|  1|Martin Odersky|   scala|       [1, 2]|[Play Framework, ...|
|  2| James Gosling|    java|       [1, 2]|[Apache Tomcat, S...|
+---+--------------+--------+-------------+--------------------+



In [14]:
// Concatenate DataFrames using join()
val donuts = Seq(("111","plain donut", 1.50), ("222", "vanilla donut", 2.0), ("333","glazed donut", 2.50))

val dfDonuts = sparkSession
    .createDataFrame(donuts)
    .toDF("Id","Donut Name", "Price")

dfDonuts.show()

+---+-------------+-----+
| Id|   Donut Name|Price|
+---+-------------+-----+
|111|  plain donut|  1.5|
|222|vanilla donut|  2.0|
|333| glazed donut|  2.5|
+---+-------------+-----+



donuts: Seq[(String, String, Double)] = List((111,plain donut,1.5), (222,vanilla donut,2.0), (333,glazed donut,2.5))
dfDonuts: org.apache.spark.sql.DataFrame = [Id: string, Donut Name: string ... 1 more field]


In [15]:
val inventory = Seq(("111", 10), ("222", 20), ("333", 30))
val dfInventory = sparkSession
      .createDataFrame(inventory)
      .toDF("Id", "Inventory")

dfInventory.show()

+---+---------+
| Id|Inventory|
+---+---------+
|111|       10|
|222|       20|
|333|       30|
+---+---------+



inventory: Seq[(String, Int)] = List((111,10), (222,20), (333,30))
dfInventory: org.apache.spark.sql.DataFrame = [Id: string, Inventory: int]


In [16]:
val dfDonutsInventory = dfDonuts.join(dfInventory,Seq("Id"),"inner")
dfDonutsInventory.show()

+---+-------------+-----+---------+
| Id|   Donut Name|Price|Inventory|
+---+-------------+-----+---------+
|111|  plain donut|  1.5|       10|
|222|vanilla donut|  2.0|       20|
|333| glazed donut|  2.5|       30|
+---+-------------+-----+---------+



dfDonutsInventory: org.apache.spark.sql.DataFrame = [Id: string, Donut Name: string ... 2 more fields]


In [23]:
val df = tagsDF
    .select(explode($"stackoverflow") as "stackoverflow_tags")
    .select(
      $"stackoverflow_tags.tag.id" as "id",
      $"stackoverflow_tags.tag.author" as "author",
      $"stackoverflow_tags.tag.name" as "tag_name",
      $"stackoverflow_tags.tag.frameworks.id" as "frameworks_id",
      $"stackoverflow_tags.tag.frameworks.name" as "frameworks_name"
    )
df.show()

+---+--------------+--------+-------------+--------------------+
| id|        author|tag_name|frameworks_id|     frameworks_name|
+---+--------------+--------+-------------+--------------------+
|  1|Martin Odersky|   scala|       [1, 2]|[Play Framework, ...|
|  2| James Gosling|    java|       [1, 2]|[Apache Tomcat, S...|
+---+--------------+--------+-------------+--------------------+



df: org.apache.spark.sql.DataFrame = [id: bigint, author: string ... 3 more fields]


In [26]:
df
    .select("*")
    .where(array_contains($"frameworks_name","Play Framework"))
    .show()


+---+--------------+--------+-------------+--------------------+
| id|        author|tag_name|frameworks_id|     frameworks_name|
+---+--------------+--------+-------------+--------------------+
|  1|Martin Odersky|   scala|       [1, 2]|[Play Framework, ...|
+---+--------------+--------+-------------+--------------------+



In [27]:
// Check DataFrame column exists
val donuts = Seq(("plain donut", 1.50), ("vanilla donut", 2.0), ("glazed donut", 2.50))
val df = sparkSession.createDataFrame(donuts).toDF("Donut Name", "Price")

df.columns.contains("Price")

donuts: Seq[(String, Double)] = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double]
res18: Boolean = true


In [28]:
// Split DataFrame Array column
val targets = Seq(("Plain Donut", Array(1.50, 2.0)), ("Vanilla Donut", Array(2.0, 2.50)), ("Strawberry Donut", Array(2.50, 3.50)))
val df = sparkSession
    .createDataFrame(targets)
    .toDF("Name", "Prices")

df.show()

+----------------+----------+
|            Name|    Prices|
+----------------+----------+
|     Plain Donut|[1.5, 2.0]|
|   Vanilla Donut|[2.0, 2.5]|
|Strawberry Donut|[2.5, 3.5]|
+----------------+----------+



targets: Seq[(String, Array[Double])] = List((Plain Donut,Array(1.5, 2.0)), (Vanilla Donut,Array(2.0, 2.5)), (Strawberry Donut,Array(2.5, 3.5)))
df: org.apache.spark.sql.DataFrame = [Name: string, Prices: array<double>]


In [29]:
df.printSchema

root
 |-- Name: string (nullable = true)
 |-- Prices: array (nullable = true)
 |    |-- element: double (containsNull = false)



In [30]:
df.select(
    $"Name",
    $"Prices"(0).as("Low Price"),
    $"Prices"(1).as("High Price")
    ).show()

+----------------+---------+----------+
|            Name|Low Price|High Price|
+----------------+---------+----------+
|     Plain Donut|      1.5|       2.0|
|   Vanilla Donut|      2.0|       2.5|
|Strawberry Donut|      2.5|       3.5|
+----------------+---------+----------+



In [37]:
// Rename DataFrame column
val donuts = Seq(("plain donut", 1.50), ("vanilla donut", 2.0), ("glazed donut", 2.50))
val df = spark.createDataFrame(donuts).toDF("Donut Name", "Price")
df.show()


+-------------+-----+
|   Donut Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+



donuts: Seq[(String, Double)] = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double]


In [39]:
df.withColumnRenamed("Donut Name", "Name").show()
//注意：将“Donut Name”替换为"Name"

+-------------+-----+
|         Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+



In [40]:
// Create DataFrame constant column
val donuts = Seq(("plain donut", 1.50), ("vanilla donut", 2.0), ("glazed donut", 2.50))
val df = spark.createDataFrame(donuts).toDF("Donut Name", "Price")

import org.apache.spark.sql.functions._
df.withColumn("Tasty",lit(true))
    .withColumn("Correlation",lit(1.0))
    .withColumn("Stock",typedLit(Seq(100,500))).show()

+-------------+-----+-----+-----------+----------+
|   Donut Name|Price|Tasty|Correlation|     Stock|
+-------------+-----+-----+-----------+----------+
|  plain donut|  1.5| true|        1.0|[100, 500]|
|vanilla donut|  2.0| true|        1.0|[100, 500]|
| glazed donut|  2.5| true|        1.0|[100, 500]|
+-------------+-----+-----+-----------+----------+



donuts: Seq[(String, Double)] = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double]
import org.apache.spark.sql.functions._


In [41]:
// DataFrame new column with User Defined Function (UDF)
val donuts = Seq(("plain donut", 1.50), 
                 ("vanilla donut", 2.0), 
                 ("glazed donut", 2.50))
val df = spark.createDataFrame(donuts).toDF("Donut Name", "Price")

import org.apache.spark.sql.functions._
import spark.sqlContext.implicits._

val stockMinMax: (String => Seq[Int]) = 
    (donutName: String) => donutName match{
        case "plain donut" => Seq(100,500)
        case "vanilla donut" => Seq(200,400)
        case "glazed donut" => Seq(200,600)
        case _ => Seq(150,150)
    }

val udfStockMinMax = udf(stockMinMax)

val df2 = df.withColumn("Stock Min Max",udfStockMinMax($"Donut Name"))

donuts: Seq[(String, Double)] = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double]
import org.apache.spark.sql.functions._
import spark.sqlContext.implicits._
stockMinMax: String => Seq[Int] = <function1>
udfStockMinMax: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,ArrayType(IntegerType,false),Some(List(StringType)))
df2: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double ... 1 more field]


In [42]:
df2.show()

+-------------+-----+-------------+
|   Donut Name|Price|Stock Min Max|
+-------------+-----+-------------+
|  plain donut|  1.5|   [100, 500]|
|vanilla donut|  2.0|   [200, 400]|
| glazed donut|  2.5|   [200, 600]|
+-------------+-----+-------------+



In [44]:
// DataFrame First Row
val donuts = Seq(("plain donut", 1.50), ("vanilla donut", 2.0), ("glazed donut", 2.50))
val df = sparkSession
    .createDataFrame(donuts)
    .toDF("Donut Name", "Price")
df.show()

+-------------+-----+
|   Donut Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+



donuts: Seq[(String, Double)] = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double]


In [49]:
println(df.first(),
        df.first().get(0),
        df.first().getAs[Double]("Price"))

([plain donut,1.5],plain donut,1.5)


In [50]:
// Format DataFrame column
val donuts = Seq(("plain donut", 1.50, "2018-04-17"), ("vanilla donut", 2.0, "2018-04-01"), ("glazed donut", 2.50, "2018-04-02"))
val df = spark.createDataFrame(donuts).toDF("Donut Name", "Price", "Purchase Date")

import org.apache.spark.sql.functions._
import spark.sqlContext.implicits._

df
.withColumn("Price Formatted", format_number($"Price", 2))
.withColumn("Name Formatted", format_string("awesome %s", $"Donut Name"))
.withColumn("Name Uppercase", upper($"Donut Name"))
.withColumn("Name Lowercase", lower($"Donut Name"))
.withColumn("Date Formatted", date_format($"Purchase Date", "yyyyMMdd"))
.withColumn("Day", dayofmonth($"Purchase Date"))
.withColumn("Month", month($"Purchase Date"))
.withColumn("Year", year($"Purchase Date"))
.show()

+-------------+-----+-------------+---------------+--------------------+--------------+--------------+--------------+---+-----+----+
|   Donut Name|Price|Purchase Date|Price Formatted|      Name Formatted|Name Uppercase|Name Lowercase|Date Formatted|Day|Month|Year|
+-------------+-----+-------------+---------------+--------------------+--------------+--------------+--------------+---+-----+----+
|  plain donut|  1.5|   2018-04-17|           1.50| awesome plain donut|   PLAIN DONUT|   plain donut|      20180417| 17|    4|2018|
|vanilla donut|  2.0|   2018-04-01|           2.00|awesome vanilla d...| VANILLA DONUT| vanilla donut|      20180401|  1|    4|2018|
| glazed donut|  2.5|   2018-04-02|           2.50|awesome glazed donut|  GLAZED DONUT|  glazed donut|      20180402|  2|    4|2018|
+-------------+-----+-------------+---------------+--------------------+--------------+--------------+--------------+---+-----+----+



donuts: Seq[(String, Double, String)] = List((plain donut,1.5,2018-04-17), (vanilla donut,2.0,2018-04-01), (glazed donut,2.5,2018-04-02))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double ... 1 more field]
import org.apache.spark.sql.functions._
import spark.sqlContext.implicits._


In [51]:
// DataFrame String Functions
val donuts = Seq(("plain donut", 1.50, "2018-04-17"), ("vanilla donut", 2.0, "2018-04-01"), ("glazed donut", 2.50, "2018-04-02"))
val df = spark
	.createDataFrame(donuts)
	.toDF("Donut Name", "Price", "Purchase Date")

import org.apache.spark.sql.functions._
import spark.sqlContext.implicits._

df
.withColumn("Contains plain", instr($"Donut Name", "donut"))
.withColumn("Length", length($"Donut Name"))
.withColumn("Trim", trim($"Donut Name"))
.withColumn("LTrim", ltrim($"Donut Name"))
.withColumn("RTrim", rtrim($"Donut Name"))
.withColumn("Reverse", reverse($"Donut Name"))
.withColumn("Substring", substring($"Donut Name", 0, 5))
.withColumn("IsNull", isnull($"Donut Name"))
.withColumn("Concat", concat_ws(" - ", $"Donut Name", $"Price"))
.withColumn("InitCap", initcap($"Donut Name"))
.show()

+-------------+-----+-------------+--------------+------+-------------+-------------+-------------+-------------+---------+------+-------------------+-------------+
|   Donut Name|Price|Purchase Date|Contains plain|Length|         Trim|        LTrim|        RTrim|      Reverse|Substring|IsNull|             Concat|      InitCap|
+-------------+-----+-------------+--------------+------+-------------+-------------+-------------+-------------+---------+------+-------------------+-------------+
|  plain donut|  1.5|   2018-04-17|             7|    11|  plain donut|  plain donut|  plain donut|  tunod nialp|    plain| false|  plain donut - 1.5|  Plain Donut|
|vanilla donut|  2.0|   2018-04-01|             9|    13|vanilla donut|vanilla donut|vanilla donut|tunod allinav|    vanil| false|vanilla donut - 2.0|Vanilla Donut|
| glazed donut|  2.5|   2018-04-02|             8|    12| glazed donut| glazed donut| glazed donut| tunod dezalg|    glaze| false| glazed donut - 2.5| Glazed Donut|
+---------

donuts: Seq[(String, Double, String)] = List((plain donut,1.5,2018-04-17), (vanilla donut,2.0,2018-04-01), (glazed donut,2.5,2018-04-02))
df: org.apache.spark.sql.DataFrame = [Donut Name: string, Price: double ... 1 more field]
import org.apache.spark.sql.functions._
import spark.sqlContext.implicits._


In [None]:
// DataFrame drop null
dfWithNull.na.drop()

1. http://localhost:8888/notebooks/Downloads/backUp/dataset/GPS/%F0%9F%9A%98RAI%20%E2%9C%A8.ipynb
2. http://localhost:8888/notebooks/Downloads/backUp/dataset/GPS/%F0%9F%A6%88GPSpipeline.ipynb