### Imports

In [1]:
import org.apache.spark.sql.SparkSession

Intitializing Scala interpreter ...

Spark Web UI available at http://Varun-CK:4040
SparkContext available as 'sc' (version = 2.3.0, master = local[*], app id = local-1579453089921)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession


### Creating Spark Session

In [2]:
val spark = SparkSession.builder().getOrCreate()

2020-01-19 22:28:24 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@4b59fc05


### Create a dataframe from tags file question_tags_10K.csv

In [4]:
val dfTags = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("..\\Resources\\question_tags_10K.csv")
    .toDF("id", "tag")

dfTags: org.apache.spark.sql.DataFrame = [id: int, tag: string]


In [5]:
dfTags.show(5)

+---+---------------+
| id|            tag|
+---+---------------+
|  1|           data|
|  4|             c#|
|  4|       winforms|
|  4|type-conversion|
|  4|        decimal|
+---+---------------+
only showing top 5 rows



### Create a dataframe from questions file questions_10K.csv

In [6]:
val dfQuestionsCSV = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("dateFormat","yyyy-MM-dd HH:mm:ss")
    .csv("..\\Resources\\questions_10K.csv")
    .toDF("id", "creation_date", "closed_date", "deletion_date", "score", "owner_userid", "answer_count")

dfQuestionsCSV: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [7]:
dfQuestionsCSV.show(5)

+---+-------------------+--------------------+--------------------+-----+------------+------------+
| id|      creation_date|         closed_date|       deletion_date|score|owner_userid|answer_count|
+---+-------------------+--------------------+--------------------+-----+------------+------------+
|  1|2008-08-01 02:56:37|                  NA|2011-03-28T00:53:47Z|    1|          NA|           0|
|  4|2008-08-01 03:12:52|                  NA|                  NA|  472|           8|          13|
|  6|2008-08-01 03:38:08|                  NA|                  NA|  210|           9|           5|
|  8|2008-08-01 05:03:19|2013-06-03T04:00:25Z|2015-02-11T08:26:40Z|   42|          NA|           8|
|  9|2008-08-01 05:10:59|                  NA|                  NA| 1452|           1|          58|
+---+-------------------+--------------------+--------------------+-----+------------+------------+
only showing top 5 rows



### cast columns to data types

In [8]:
val dfQuestions = dfQuestionsCSV.select(
    dfQuestionsCSV.col("id").cast("integer"),
    dfQuestionsCSV.col("creation_date").cast("timestamp"),
    dfQuestionsCSV.col("closed_date").cast("timestamp"),
    dfQuestionsCSV.col("deletion_date").cast("date"),
    dfQuestionsCSV.col("score").cast("integer"),
    dfQuestionsCSV.col("owner_userid").cast("integer"),
    dfQuestionsCSV.col("answer_count").cast("integer")
  )

dfQuestions: org.apache.spark.sql.DataFrame = [id: int, creation_date: timestamp ... 5 more fields]


In [9]:
dfQuestions.printSchema

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: timestamp (nullable = true)
 |-- deletion_date: date (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: integer (nullable = true)
 |-- answer_count: integer (nullable = true)



### Imports for functions

In [10]:
import org.apache.spark.sql.functions.{avg,max,min,mean,sum}

import org.apache.spark.sql.functions.{avg, max, min, mean, sum}


### Average

In [11]:
dfQuestions.select(avg("score")).show()

+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



### Maximum

In [12]:
dfQuestions.select(max("score")).show()

+----------+
|max(score)|
+----------+
|      4443|
+----------+



### Minimum

In [13]:
dfQuestions.select(min("score")).show()

+----------+
|min(score)|
+----------+
|       -27|
+----------+



### Mean

In [14]:
dfQuestions.select(mean("score")).show()

+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



### Sum

In [15]:
dfQuestions.select(sum("score")).show()

+----------+
|sum(score)|
+----------+
|    361427|
+----------+



### Count

In [16]:
dfQuestions.select(count("score")).show()

+------------+
|count(score)|
+------------+
|        9999|
+------------+



### Group by with statistics

In [17]:
dfQuestions
    .filter("id > 400 and id < 450")
    .filter("owner_userid is not null")
    .join(dfTags, dfQuestions.col("id").equalTo(dfTags("id")))
    .groupBy(dfQuestions.col("owner_userid"))
    .agg(avg("score"), max("answer_count"))
    .show()

+------------+----------+-----------------+
|owner_userid|avg(score)|max(answer_count)|
+------------+----------+-----------------+
|         268|      26.0|                1|
|         136|      57.6|                9|
|         123|      20.0|                3|
+------------+----------+-----------------+



### DataFrame Statistics using describe() method

In [18]:
val dfQuestionsStatistics = dfQuestions.describe()

dfQuestionsStatistics: org.apache.spark.sql.DataFrame = [summary: string, id: string ... 3 more fields]


In [19]:
dfQuestionsStatistics.show()

+-------+-----------------+------------------+-----------------+------------------+
|summary|               id|             score|     owner_userid|      answer_count|
+-------+-----------------+------------------+-----------------+------------------+
|  count|             9999|              9999|             7388|              9922|
|   mean|33929.17081708171| 36.14631463146315|47389.99472116947|6.6232614392259626|
| stddev|19110.09560532429|160.48316753972045|280943.1070344427| 9.069109116851138|
|    min|                1|               -27|                1|                -5|
|    max|            66037|              4443|          3431280|               316|
+-------+-----------------+------------------+-----------------+------------------+



### Correlation

In [20]:
val correlation = dfQuestions.stat.corr("score", "answer_count")

correlation: Double = 0.3699847903294707


In [21]:
println(s"correlation between column score and answer_count = $correlation")

correlation between column score and answer_count = 0.3699847903294707


### Covariance

In [22]:
val covariance = dfQuestions.stat.cov("score", "answer_count")

covariance: Double = 537.513381444165


In [23]:
println(s"covariance between column score and answer_count = $covariance")

covariance between column score and answer_count = 537.513381444165


### Frequent Items

In [24]:
val dfFrequentScore = dfQuestions.stat.freqItems(Seq("answer_count"))

dfFrequentScore: org.apache.spark.sql.DataFrame = [answer_count_freqItems: array<int>]


In [26]:
dfFrequentScore.show(false)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|answer_count_freqItems                                                                                                                                                                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Crosstab

In [27]:
val dfScoreByUserid = dfQuestions
    .filter("owner_userid > 0 and owner_userid < 20")
    .stat
    .crosstab("score", "owner_userid")

dfScoreByUserid: org.apache.spark.sql.DataFrame = [score_owner_userid: string, 1: bigint ... 9 more fields]


In [28]:
dfScoreByUserid.show(5)

+------------------+---+---+---+---+---+---+---+---+---+---+
|score_owner_userid|  1| 11| 13| 17|  2|  3|  4|  5|  8|  9|
+------------------+---+---+---+---+---+---+---+---+---+---+
|                56|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|
|               472|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|                14|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|
|                20|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|               179|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|
+------------------+---+---+---+---+---+---+---+---+---+---+
only showing top 5 rows



### Stratified sampling using sampleBy

##### find all rows where answer_count in (5, 10, 20)

In [29]:
val dfQuestionsByAnswerCount = dfQuestions
    .filter("owner_userid > 0")
    .filter("answer_count in (5, 10, 20)")

dfQuestionsByAnswerCount: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, creation_date: timestamp ... 5 more fields]


In [31]:
dfQuestionsByAnswerCount.head()

res17: org.apache.spark.sql.Row = [6,2008-08-01 03:38:08.0,null,null,210,9,5]


##### count how many rows match answer_count in (5, 10, 20)

In [32]:
dfQuestionsByAnswerCount
    .groupBy("answer_count")
    .count()
    .show()

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  811|
|          10|  272|
+------------+-----+



**Create a fraction map where we are only interested:**
- 50% of the rows that have answer_count = 5
- 10% of the rows that have answer_count = 10
- 100% of the rows that have answer_count = 20

*Note also that fractions should be in the range [0, 1]*

In [33]:
val fractionKeyMap = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)

fractionKeyMap: scala.collection.immutable.Map[Int,Double] = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)


#### Stratified sample using the fractionKeyMap.

In [34]:
dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, seed=7L)
    .groupBy("answer_count")
    .count()
    .show()

2020-01-19 22:56:19 WARN  BaseSessionStateBuilder$$anon$2:66 - Max iterations (100) reached for batch Operator Optimization before Inferring Filters
2020-01-19 22:56:20 WARN  BaseSessionStateBuilder$$anon$2:66 - Max iterations (100) reached for batch Operator Optimization after Inferring Filters
+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  400|
|          10|   26|
+------------+-----+



Note that changing the random seed will modify our sampling outcome. As an example, let's change the random seed to 37.

In [35]:
dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, seed=37L)
    .groupBy("answer_count")
    .count()
    .show()

2020-01-19 22:58:32 WARN  BaseSessionStateBuilder$$anon$2:66 - Max iterations (100) reached for batch Operator Optimization before Inferring Filters
2020-01-19 22:58:32 WARN  BaseSessionStateBuilder$$anon$2:66 - Max iterations (100) reached for batch Operator Optimization after Inferring Filters
+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  388|
|          10|   25|
+------------+-----+



### Approximate Quantile

In [36]:
val quantiles = dfQuestions
    .stat
    .approxQuantile("score", Array(0, 0.5, 1), 0.25)

quantiles: Array[Double] = Array(-27.0, 2.0, 4443.0)


In [37]:
println(s"Qauntiles segments = ${quantiles.toSeq}")

Qauntiles segments = WrappedArray(-27.0, 2.0, 4443.0)


### We can verify the quantiles statistics above using Spark SQL as follows:

In [38]:
dfQuestions.createOrReplaceTempView("so_questions")

In [39]:
spark.sql("select min(score), percentile_approx(score, 0.25), max(score) from so_questions").show()

+----------+-----------------------------------------------------+----------+
|min(score)|percentile_approx(score, CAST(0.25 AS DOUBLE), 10000)|max(score)|
+----------+-----------------------------------------------------+----------+
|       -27|                                                    2|      4443|
+----------+-----------------------------------------------------+----------+



### Bloom Filter

In [41]:
val tagsBloomFilter = dfTags.stat.bloomFilter(colName="tag", expectedNumItems=1000L, fpp=0.1)

tagsBloomFilter: org.apache.spark.util.sketch.BloomFilter = org.apache.spark.util.sketch.BloomFilterImpl@809c4023


In [42]:
println(s"bloom filter contains java tag = ${tagsBloomFilter.mightContain("java")}")

bloom filter contains java tag = true


In [43]:
println(s"bloom filter contains some unknown tag = ${tagsBloomFilter.mightContain("unknown tag")}")

bloom filter contains some unknown tag = false


### Count Min Sketch

In [44]:
val cmsTag = dfTags.stat.countMinSketch(colName="tag", eps=0.1, confidence=0.9, seed=37)

cmsTag: org.apache.spark.util.sketch.CountMinSketch = org.apache.spark.util.sketch.CountMinSketchImpl@431a88ed


In [45]:
val estimatedFrequency = cmsTag.estimateCount("java")

estimatedFrequency: Long = 513


In [46]:
println(s"Estimated frequency for tag java = $estimatedFrequency")

Estimated frequency for tag java = 513


### Sampling With Replacement

In [47]:
val dfTagsSample = dfTags.sample(withReplacement=true, fraction=0.2, seed=37L)

dfTagsSample: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: int, tag: string]


In [48]:
println(s"Number of rows in sample dfTagsSample = ${dfTagsSample.count()}")

Number of rows in sample dfTagsSample = 1948


In [49]:
println(s"Number of rows in dfTags = ${dfTags.count()}")

Number of rows in dfTags = 9999


### Closing Spark Session

In [50]:
spark.stop()