### Initialize pyspark

In [1]:
import findspark
findspark.init()
import pyspark

### Initialize and create a spark session

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

### Create a dataframe from tags file question_tags_10K.csv

In [4]:
dfTags = (spark
    .read
    .options(heade=True, inferSchema=True)
    .csv("..\\Resources\\question_tags_10K.csv")
    .toDF("id", "tag"))

In [5]:
dfTags.show(5)

+---+---------------+
| id|            tag|
+---+---------------+
| Id|            Tag|
|  1|           data|
|  4|             c#|
|  4|       winforms|
|  4|type-conversion|
+---+---------------+
only showing top 5 rows



### Create a dataframe from questions file questions_10K.csv

In [7]:
dfQuestionsCSV = (spark
                    .read
                    .options(header=True, inferSchema=True, dateFormat='yyyy-MM-dd HH:mm:ss')
                    .csv("..\\Resources\\questions_10K.csv")
                    .toDF("id", "creation_date", "closed_date", "deletion_date", "score", "owner_userid", "answer_count")
                 )

In [8]:
dfQuestionsCSV.show(5)

+---+-------------------+--------------------+--------------------+-----+------------+------------+
| id|      creation_date|         closed_date|       deletion_date|score|owner_userid|answer_count|
+---+-------------------+--------------------+--------------------+-----+------------+------------+
|  1|2008-08-01 02:56:37|                  NA|2011-03-28T00:53:47Z|    1|          NA|           0|
|  4|2008-08-01 03:12:52|                  NA|                  NA|  472|           8|          13|
|  6|2008-08-01 03:38:08|                  NA|                  NA|  210|           9|           5|
|  8|2008-08-01 05:03:19|2013-06-03T04:00:25Z|2015-02-11T08:26:40Z|   42|          NA|           8|
|  9|2008-08-01 05:10:59|                  NA|                  NA| 1452|           1|          58|
+---+-------------------+--------------------+--------------------+-----+------------+------------+
only showing top 5 rows



### cast columns to data types

In [9]:
from pyspark.sql.functions import col

In [10]:
dfQuestions = dfQuestionsCSV.select(
    col("id").cast("integer"),
    col("creation_date").cast("timestamp"),
    col("closed_date").cast("timestamp"),
    col("deletion_date").cast("date"),
    col("score").cast("integer"),
    col("owner_userid").cast("integer"),
    col("answer_count").cast("integer")
  )

In [11]:
dfQuestions.printSchema()

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: timestamp (nullable = true)
 |-- deletion_date: date (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: integer (nullable = true)
 |-- answer_count: integer (nullable = true)



### Imports for functions

In [20]:
from pyspark.sql.functions import avg,max,min,mean,sum,count

### Average

In [13]:
dfQuestions.select(avg("score")).show()

+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



### Maximum

In [14]:
dfQuestions.select(max("score")).show()

+----------+
|max(score)|
+----------+
|      4443|
+----------+



### Minimum

In [15]:
dfQuestions.select(min("score")).show()

+----------+
|min(score)|
+----------+
|       -27|
+----------+



### Mean

In [16]:
dfQuestions.select(mean("score")).show()

+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



### Sum

In [18]:
dfQuestions.select(sum("score")).show()

+----------+
|sum(score)|
+----------+
|    361427|
+----------+



### Count

In [21]:
dfQuestions.select(count("score")).show()

+------------+
|count(score)|
+------------+
|        9999|
+------------+



### Group by with statistics

In [27]:
(dfQuestions
    .filter("id > 400 and id < 450")
    .filter("owner_userid is not null")
    .join(dfTags, "id")
    .groupBy(col("owner_userid"))
    .agg(avg("score"), max("answer_count"))
    .show())

+------------+----------+-----------------+
|owner_userid|avg(score)|max(answer_count)|
+------------+----------+-----------------+
|         268|      26.0|                1|
|         136|      57.6|                9|
|         123|      20.0|                3|
+------------+----------+-----------------+



### DataFrame Statistics using describe() method

In [28]:
dfQuestionsStatistics = dfQuestions.describe()

In [29]:
dfQuestionsStatistics.show()

+-------+-----------------+------------------+-----------------+------------------+
|summary|               id|             score|     owner_userid|      answer_count|
+-------+-----------------+------------------+-----------------+------------------+
|  count|             9999|              9999|             7388|              9922|
|   mean|33929.17081708171| 36.14631463146315|47389.99472116947|6.6232614392259626|
| stddev|19110.09560532429|160.48316753972045|280943.1070344427| 9.069109116851138|
|    min|                1|               -27|                1|                -5|
|    max|            66037|              4443|          3431280|               316|
+-------+-----------------+------------------+-----------------+------------------+



### Correlation

In [30]:
correlation = dfQuestions.stat.corr("score", "answer_count")

In [31]:
print("correlation between column score and answer_count = {}".format(correlation))

correlation between column score and answer_count = 0.3699847903294707


### Covariance

In [32]:
covariance = dfQuestions.stat.cov("score", "answer_count")

In [33]:
print("covariance between column score and answer_count = {}".format(covariance))

covariance between column score and answer_count = 537.513381444165


### Frequent Items

In [35]:
dfFrequentScore = dfQuestions.stat.freqItems(["answer_count"])

In [37]:
dfFrequentScore.show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|answer_count_freqItems                                                                                                                                                                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Crosstab

In [39]:
dfScoreByUserid = (dfQuestions
                    .filter("owner_userid > 0 and owner_userid < 20")
                    .stat
                    .crosstab("score", "owner_userid")
                  )

In [40]:
dfScoreByUserid.show(5)

+------------------+---+---+---+---+---+---+---+---+---+---+
|score_owner_userid|  1| 11| 13| 17|  2|  3|  4|  5|  8|  9|
+------------------+---+---+---+---+---+---+---+---+---+---+
|                56|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|
|               472|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|                14|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|
|                20|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|               179|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|
+------------------+---+---+---+---+---+---+---+---+---+---+
only showing top 5 rows



### Stratified sampling using sampleBy

In [41]:
dfQuestionsByAnswerCount = (dfQuestions
                            .filter("owner_userid > 0")
                            .filter("answer_count in (5, 10, 20)")
                           )

In [42]:
dfQuestionsByAnswerCount.head()

Row(id=6, creation_date=datetime.datetime(2008, 8, 1, 3, 38, 8), closed_date=None, deletion_date=None, score=210, owner_userid=9, answer_count=5)

##### count how many rows match answer_count in (5, 10, 20)

In [44]:
(dfQuestionsByAnswerCount
    .groupBy("answer_count")
    .count()
    .show())

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  811|
|          10|  272|
+------------+-----+



**Create a fraction map where we are only interested:**

- 50% of the rows that have answer_count = 5
- 10% of the rows that have answer_count = 10
- 100% of the rows that have answer_count = 20

*Note also that fractions should be in the range [0, 1]*

In [45]:
fractionKeyMap = {5 : 0.5, 10 : 0.1, 20 : 1.0}

### Stratified sample using the fractionKeyMap

In [49]:
(dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, seed=7)
    .groupBy("answer_count")
    .count()
    .show())

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  400|
|          10|   26|
+------------+-----+



Note that changing the random seed will modify our sampling outcome. As an example, let's change the random seed to 37.

In [50]:
(dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, seed=37)
    .groupBy("answer_count")
    .count()
    .show())

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  388|
|          10|   25|
+------------+-----+



### Approximate Quantile

In [58]:
quantiles = (dfQuestions
                .stat
                .approxQuantile("score", [0.0, 0.5, 1.0], 0.25)
            )

In [60]:
print("Qauntiles segments = {}".format(quantiles))

Qauntiles segments = [-27.0, 2.0, 4443.0]


### We can verify the quantiles statistics above using Spark SQL as follows:

In [61]:
dfQuestions.createOrReplaceTempView("so_questions")

In [62]:
spark.sql("select min(score), percentile_approx(score, 0.25), max(score) from so_questions").show()

+----------+-----------------------------------------------------+----------+
|min(score)|percentile_approx(score, CAST(0.25 AS DOUBLE), 10000)|max(score)|
+----------+-----------------------------------------------------+----------+
|       -27|                                                    2|      4443|
+----------+-----------------------------------------------------+----------+



### Sampling With Replacement

In [67]:
dfTagsSample = dfTags.sample(withReplacement=True, fraction=0.2, seed=37)

In [68]:
print("Number of rows in sample dfTagsSample = {}".format(dfTagsSample.count()))

Number of rows in sample dfTagsSample = 1948


In [69]:
print("Number of rows in dfTags = {}".format(dfTags.count()))

Number of rows in dfTags = 10000


### Closing Spark Session

In [70]:
spark.stop()