### Initialize pyspark

In [1]:
import findspark
findspark.init()
import pyspark

### Initialize and create a spark session

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

### Reading Data

In [3]:
dfTags = (spark
            .read
            .options(header=True, inferSchema=True)
            .csv("..\\Resources\\question_tags_10K.csv")
            .toDF("id", "tag"))

In [4]:
dfTags.show(5)

+---+---------------+
| id|            tag|
+---+---------------+
|  1|           data|
|  4|             c#|
|  4|       winforms|
|  4|type-conversion|
|  4|        decimal|
+---+---------------+
only showing top 5 rows



In [5]:
dfQuestionsCSV = (spark
                    .read
                    .options(header=False, inferSchema=True, dateFormat="yyyy-MM-dd HH:mm:ss")
                    .csv("..\\resources\\questions_10K.csv")
                    .toDF("id", "creation_date", "closed_date", "deletion_date", "score", "owner_userid", "answer_count"))

In [6]:
dfQuestionsCSV.show(5)

+---+--------------------+--------------------+--------------------+-----+------------+------------+
| id|       creation_date|         closed_date|       deletion_date|score|owner_userid|answer_count|
+---+--------------------+--------------------+--------------------+-----+------------+------------+
| Id|        CreationDate|          ClosedDate|        DeletionDate|Score| OwnerUserId| AnswerCount|
|  1|2008-07-31T21:26:37Z|                  NA|2011-03-28T00:53:47Z|    1|          NA|           0|
|  4|2008-07-31T21:42:52Z|                  NA|                  NA|  472|           8|          13|
|  6|2008-07-31T22:08:08Z|                  NA|                  NA|  210|           9|           5|
|  8|2008-07-31T23:33:19Z|2013-06-03T04:00:25Z|2015-02-11T08:26:40Z|   42|          NA|           8|
+---+--------------------+--------------------+--------------------+-----+------------+------------+
only showing top 5 rows



In [8]:
dfQuestions = (dfQuestionsCSV
                .filter("score > 400 and score < 410")
                .join(dfTags, "id")
                .select("owner_userid", "tag", "creation_date", "score"))

In [9]:
dfQuestions.show(5)

+------------+---------+--------------------+-----+
|owner_userid|      tag|       creation_date|score|
+------------+---------+--------------------+-----+
|         131|   xdebug|2008-08-03T23:18:21Z|  405|
|         131| phpstorm|2008-08-03T23:18:21Z|  405|
|         131|debugging|2008-08-03T23:18:21Z|  405|
|         131|  eclipse|2008-08-03T23:18:21Z|  405|
|         131|      php|2008-08-03T23:18:21Z|  405|
+------------+---------+--------------------+-----+
only showing top 5 rows



In [23]:
dfTagsOfTag = dfTags.select("id","tag")

In [28]:
for tag in dfTagsOfTag.take(5):
    print("id = {}, tag={}".format(tag['id'],tag['tag']))

id = 1, tag=data
id = 4, tag=c#
id = 4, tag=winforms
id = 4, tag=type-conversion
id = 4, tag=decimal


### Create DataFrame from collection

In [30]:
seqTags = [(1,"so_java"), (1,"so_jsp"), (2,"so_golang"), (3,"so_scala"), (3,"so_akka")]

In [32]:
dfMoreTags = spark.createDataFrame(seqTags).toDF("id", "tag")

In [33]:
dfMoreTags.show()

+---+---------+
| id|      tag|
+---+---------+
|  1|  so_java|
|  1|   so_jsp|
|  2|so_golang|
|  3| so_scala|
|  3|  so_akka|
+---+---------+



### DataFrame Union

In [34]:
dfTags.show(2)

+---+----+
| id| tag|
+---+----+
|  1|data|
|  4|  c#|
+---+----+
only showing top 2 rows



In [35]:
dfMoreTags.show(2)

+---+-------+
| id|    tag|
+---+-------+
|  1|so_java|
|  1| so_jsp|
+---+-------+
only showing top 2 rows



In [36]:
dfUnionOfTags = dfTags.union(dfMoreTags).filter("id in (1,3)")

### DataFrame Intersection

In [37]:
dfIntersectionTags = dfMoreTags.intersect(dfUnionOfTags)

In [38]:
dfIntersectionTags.show(5)

+---+--------+
| id|     tag|
+---+--------+
|  1|  so_jsp|
|  3|so_scala|
|  3| so_akka|
|  1| so_java|
+---+--------+



### Append column to DataFrame using withColumn()

In [39]:
from pyspark.sql.functions import split

In [53]:
dfSplitColumn = (dfMoreTags.withColumn("tmp", split("tag", "_"))
                              .select("id","tag",col("tmp")[0].alias("so_prefix"),col("tmp")[1].alias("so_tag"))
                              .drop("tmp"))

In [54]:
dfSplitColumn.show(5)

+---+---------+---------+------+
| id|      tag|so_prefix|so_tag|
+---+---------+---------+------+
|  1|  so_java|       so|  java|
|  1|   so_jsp|       so|   jsp|
|  2|so_golang|       so|golang|
|  3| so_scala|       so| scala|
|  3|  so_akka|       so|  akka|
+---+---------+---------+------+



### Closing Spark Session

In [55]:
spark.stop()