### Initialize PySpark

In [1]:
import findspark
# Keep this call ahead of `import pyspark`: presumably findspark.init() locates the
# local Spark installation and makes the pyspark package importable -- confirm
# against the findspark documentation before reordering these lines.
findspark.init()
import pyspark

### Create a Spark session

In [2]:
from pyspark.sql import SparkSession
# Obtain the session used by every cell below; getOrCreate() is called on the
# default builder, so no application name or extra configuration is set here.
spark = SparkSession.builder.getOrCreate()

### Data

In [9]:
# Load the sample tags document into a DataFrame.
# multiLine=True is passed so the reader accepts a JSON document that spans
# several lines rather than one record per line.
# The relative path uses forward slashes so the notebook runs on any OS: on
# POSIX systems a backslash is an ordinary filename character, so the previous
# "..\\Resources\\tags_sample.json" literal could not resolve to the file there.
# NOTE(review): inferSchema is kept unchanged; it looks like a CSV-reader option
# that the JSON reader does not need -- confirm before removing it.
tagsDF = (spark.read
              .options(multiLine=True, inferSchema=True)
              .json("../Resources/tags_sample.json"))

### Show

In [11]:
# Print the loaded rows; truncate=False keeps the full nested value visible
# instead of shortening long cells.
tagsDF.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------+
|stackoverflow                                                                                                                                 |
+----------------------------------------------------------------------------------------------------------------------------------------------+
|[[[Martin Odersky, [[1, Play Framework], [2, Akka Framework]], 1, scala]], [[James Gosling, [[1, Apache Tomcat], [2, Spring Boot]], 2, java]]]|
+----------------------------------------------------------------------------------------------------------------------------------------------+



### Explode

In [31]:
from pyspark.sql.functions import explode,col

In [34]:
# Print the schema tree to see how the nested structs and arrays are
# laid out before selecting fields from them.
tagsDF.printSchema()

root
 |-- stackoverflow: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- tag: struct (nullable = true)
 |    |    |    |-- author: string (nullable = true)
 |    |    |    |-- frameworks: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)



In [45]:
# explode() emits one output row per element of the `stackoverflow` array;
# the exploded struct is exposed under the name `stackoverflow_tags`.
(
    tagsDF
    .select(explode(col("stackoverflow")).alias("stackoverflow_tags"))
    .show(truncate=False)
)

+------------------------------------------------------------------------+
|stackoverflow_tags                                                      |
+------------------------------------------------------------------------+
|[[Martin Odersky, [[1, Play Framework], [2, Akka Framework]], 1, scala]]|
|[[James Gosling, [[1, Apache Tomcat], [2, Spring Boot]], 2, java]]      |
+------------------------------------------------------------------------+



In [50]:
# Flatten the nested document: explode the top-level array into one row per
# element, then project fields of the inner `tag` struct into flat columns.
# Selecting a field through the `frameworks` array of structs yields an array
# of that field's values, so frameworks_id / frameworks_name remain arrays.
df = (
    tagsDF
    .select(explode("stackoverflow").alias("stackoverflow_tags"))
    .select(*(
        col(source_path).alias(output_name)
        for source_path, output_name in (
            ("stackoverflow_tags.tag.id", "id"),
            ("stackoverflow_tags.tag.author", "author"),
            ("stackoverflow_tags.tag.name", "tag_name"),
            ("stackoverflow_tags.tag.frameworks.id", "frameworks_id"),
            ("stackoverflow_tags.tag.frameworks.name", "frameworks_name"),
        )
    ))
)

In [51]:
# Display the flattened result without shortening the array columns.
df.show(truncate=False)

+---+--------------+--------+-------------+--------------------------------+
|id |author        |tag_name|frameworks_id|frameworks_name                 |
+---+--------------+--------+-------------+--------------------------------+
|1  |Martin Odersky|scala   |[1, 2]       |[Play Framework, Akka Framework]|
|2  |James Gosling |java    |[1, 2]       |[Apache Tomcat, Spring Boot]    |
+---+--------------+--------+-------------+--------------------------------+



### array_contains

In [52]:
from pyspark.sql.functions import array_contains

In [53]:
# Keep every column, but only the rows whose frameworks_name array holds the
# exact value "Play Framework".
(
    df.select("*")
    .filter(array_contains(col("frameworks_name"), "Play Framework"))
    .show(truncate=False)
)

+---+--------------+--------+-------------+--------------------------------+
|id |author        |tag_name|frameworks_id|frameworks_name                 |
+---+--------------+--------+-------------+--------------------------------+
|1  |Martin Odersky|scala   |[1, 2]       |[Play Framework, Akka Framework]|
+---+--------------+--------+-------------+--------------------------------+



### Closing Spark Session

In [54]:
# Stop the session obtained at the top of the notebook now that the examples
# are finished.
spark.stop()