### Initialize PySpark (locate the local Spark installation)

In [1]:
# Order matters here: findspark.init() runs before `import pyspark`.
# Presumably it locates the local Spark installation and makes pyspark
# importable (see the findspark docs) — do not reorder these lines.
import findspark
findspark.init()
import pyspark

### Create (or reuse) a Spark session

In [2]:
from pyspark.sql import SparkSession
# Entry point for the DataFrame API used by every cell below.
# getOrCreate() hands back an existing session when one is already running,
# so re-executing this cell does not start a second one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

### Read a JSON file into a DataFrame

In [3]:
# Load the sample tags document into a DataFrame.
# - multiLine=True lets Spark parse a JSON document that spans several lines
#   (the default JSON reader expects one complete record per line).
# - The JSON reader always infers the schema when none is supplied;
#   `inferSchema` is a CSV-reader option and has no effect here, so it is omitted.
# - Forward slashes keep the relative path portable: the backslash form
#   ("..\\Resources\\...") only resolves on Windows, while "/" works on both
#   Windows and POSIX systems.
tagsDF = spark.read.options(multiLine=True).json("../Resources/tags_sample.json")

### Show

In [5]:
# truncate=False prints the full cell contents, so the whole nested
# `stackoverflow` array is visible instead of a shortened preview.
tagsDF.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------+
|stackoverflow                                                                                                                                 |
+----------------------------------------------------------------------------------------------------------------------------------------------+
|[[[Martin Odersky, [[1, Play Framework], [2, Akka Framework]], 1, scala]], [[James Gosling, [[1, Apache Tomcat], [2, Spring Boot]], 2, java]]]|
+----------------------------------------------------------------------------------------------------------------------------------------------+



### Collect

In [6]:
# collect() returns the rows to the notebook as a Python list of Row objects.
# The sample yields a single Row here; avoid collect() on large DataFrames.
tagsDF.collect()

[Row(stackoverflow=[Row(tag=Row(author='Martin Odersky', frameworks=[Row(id=1, name='Play Framework'), Row(id=2, name='Akka Framework')], id=1, name='scala')), Row(tag=Row(author='James Gosling', frameworks=[Row(id=1, name='Apache Tomcat'), Row(id=2, name='Spring Boot')], id=2, name='java'))])]

### Schema

In [8]:
# Print the inferred schema as a tree: `stackoverflow` is an array of structs,
# each wrapping a `tag` struct that itself contains a `frameworks` array.
tagsDF.printSchema()

root
 |-- stackoverflow: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- tag: struct (nullable = true)
 |    |    |    |-- author: string (nullable = true)
 |    |    |    |-- frameworks: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)



### Explode

In [9]:
from pyspark.sql.functions import explode

In [17]:
# explode() produces one output row per element of the `stackoverflow` array;
# the resulting struct column is exposed under the name `stackoverflow_tags`.
exploded_tags = explode("stackoverflow").alias("stackoverflow_tags")
df = tagsDF.select(exploded_tags)

### Show

In [18]:
# After the explode each array element has its own row (two rows for the sample);
# truncate=False keeps the nested struct fully visible.
df.show(truncate=False)

+------------------------------------------------------------------------+
|stackoverflow_tags                                                      |
+------------------------------------------------------------------------+
|[[Martin Odersky, [[1, Play Framework], [2, Akka Framework]], 1, scala]]|
|[[James Gosling, [[1, Apache Tomcat], [2, Spring Boot]], 2, java]]      |
+------------------------------------------------------------------------+



### Schema

In [19]:
# The top-level array is gone: `stackoverflow_tags` is now a struct column
# whose `tag` field still holds the nested `frameworks` array.
df.printSchema()

root
 |-- stackoverflow_tags: struct (nullable = true)
 |    |-- tag: struct (nullable = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- frameworks: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)



In [27]:
from pyspark.sql.functions import col

In [30]:
# Flatten the nested `tag` struct into top-level columns using SQL expression
# strings. Reaching through the `frameworks` array with dot notation yields an
# array holding that field for every framework of the tag.
flatten_exprs = [
    "stackoverflow_tags.tag.id as id",
    "stackoverflow_tags.tag.author as author",
    "stackoverflow_tags.tag.name as tag_name",
    "stackoverflow_tags.tag.frameworks.id as frameworks_id",
    "stackoverflow_tags.tag.frameworks.name as frameworks_name",
]
tags_flat_sql = df.selectExpr(*flatten_exprs)
tags_flat_sql.show(truncate=False)

+---+--------------+--------+-------------+--------------------------------+
|id |author        |tag_name|frameworks_id|frameworks_name                 |
+---+--------------+--------+-------------+--------------------------------+
|1  |Martin Odersky|scala   |[1, 2]       |[Play Framework, Akka Framework]|
|2  |James Gosling |java    |[1, 2]       |[Apache Tomcat, Spring Boot]    |
+---+--------------+--------+-------------+--------------------------------+



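Or, equivalently, build the same projection with `select` and `col` column expressions instead of SQL expression strings: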

In [31]:
# Same flattening as the selectExpr version, expressed with Column objects.
# Each pair is (dotted path into the nested struct, output column name).
path_alias_pairs = [
    ("stackoverflow_tags.tag.id", "id"),
    ("stackoverflow_tags.tag.author", "author"),
    ("stackoverflow_tags.tag.name", "tag_name"),
    ("stackoverflow_tags.tag.frameworks.id", "frameworks_id"),
    ("stackoverflow_tags.tag.frameworks.name", "frameworks_name"),
]
flat_columns = [col(path).alias(alias) for path, alias in path_alias_pairs]
df.select(*flat_columns).show(truncate=False)

+---+--------------+--------+-------------+--------------------------------+
|id |author        |tag_name|frameworks_id|frameworks_name                 |
+---+--------------+--------+-------------+--------------------------------+
|1  |Martin Odersky|scala   |[1, 2]       |[Play Framework, Akka Framework]|
|2  |James Gosling |java    |[1, 2]       |[Apache Tomcat, Spring Boot]    |
+---+--------------+--------+-------------+--------------------------------+



### Closing Spark Session

In [32]:
# Shut down the Spark session created at the top of the notebook.
# Run this last: cells that use `spark` need a session to be created again afterwards.
spark.stop()