# Multidimensional data frames: Using PySpark with JSON data
This chapter covers
- Drawing parallels between JSON documents and Python data structures
- Ingesting JSON data within a data frame
- Representing hierarchical data in a data frame through complex column types
- Reducing duplication and reliance on auxiliary tables with a document/hierarchical data model
- Creating and unpacking data from complex data types

## Start a Spark session

In [6]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

# Change the account name to your email account.
account = 'sli'

# Root path used to access the data in DataAnalysisWithPythonAndPySpark.
root_path = '/net/clusterhn/home/' + account + '/isa460/Data/'

# If a Spark session is already active (for example from an earlier run of
# this cell), stop it before building a new one. On the very first run the
# name `spark` does not exist yet and looking it up raises NameError; only
# that case is ignored, so any other failure stays visible.
try:
    if spark:
        spark.stop()
except NameError:
    pass

spark = (
    SparkSession.builder.appName("Multidimensional Data Frame")
    .config("spark.port.maxRetries", "100")
    .getOrCreate()
)

# Configure the log level (the default is WARN).
spark.sparkContext.setLogLevel('ERROR')

![json data as a limited python dictionary](https://raw.githubusercontent.com/Suhong88/ISA460_Fall2023/main/images/Figure%206.1.png)

In [7]:
# Parse a JSON document into a Python dictionary.
import json

sample_json = """{
  "id": 143,
  "name": "Silicon Valley",
  "type": "Scripted",
  "language": "English",
  "genres": [
    "Comedy"
  ],
  "network": {
    "id": 8,
    "name": "HBO",
    "country": {
      "name": "United States",
      "code": "US",
      "timezone": "America/New_York"
    }
  }
}"""

document = json.loads(sample_json)

# Print the parsed value and then its type, one per line.
print(document, type(document), sep="\n")

{'id': 143, 'name': 'Silicon Valley', 'type': 'Scripted', 'language': 'English', 'genres': ['Comedy'], 'network': {'id': 8, 'name': 'HBO', 'country': {'name': 'United States', 'code': 'US', 'timezone': 'America/New_York'}}}
<class 'dict'>


## Reading JSON data in PySpark

In [12]:
# Read the Silicon Valley JSON document into a data frame; no schema is
# passed, so the column types are inferred by the reader.
shows=spark.read.json(root_path+'shows/shows-silicon-valley.json')

In [13]:
# Number of records in the data frame.
shows.count()

1

In [14]:
# Print the (nested) schema of the data frame as a tree.
shows.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [17]:
# Read multiple JSON files in one call: the * in the path is a wildcard, so
# every file matching shows-*.json is loaded into the same data frame.
# multiLine=True is passed to the reader (presumably because each document
# spans several lines -- verify against the data files).

three_shows=spark.read.json(root_path+'shows/shows-*.json', multiLine=True)

In [18]:
# Number of records read from the matching files.
three_shows.count()

3

In [19]:
# Sanity check: the wildcard read is expected to yield exactly three records.
assert three_shows.count()==3

In [21]:
# List the top-level column names of the data frame.
print(shows.columns)

['_embedded', '_links', 'externals', 'genres', 'id', 'image', 'language', 'name', 'network', 'officialSite', 'premiered', 'rating', 'runtime', 'schedule', 'status', 'summary', 'type', 'updated', 'url', 'webChannel', 'weight']


### Work with array

In [26]:
# The genres column holds an array of strings for each show.
three_shows.select('name', 'genres').show(n=5, truncate=False)

+----------------+------------------------+
|name            |genres                  |
+----------------+------------------------+
|The Golden Girls|[Drama, Comedy]         |
|Breaking Bad    |[Drama, Crime, Thriller]|
|Silicon Valley  |[Comedy]                |
+----------------+------------------------+



In [30]:
# Two equivalent ways to read one element of an array column:
# bracket indexing and Column.getItem(). Both produce a column named genres[0].
three_shows.select(
    F.col('genres')[0],
    F.col('genres').getItem(0)
).show(n=5, truncate=False)

+---------+---------+
|genres[0]|genres[0]|
+---------+---------+
|Drama    |Drama    |
|Drama    |Drama    |
|Comedy   |Comedy   |
+---------+---------+



In [38]:
# Copy the first three array positions into their own columns, appended after
# the existing ones. A position past the end of a show's array yields null.
three_shows1 = three_shows.select(
    '*',
    *[
        F.col('genres')[position].alias(label)
        for position, label in enumerate(('genre1', 'genre2', 'genre3'))
    ]
)

three_shows1.select('genre1', 'genre2', 'genre3').show()

+------+------+--------+
|genre1|genre2|  genre3|
+------+------+--------+
| Drama|Comedy|    null|
| Drama| Crime|Thriller|
|Comedy|  null|    null|
+------+------+--------+



In [63]:
# Array functions: size(), array(), array_distinct(), array_intersect(),
# array_repeat(), array_position()

three_shows2 = three_shows1.select('genre1', 'genre2', 'genre3')

# Each derived column is added in its own step so that it only refers to
# columns that already exist in the frame. Referring to an alias defined in
# the same select() depends on lateral column alias resolution, which Spark
# releases before 3.4 do not support.
(
    three_shows2
    # pack the three genre columns into a single array column
    .select(F.array('genre1', 'genre2', 'genre3').alias('combined_genres'))
    # an array holding three copies of that array
    .withColumn('repeated_genres', F.array_repeat('combined_genres', 3))
    # drop the duplicate copies again
    .withColumn('genres_norepeat', F.array_distinct('repeated_genres'))
    # number of elements in the first inner array
    .withColumn('array_size', F.size(F.col('genres_norepeat')[0]))
    # 1-based position of 'Comedy' in the array; 0 when it is absent
    .withColumn('comedy_position', F.array_position('combined_genres', 'Comedy'))
    .show(5)
)


+--------------------+--------------------+--------------------+----------+---------------+
|     combined_genres|     repeated_genres|     genres_norepeat|array_size|comedy_position|
+--------------------+--------------------+--------------------+----------+---------------+
|[Drama, Comedy, n...|[[Drama, Comedy, ...|[[Drama, Comedy, ...|         3|              2|
|[Drama, Crime, Th...|[[Drama, Crime, T...|[[Drama, Crime, T...|         3|              0|
|[Comedy, null, null]|[[Comedy, null, n...|[[Comedy, null, n...|         3|              1|
+--------------------+--------------------+--------------------+----------+---------------+



### The Map Type: Keys and Values within a column

In [64]:
# Top-level column names of the three-show data frame.
three_shows.columns

['_embedded',
 '_links',
 'externals',
 'genres',
 'id',
 'image',
 'language',
 'name',
 'network',
 'officialSite',
 'premiered',
 'rating',
 'runtime',
 'schedule',
 'status',
 'summary',
 'type',
 'updated',
 'url',
 'webChannel',
 'weight']

In [82]:
columns = ['name', 'language', 'type']

# Step 1: select one literal column per key (each literal column ends up named
# after its value: name, language, type) next to an array that holds the
# actual values of those three show columns.
shows_map1 = three_shows.select(
    *[F.lit(column) for column in columns],
    F.array(*columns).alias('values')
)

# shows_map1.show(3, False)

# Step 2: in shows_map1 the names in `columns` refer to the literal columns,
# so the same array() call now builds the array of keys.
shows_map2 = shows_map1.select(
    F.array(*columns).alias('keys'),
    'values'
)

shows_map2.show(n=3, truncate=False)

+----------------------+-------------------------------------+
|keys                  |values                               |
+----------------------+-------------------------------------+
|[name, language, type]|[The Golden Girls, English, Scripted]|
|[name, language, type]|[Breaking Bad, English, Scripted]    |
|[name, language, type]|[Silicon Valley, English, Scripted]  |
+----------------------+-------------------------------------+



In [88]:
# Zip the keys array and the values array into a single map column.
shows_map3 = shows_map2.select(
    F.map_from_arrays(F.col("keys"), F.col("values")).alias("mapped")
)

shows_map3.show(n=3, truncate=False)

+-----------------------------------------------------------------+
|mapped                                                           |
+-----------------------------------------------------------------+
|{name -> The Golden Girls, language -> English, type -> Scripted}|
|{name -> Breaking Bad, language -> English, type -> Scripted}    |
|{name -> Silicon Valley, language -> English, type -> Scripted}  |
+-----------------------------------------------------------------+



In [89]:
# Inspect the key and value types of the map column.
shows_map3.printSchema()

root
 |-- mapped: map (nullable = false)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [98]:
# Three ways to read the value stored under a map key.
# Other functions associated with maps include map_values() and create_map().
shows_map3.select(
    F.col('mapped.name'),        # dot notation on the column path
    F.col('mapped')["name"],     # bracket lookup on a Column
    shows_map3.mapped['name']    # bracket lookup via the data frame attribute
).show()

+----------------+----------------+----------------+
|            name|    mapped[name]|    mapped[name]|
+----------------+----------------+----------------+
|The Golden Girls|The Golden Girls|The Golden Girls|
|    Breaking Bad|    Breaking Bad|    Breaking Bad|
|  Silicon Valley|  Silicon Valley|  Silicon Valley|
+----------------+----------------+----------------+



### The Struct: Nesting columns within columns

The struct is very different from the array and the map in that the number of fields and their names are known ahead of time. In our case, the schedule struct column is fixed: we know that each record of our data frame will contain that schedule struct (or a null value, if we want to be pedantic), and within that struct there will be an array of strings, days, and a string, time. 

In [99]:
# The schedule column is a struct with two fields: days and time.
three_shows.select(F.col("schedule")).printSchema()

root
 |-- schedule: struct (nullable = true)
 |    |-- days: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- time: string (nullable = true)



In [100]:
# Look at the _embedded column: a struct wrapping the array of episodes.
three_shows.select("_embedded").printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: string (nullable = true)
 |    |    |    |-- airstamp: string (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = true

In [106]:
# Promote the nested episodes array to a top-level column, then drop the
# _embedded struct it was taken from.
shows_clean = (
    three_shows
    .withColumn("episodes", F.col("_embedded.episodes"))
    .drop("_embedded")
)

shows_clean.printSchema()


root
 |-- _links: struct (nullable = true)
 |    |-- previousepisode: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |    |-- self: struct (nullable = true)
 |    |    |-- href: string (nullable = true)
 |-- externals: struct (nullable = true)
 |    |-- imdb: string (nullable = true)
 |    |-- thetvdb: long (nullable = true)
 |    |-- tvrage: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: long (nullable = true)
 |-- image: struct (nullable = true)
 |    |-- medium: string (nullable = true)
 |    |-- original: string (nullable = true)
 |-- language: string (nullable = true)
 |-- name: string (nullable = true)
 |-- network: struct (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- code: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- timezone: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nul

In [117]:
# Extract a field from a struct. episodes is an array of structs, so selecting
# its name field gives an array of names for each show.
shows_clean.select('episodes.name').show(n=3)


+--------------------+
|                name|
+--------------------+
|[The Engagement, ...|
|[Pilot, Cat's in ...|
|[Minimum Viable P...|
+--------------------+



In [119]:
# Explode an array: every element of the array becomes its own row.
shows_clean.select(
    F.explode('episodes.name')
).show(n=10, truncate=False)


+----------------------------------+
|col                               |
+----------------------------------+
|The Engagement                    |
|Guess Who's Coming to the Wedding?|
|Rose the Prude                    |
|Transplant                        |
|The Triangle                      |
|On Golden Girls                   |
|The Competition                   |
|Break-In                          |
|Blanche and the Younger Man       |
|Heart Attack                      |
+----------------------------------+
only showing top 10 rows



### Building and Using the data frame schema

In [120]:
import pyspark.sql.types as T
 
# Schema of an episode's _links field: {"self": {"href": <string>}}.
episode_links_schema = T.StructType().add(
    "self", T.StructType().add("href", T.StringType())
)

# Schema of an episode's image field: two string fields.
episode_image_schema = (
    T.StructType()
    .add("medium", T.StringType())
    .add("original", T.StringType())
)

# Schema of one episode. airdate and airstamp are declared as date and
# timestamp, and id is declared as a string.
episode_schema = (
    T.StructType()
    .add("_links", episode_links_schema)
    .add("airdate", T.DateType())
    .add("airstamp", T.TimestampType())
    .add("airtime", T.StringType())
    .add("id", T.StringType())
    .add("image", episode_image_schema)
    .add("name", T.StringType())
    .add("number", T.LongType())
    .add("runtime", T.LongType())
    .add("season", T.LongType())
    .add("summary", T.StringType())
    .add("url", T.StringType())
)

# Top-level schema: only the _embedded struct, which holds the array of
# episodes.
embedded_schema = T.StructType().add(
    "_embedded",
    T.StructType().add("episodes", T.ArrayType(episode_schema)),
)

In [122]:
# Read the show files again, this time with the explicit schema instead of
# schema inference. mode="FAILFAST" asks the reader to raise an error when a
# record cannot be parsed with that schema.
shows_with_schema = (
    spark.read
    .schema(embedded_schema)
    .options(multiLine=True, mode="FAILFAST")
    .json(root_path+'shows/shows-*.json')
)

shows_with_schema.printSchema()

root
 |-- _embedded: struct (nullable = true)
 |    |-- episodes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _links: struct (nullable = true)
 |    |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- airdate: date (nullable = true)
 |    |    |    |-- airstamp: timestamp (nullable = true)
 |    |    |    |-- airtime: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- medium: string (nullable = true)
 |    |    |    |    |-- original: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- number: long (nullable = true)
 |    |    |    |-- runtime: long (nullable = true)
 |    |    |    |-- season: long (nullable = true)
 |    |    |    |-- summary: string (nullable = true)
 |    |    |    |-- url: string (nullable = t

In [127]:
# Check that airdate and airstamp were parsed as date and timestamp values by
# exploding each nested array and showing its first rows.
for column in ('airdate', 'airstamp'):
    (
        shows_with_schema
        .select(F.explode(f"_embedded.episodes.{column}"))
        .show(n=5, truncate=False)
    )


+----------+
|col       |
+----------+
|1985-09-14|
|1985-09-21|
|1985-09-28|
|1985-10-05|
|1985-10-19|
+----------+
only showing top 5 rows

+-------------------+
|col                |
+-------------------+
|1985-09-14 18:00:00|
|1985-09-21 18:00:00|
|1985-09-28 18:00:00|
|1985-10-05 18:00:00|
|1985-10-19 18:00:00|
+-------------------+
only showing top 5 rows



### Getting to the "just right" data frame: Explode and collect

In [128]:
# explode(): one output row per episode, each paired with its show id.
episodes = shows.select(
    F.col("id"),
    F.explode(F.col("_embedded.episodes")).alias("episodes"),
)
episodes.show(n=5, truncate=70)

+---+----------------------------------------------------------------------+
| id|                                                              episodes|
+---+----------------------------------------------------------------------+
|143|{{{http://api.tvmaze.com/episodes/10897}}, 2014-04-06, 2014-04-07T0...|
|143|{{{http://api.tvmaze.com/episodes/10898}}, 2014-04-13, 2014-04-14T0...|
|143|{{{http://api.tvmaze.com/episodes/10899}}, 2014-04-20, 2014-04-21T0...|
|143|{{{http://api.tvmaze.com/episodes/10900}}, 2014-04-27, 2014-04-28T0...|
|143|{{{http://api.tvmaze.com/episodes/10901}}, 2014-05-04, 2014-05-05T0...|
+---+----------------------------------------------------------------------+
only showing top 5 rows



In [129]:
# posexplode(): build a map from episode id to episode name, then explode it
# into one row per entry together with the entry's position.
episode_name_id = (
    shows
    .select(
        F.map_from_arrays(
            F.col("_embedded.episodes.id"),
            F.col("_embedded.episodes.name"),
        ).alias("name_id")
    )
    # a map explodes into three columns: position, key and value
    .select(F.posexplode("name_id").alias("position", "id", "name"))
)

episode_name_id.show(5)

+--------+-----+--------------------+
|position|   id|                name|
+--------+-----+--------------------+
|       0|10897|Minimum Viable Pr...|
|       1|10898|       The Cap Table|
|       2|10899|Articles of Incor...|
|       3|10900|    Fiduciary Duties|
|       4|10901|      Signaling Risk|
+--------+-----+--------------------+
only showing top 5 rows



#### Note:
Both explode() and posexplode() will skip any null values in the array or the map. If you want to have null as records, you can use explode_outer() or posexplode_outer() the same way.

In [135]:
# collect_list() / collect_set(): gather rows back into an array column.

# First explode the episodes so that there is one row per episode.
episodes = shows.select(
    F.col("id"),
    F.explode(F.col("_embedded.episodes")).alias("episodes"),
)

# episodes.show()

# Then regroup the episode rows into a single array per show id.
collected = (
    episodes
    .groupBy("id")
    .agg(F.collect_list("episodes").alias("episodes"))
)

collected.printSchema()

root
 |-- id: long (nullable = true)
 |-- episodes: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- self: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |-- airdate: string (nullable = true)
 |    |    |-- airstamp: string (nullable = true)
 |    |    |-- airtime: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- image: struct (nullable = true)
 |    |    |    |-- medium: string (nullable = true)
 |    |    |    |-- original: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- number: long (nullable = true)
 |    |    |-- runtime: long (nullable = true)
 |    |    |-- season: long (nullable = true)
 |    |    |-- summary: string (nullable = true)
 |    |    |-- url: string (nullable = true)



### Struct as a function

In [136]:
# Build a struct column named info from two existing columns plus a literal
# boolean field called has_watched.
struct_ex = shows.select(
    F.struct(
        "status",
        "weight",
        F.lit(True).alias("has_watched"),
    ).alias("info")
)
struct_ex.show(n=1, truncate=False)

+-----------------+
|info             |
+-----------------+
|{Ended, 96, true}|
+-----------------+



### In class exercise

#### Exercise 6.6
Using three_shows, compute the time between the first and last episodes for each show. Which show had the longest tenure?

In [8]:
# Starting point for the exercise: reload the show files into three_shows.
three_shows=spark.read.json(root_path+'shows/shows-*.json', multiLine=True)
# Uncomment to inspect the schema:
#three_shows.printSchema()

#### Exercise 6.7
Take the shows data frame and extract the air date and name of each episode in two array columns.

In [9]:
# Starting point for the exercise: reload the show files into three_shows.
three_shows=spark.read.json(root_path+'shows/shows-*.json', multiLine=True)
# Uncomment to inspect the schema:
#three_shows.printSchema()

#### Exercise 6.8

Given the following data frame, create a new data frame that contains a single map from one to square: