In [1]:
# Let findspark locate the Spark installation; this must run before anything is
# imported from pyspark.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Start (or reuse) a local session named 'json' running on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('json')
    .getOrCreate()
)

In [2]:
# Read the restaurant dump as multi-line JSON (the records span several lines, so the
# default one-object-per-line mode would not parse them).
# No 'inferSchema' option is passed: that is a CSV reader option. The JSON source always
# infers the schema when none is supplied, so the option was silently ignored.
df = spark.read.json('resturant_json_data.json', multiLine=True)

In [3]:
# Print a tabular preview of the parsed JSON; long nested values are truncated in the output.
df.show()

+----+-------+--------------------+-------------+-------------+-------------+------+
|code|message|         restaurants|results_found|results_shown|results_start|status|
+----+-------+--------------------+-------------+-------------+-------------+------+
|NULL|   NULL|                  []|            0|            0|            1|  NULL|
|NULL|   NULL|[{{{17066603}, b9...|         6835|           20|            1|  NULL|
|NULL|   NULL|                  []|            0|            0|            1|  NULL|
|NULL|   NULL|                  []|            0|            0|            1|  NULL|
|NULL|   NULL|[{{{17093124}, b9...|         8680|           20|            1|  NULL|
|NULL|   NULL|                  []|            0|            0|            1|  NULL|
|NULL|   NULL|                  []|            0|            0|            1|  NULL|
|NULL|   NULL|[{{{17580142}, b9...|          943|           20|            1|  NULL|
|NULL|   NULL|                  []|            0|            0|  

In [4]:
# Inspect the inferred schema to see how the nested JSON was mapped to structs and arrays.
df.printSchema()

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- restaurants: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- restaurant: struct (nullable = true)
 |    |    |    |-- R: struct (nullable = true)
 |    |    |    |    |-- res_id: long (nullable = true)
 |    |    |    |-- apikey: string (nullable = true)
 |    |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |    |-- cuisines: string (nullable = true)
 |    |    |    |-- currency: string (nullable = true)
 |    |    |    |-- deeplink: string (nullable = true)
 |    |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- events_url: string (nullable = true)
 |    |    |    |-- featured_image: string (nullable = true)
 |    |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |    |-- has_table_booking: long (nullable = true)
 |    |    |    |-- i

In [8]:
# 'restaurants' is an ArrayType column, so flatten it with explode: each array element
# becomes its own row in 'new_restuarants', and the original array column is dropped.
from pyspark.sql.functions import explode

df1 = (
    df
    .withColumn('new_restuarants', explode(df['restaurants']))
    .drop('restaurants')
)
df1.printSchema()

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- results_found: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- new_restuarants: struct (nullable = true)
 |    |-- restaurant: struct (nullable = true)
 |    |    |-- R: struct (nullable = true)
 |    |    |    |-- res_id: long (nullable = true)
 |    |    |-- apikey: string (nullable = true)
 |    |    |-- average_cost_for_two: long (nullable = true)
 |    |    |-- cuisines: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- deeplink: string (nullable = true)
 |    |    |-- establishment_types: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- events_url: string (nullable = true)
 |    |    |-- featured_image: string (nullable = true)
 |    |    |-- has_online_delivery: long (nullable = true)
 |    |    |-- has_tab

In [15]:
# As the schema shows, each 'new_restuarants' struct just wraps a 'restaurant' struct,
# so lift the inner struct into its own column and drop the wrapper to remove one
# level of nesting.
df2 = (
    df1
    .withColumn('Restuarant_Data', df1['new_restuarants'].getField('restaurant'))
    .drop(df1['new_restuarants'])
)

In [17]:
# Check the schema of df2: the restaurant fields should now sit directly under Restuarant_Data.
df2.printSchema()

root
 |-- code: long (nullable = true)
 |-- message: string (nullable = true)
 |-- results_found: long (nullable = true)
 |-- results_shown: long (nullable = true)
 |-- results_start: string (nullable = true)
 |-- status: string (nullable = true)
 |-- Restuarant_Data: struct (nullable = true)
 |    |-- R: struct (nullable = true)
 |    |    |-- res_id: long (nullable = true)
 |    |-- apikey: string (nullable = true)
 |    |-- average_cost_for_two: long (nullable = true)
 |    |-- cuisines: string (nullable = true)
 |    |-- currency: string (nullable = true)
 |    |-- deeplink: string (nullable = true)
 |    |-- establishment_types: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- events_url: string (nullable = true)
 |    |-- featured_image: string (nullable = true)
 |    |-- has_online_delivery: long (nullable = true)
 |    |-- has_table_booking: long (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- is_delivering_now: long (nul

In [23]:
# Promote every field of the Restuarant_Data struct to a top-level column.
res_data = df2.selectExpr('Restuarant_Data.*')
res_data.printSchema()

root
 |-- R: struct (nullable = true)
 |    |-- res_id: long (nullable = true)
 |-- apikey: string (nullable = true)
 |-- average_cost_for_two: long (nullable = true)
 |-- cuisines: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- deeplink: string (nullable = true)
 |-- establishment_types: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- events_url: string (nullable = true)
 |-- featured_image: string (nullable = true)
 |-- has_online_delivery: long (nullable = true)
 |-- has_table_booking: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_delivering_now: long (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- city_id: long (nullable = true)
 |    |-- country_id: long (nullable = true)
 |    |-- latitude: string (nullable = true)
 |    |-- locality: string (nullable = true)
 |    |-- locality_verbose: string (nullable = t

In [35]:
# The restaurant record has many nested levels, and flattening all of them would take time.
# Instead, pick only the columns we need and reach into the structs that hold them:
# res_id (inside R), name, currency, has_table_booking, city and country_id (inside location)
# and aggregate_rating (inside user_rating). The 'offers' array is carried along unchanged.
res_data1 = res_data.select(
    res_data['R']['res_id'].alias('Res_ID'),
    'name',
    'currency',
    'has_table_booking',
    res_data['location']['city'].alias('city'),
    res_data['location']['country_id'].alias('country_id'),
    res_data['user_rating']['aggregate_rating'].alias('Rating'),
    'offers',
)


In [36]:
# Preview the flattened selection of restaurant columns.
res_data1.show()

+--------+--------------------+--------+-----------------+-------+----------+------+------+
|  Res_ID|                name|currency|has_table_booking|   city|country_id|Rating|offers|
+--------+--------------------+--------+-----------------+-------+----------+------+------+
|17066603|            The Coop|       $|                0|Orlando|       216|   3.6|    []|
|17059541|Maggiano's Little...|       $|                0|Orlando|       216|   4.4|    []|
|17064405|Tako Cheena by Po...|       $|                0|Orlando|       216|   4.4|    []|
|17057797|Bosphorous Turkis...|       $|                0|Orlando|       216|   4.2|    []|
|17057591|Bahama Breeze Isl...|       $|                0|Orlando|       216|   4.3|    []|
|17064266|Hawkers Asian Str...|       $|                0|Orlando|       216|   4.6|    []|
|17060516|Seasons 52 Fresh ...|       $|                0|Orlando|       216|   4.4|    []|
|17060320|Raglan Road Irish...|       $|                0|Orlando|       216|   

In [38]:
# Note: struct fields are usually accessed with dot notation, while arrays are usually
# exploded, depending on the data. To read a single array element, use indexing or getItem().
# 'offers' is still an array column, so let's try to explode it.
res_data1.select('*', explode(res_data1['offers'])).show()

+------+----+--------+-----------------+----+----------+------+------+---+
|Res_ID|name|currency|has_table_booking|city|country_id|Rating|offers|col|
+------+----+--------+-----------------+----+----------+------+------+---+
+------+----+--------+-----------------+----+----------+------+------+---+



In [40]:
# The plain explode returned no rows because the 'offers' array is empty ([]) for every
# restaurant: explode emits one row per array element, so a null or empty array
# contributes no rows at all.
# explode_outer keeps those rows and yields a single NULL for them instead.
from pyspark.sql.functions import explode_outer

res_data1.select(
    '*',
    explode_outer(res_data1['offers']).alias('Offers_newcolumn'),
).show()

# Now every restaurant row is retained, with Offers_newcolumn set to NULL.

+--------+--------------------+--------+-----------------+-------+----------+------+------+----------------+
|  Res_ID|                name|currency|has_table_booking|   city|country_id|Rating|offers|Offers_newcolumn|
+--------+--------------------+--------+-----------------+-------+----------+------+------+----------------+
|17066603|            The Coop|       $|                0|Orlando|       216|   3.6|    []|            NULL|
|17059541|Maggiano's Little...|       $|                0|Orlando|       216|   4.4|    []|            NULL|
|17064405|Tako Cheena by Po...|       $|                0|Orlando|       216|   4.4|    []|            NULL|
|17057797|Bosphorous Turkis...|       $|                0|Orlando|       216|   4.2|    []|            NULL|
|17057591|Bahama Breeze Isl...|       $|                0|Orlando|       216|   4.3|    []|            NULL|
|17064266|Hawkers Asian Str...|       $|                0|Orlando|       216|   4.6|    []|            NULL|
|17060516|Seasons 5