In [9]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that the cells below read through.
spark = (
    SparkSession.builder
    .appName("nested_schema")
    .getOrCreate()
)

###### schema definition

In [27]:
# DDL-string form of the customer schema: `fullname` is a struct column with
# `first_name` and `last_name` string fields.
ddl_schema = (
    "customer_id long , "
    "fullname struct<first_name:string, last_name:string> ,"
    "city string"
)

In [58]:
# Import only the type classes this schema needs; a wildcard import hides where
# the names come from and floods the notebook namespace.
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Programmatic equivalent of the DDL schema string: three top-level columns,
# with `fullname` as a nested struct of two string fields.
struct_schema = StructType([
    StructField("customer_id", LongType()),
    StructField(
        "fullname",
        StructType([
            StructField("first_name", StringType()),
            StructField("last_name", StringType()),
        ]),
    ),
    StructField("city", StringType()),
])


###### multiline_nested_json_schema_enforcement

In [28]:
# Multi-line JSON read with the DDL-string schema supplied explicitly.
df = (
    spark.read
    .format("json")
    .schema(ddl_schema)
    .option("multiline", "true")
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\multiline_nested_json.json")
)

In [29]:
# Print the rows read from the multi-line JSON file with the DDL-string schema.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1|  {tarun, kumar}|bangalore|
|          2|  {sumit, kumar}|    delhi|
|          3| {manish, kumar}|   mumbai|
|          4|{ashwini, kumar}|  chennai|
+-----------+----------------+---------+



In [34]:
# Print the column tree to confirm the DDL-string schema was applied.
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- city: string (nullable = true)



In [59]:
# Same multi-line JSON file, this time with the StructType schema object.
df_again = (
    spark.read
    .format("json")
    .schema(struct_schema)
    .option("multiline", "true")
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\multiline_nested_json.json")
)

In [60]:
# Print the rows read with the StructType schema.
df_again.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1|  {tarun, kumar}|bangalore|
|          2|  {sumit, kumar}|    delhi|
|          3| {manish, kumar}|   mumbai|
|          4|{ashwini, kumar}|  chennai|
+-----------+----------------+---------+



In [61]:
# Print the column tree produced by the StructType schema.
df_again.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- city: string (nullable = true)



###### python_list_nested

In [62]:
# In-memory rows laid out as (customer_id, (first_name, last_name), city);
# the inner tuple fills the nested `fullname` struct.
customer_list = [
    (1, ("tarun", "kumar"), "bangalore"),
    (2, ("sumit", "kumar"), "delhi"),
    (3, ("manish", "kumar"), "mumbai"),
    (4, ("ashwini", "kumar"), "chennai"),
]

In [63]:
# Build a DataFrame from the Python list, typed by the DDL-string schema.
df_ls = spark.createDataFrame(data=customer_list, schema=ddl_schema)

In [64]:
# Print the rows of the list-backed DataFrame (DDL-string schema).
df_ls.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1|  {tarun, kumar}|bangalore|
|          2|  {sumit, kumar}|    delhi|
|          3| {manish, kumar}|   mumbai|
|          4|{ashwini, kumar}|  chennai|
+-----------+----------------+---------+



In [65]:
# Print the column tree of the list-backed DataFrame (DDL-string schema).
df_ls.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- city: string (nullable = true)



In [66]:
# Build a DataFrame from the same Python list, typed by the StructType schema.
df_ls_struct = spark.createDataFrame(data=customer_list, schema=struct_schema)

In [67]:
# Print the rows of the list-backed DataFrame (StructType schema).
df_ls_struct.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1|  {tarun, kumar}|bangalore|
|          2|  {sumit, kumar}|    delhi|
|          3| {manish, kumar}|   mumbai|
|          4|{ashwini, kumar}|  chennai|
+-----------+----------------+---------+



In [68]:
# Print the column tree of the list-backed DataFrame (StructType schema).
df_ls_struct.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- city: string (nullable = true)



###### line_delimited_nested_json_schema_enforcement

In [30]:
# Line-delimited JSON read with the DDL-string schema; no multiline option is
# set, so the reader's default line-by-line mode is used.
df1 = (
    spark.read
    .format("json")
    .schema(ddl_schema)
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\line_delimited_nested_json.json")
)

In [31]:
# Print the rows read from the line-delimited JSON file.
df1.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1|  {tarun, kumar}|bangalore|
|          2|  {sumit, kumar}|    delhi|
|          3| {manish, kumar}|   mumbai|
|          4|{ashwini, kumar}|  chennai|
+-----------+----------------+---------+



In [35]:
# Print the column tree of the line-delimited read (DDL-string schema).
df1.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- city: string (nullable = true)



###### line_delimited_nested_with_comma_json_schema_enforcement

In [32]:
# Same default line-by-line read with the DDL-string schema, pointed at the
# `with_comma` variant of the line-delimited file.
df3 = (
    spark.read
    .format("json")
    .schema(ddl_schema)
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\line_delimited_nested_with_comma.json")
)

In [33]:
# Print the rows read from the `with_comma` line-delimited file.
df3.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1|  {tarun, kumar}|bangalore|
|          2|  {sumit, kumar}|    delhi|
|          3| {manish, kumar}|   mumbai|
|          4|{ashwini, kumar}|  chennai|
+-----------+----------------+---------+



In [36]:
# Print the column tree of the `with_comma` read (DDL-string schema).
df3.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |-- city: string (nullable = true)



###### multiline_nested_json_infer_schema


In [37]:
# No schema is supplied, so Spark infers one by scanning the JSON input.
# "inferSchema" is a CSV reader option; the JSON source does not define it and
# ignores it, so it is omitted here and the result is unchanged.
df4 = (
    spark.read
    .format("json")
    .option("multiline", "true")
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\multiline_nested_json.json")
)

In [38]:
# Print the rows read from the multi-line JSON file with an inferred schema.
df4.show()

+---------+-----------+----------------+
|     city|customer_id|        fullname|
+---------+-----------+----------------+
|bangalore|          1|  {tarun, kumar}|
|    delhi|          2|  {sumit, kumar}|
|   mumbai|          3| {manish, kumar}|
|  chennai|          4|{ashwini, kumar}|
+---------+-----------+----------------+



In [39]:
# Print the inferred column tree for the multi-line JSON read.
df4.printSchema()

root
 |-- city: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)



###### line_delimited_nested_json_infer_schema


In [42]:
# Line-delimited JSON read without a schema: Spark infers it from the data.
# "inferSchema" is a CSV reader option; the JSON source does not define it and
# ignores it, so it is omitted here and the result is unchanged.
df5 = (
    spark.read
    .format("json")
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\line_delimited_nested_json.json")
)

In [43]:
# Print the rows read from the line-delimited JSON file with an inferred schema.
df5.show()

+---------+-----------+----------------+
|     city|customer_id|        fullname|
+---------+-----------+----------------+
|bangalore|          1|  {tarun, kumar}|
|    delhi|          2|  {sumit, kumar}|
|   mumbai|          3| {manish, kumar}|
|  chennai|          4|{ashwini, kumar}|
+---------+-----------+----------------+



In [44]:
# Print the inferred column tree for the line-delimited JSON read.
df5.printSchema()

root
 |-- city: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)



###### line_delimited_nested_with_comma_json_infer_schema

In [45]:
# `with_comma` line-delimited file read without a schema: Spark infers it.
# "inferSchema" is a CSV reader option; the JSON source does not define it and
# ignores it, so it is omitted here and the result is unchanged.
df6 = (
    spark.read
    .format("json")
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\data\line_delimited_nested_with_comma.json")
)

In [46]:
# Print the rows read from the `with_comma` file with an inferred schema.
df6.show()

+---------+-----------+----------------+
|     city|customer_id|        fullname|
+---------+-----------+----------------+
|bangalore|          1|  {tarun, kumar}|
|    delhi|          2|  {sumit, kumar}|
|   mumbai|          3| {manish, kumar}|
|  chennai|          4|{ashwini, kumar}|
+---------+-----------+----------------+



In [47]:
# Print the inferred column tree for the `with_comma` read.
df6.printSchema()

root
 |-- city: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)



In [69]:
# Shut down the Spark session now that all examples have run.
spark.stop()