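**Dealing with Nested Schemas:**

This notebook declares a nested (struct-typed) `fullname` column in two ways, programmatically with `StructType` and as a DDL string, and applies each schema both when reading a JSON file and when building a DataFrame from an in-memory Python list.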

In [None]:
# Session bootstrap: make the Spark installation importable, then create (or
# reuse) a local SparkSession with Hive support.
import findspark

# findspark.init must be *called*; the bare attribute reference `findspark.init`
# is a no-op expression. It has to run before pyspark is imported, which is why
# the imports in this cell are intentionally interleaved with the call.
findspark.init()

import getpass
from pyspark.sql import SparkSession

# Current OS user; used to build per-user paths under /Users/<username>/.
username = getpass.getuser()

spark = (
    SparkSession.builder
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", f"/Users/{username}/Documents/data/warehouse")
    .enableHiveSupport()
    .master("local")
    # Reuses an already-running session in this kernel if one exists.
    .getOrCreate()
)

In [5]:
from pyspark.sql.types import *

In [10]:
# Programmatic schema for the customer records: `fullname` is a nested struct
# holding the first and last name; every field is nullable (the default).
customer_schema = (
    StructType()
    .add("customer_id", IntegerType())
    .add(
        "fullname",
        StructType()
        .add("firstname", StringType())
        .add("lastname", StringType()),
    )
    .add("city", StringType())
)

In [11]:
# Read the customer JSON with the explicit StructType schema instead of letting
# Spark infer one. The path is built from `username` (set in the session-setup
# cell), matching how the warehouse directory is configured, rather than
# hard-coding a single developer's home directory.
df = (
    spark.read.format("json")
    .schema(customer_schema)
    .load(f"/Users/{username}/Documents/data/customer.json")
)

In [12]:
# Preview at most 5 rows to check that `fullname` was parsed as a nested struct.
df.show(5)

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1| {sumit, mittal}|bangalore|
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
+-----------+----------------+---------+



In [13]:
# Print the schema tree; customer_id is an integer here because customer_schema
# declares it as IntegerType().
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



In [14]:
# Same customer layout expressed as a DDL string; the nested column uses the
# struct<...> syntax. Adjacent literals are concatenated into a single string.
ddl_schema = (
    "customer_id long, "
    "fullname struct<firstname string, lastname string>, "
    "city string"
)

In [15]:
# Read the same customer JSON, this time applying the DDL-string schema.
# The path is built from `username` (set in the session-setup cell), matching
# how the warehouse directory is configured, rather than hard-coding a single
# developer's home directory.
df = (
    spark.read.format("json")
    .schema(ddl_schema)
    .load(f"/Users/{username}/Documents/data/customer.json")
)

In [16]:
# Display the DataFrame that was read using the DDL-string schema.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1| {sumit, mittal}|bangalore|
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
+-----------+----------------+---------+



In [17]:
# In-memory customer rows shaped as (customer_id, (firstname, lastname), city);
# the inner tuple lines up with the nested `fullname` struct of the schemas.
customer_list = [
    (customer_id, (firstname, lastname), city)
    for customer_id, firstname, lastname, city in (
        (1, "sumit", "mittal", "bangalore"),
        (2, "ram", "kumar", "hyderabad"),
        (3, "vijay", "shankar", "pune"),
    )
]

In [18]:
# DDL-string schema for the in-memory rows (re-declared so this section can be
# read on its own). Adjacent literals are concatenated into a single string.
ddl_schema = (
    "customer_id long, "
    "fullname struct<firstname string, lastname string>, "
    "city string"
)

In [19]:
# Build a DataFrame from the Python rows, typing the columns via the DDL string.
df = spark.createDataFrame(data=customer_list, schema=ddl_schema)

In [20]:
# Display the DataFrame built from customer_list with the DDL-string schema.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1| {sumit, mittal}|bangalore|
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
+-----------+----------------+---------+



In [21]:
# Print the schema tree; customer_id is a long here because ddl_schema
# declares it as `long`.
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



In [22]:
# StructType version of the customer schema (re-declared for the in-memory
# example): `fullname` is a nested struct of firstname/lastname, and every
# field is nullable (the default).
customer_schema = (
    StructType()
    .add("customer_id", IntegerType())
    .add(
        "fullname",
        StructType()
        .add("firstname", StringType())
        .add("lastname", StringType()),
    )
    .add("city", StringType())
)

In [23]:
# Build a DataFrame from the Python rows, typing the columns via the StructType.
df = spark.createDataFrame(data=customer_list, schema=customer_schema)

In [24]:
# Display the DataFrame built from customer_list with the StructType schema.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1| {sumit, mittal}|bangalore|
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
+-----------+----------------+---------+



In [25]:
# Print the schema tree; customer_id is an integer here because customer_schema
# declares it as IntegerType().
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)

