In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the OS-level user running the notebook so the warehouse directory
# below follows that user instead of being pinned to one hard-coded account.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled Spark session on YARN.
spark = (
    SparkSession.builder
    # NOTE(review): port '0' presumably lets Spark pick any free UI port — confirm.
    .config("spark.ui.port", "0")
    # Previously an f-string with no placeholder and a fixed user id; `username`
    # was computed but never used.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

### Creating nested schema

In [2]:
# DDL-style schema string: fullname is a struct holding two string fields.
ddlSchema = (
    "customer_id long, "
    "fullname struct<firstname:string, lastname:string>, "
    "city string"
)

In [3]:
# Load the nested customer JSON files with the explicit DDL schema string.
df = (
    spark.read
    .format("json")
    .schema(ddlSchema)
    .load("/public/trendytech/datasets/customer_nested/*")
)

In [4]:
# Print the loaded rows as a text table; fullname renders as a {firstname, lastname} struct.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
|          1| {sumit, mittal}|bangalore|
+-----------+----------------+---------+



In [5]:
# Print the column tree to confirm firstname/lastname are nested under fullname.
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



In [7]:
from pyspark.sql.types import *

In [23]:
# Nested schema built field by field; fullname is itself a StructType
# made of two string fields.
customer_schema = (
    StructType()
    .add("customer_id", LongType())
    .add(
        "fullname",
        StructType()
        .add("firstname", StringType())
        .add("lastname", StringType()),
    )
    .add("city", StringType())
)

In [24]:
# Load the nested customer JSON files with the StructType-based schema.
df = (
    spark.read
    .format("json")
    .schema(customer_schema)
    .load("/public/trendytech/datasets/customer_nested/*")
)

In [25]:
# Print the rows read with the StructType schema as a text table.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
|          1| {sumit, mittal}|bangalore|
+-----------+----------------+---------+



In [26]:
# Print the column tree for the StructType-based read; fullname should show two nested string fields.
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



In [31]:
# Rows shaped (customer_id, (firstname, lastname), city): the inner tuple
# groups the two name parts into a single nested value.
customer_list = [
    (customer_id, (firstname, lastname), city)
    for customer_id, firstname, lastname, city in (
        (1, "sumit", "mittal", "bangalore"),
        (2, "ram", "kumar", "hyderabad"),
        (3, "vijay", "shankar", "pune"),
    )
]

In [32]:
# DDL-style schema string for the in-memory rows: fullname is a struct of two strings.
ddlSchema = (
    "customer_id long, "
    "fullname struct<firstname:string, lastname:string>, "
    "city string"
)

In [33]:
# Build the DataFrame from the local Python rows, typed by the DDL schema string.
df = spark.createDataFrame(
    data=customer_list,
    schema=ddlSchema,
)

In [34]:
# Print the rows of the DataFrame built from the local list with the DDL schema.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1| {sumit, mittal}|bangalore|
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
+-----------+----------------+---------+



In [35]:
# Three-column schema expressed with StructField objects; the fullname
# field carries its own StructType with firstname and lastname strings.
customer_schema = StructType(
    [
        StructField("customer_id", LongType()),
        StructField(
            "fullname",
            StructType(
                [
                    StructField("firstname", StringType()),
                    StructField("lastname", StringType()),
                ]
            ),
        ),
        StructField("city", StringType()),
    ]
)

In [36]:
# Build the DataFrame from the local Python rows, typed by the StructType schema.
df = spark.createDataFrame(
    data=customer_list,
    schema=customer_schema,
)

In [37]:
# Print the rows of the DataFrame built from the local list with the StructType schema.
df.show()

+-----------+----------------+---------+
|customer_id|        fullname|     city|
+-----------+----------------+---------+
|          1| {sumit, mittal}|bangalore|
|          2|    {ram, kumar}|hyderabad|
|          3|{vijay, shankar}|     pune|
+-----------+----------------+---------+

