In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession via the builder, registering the
# application under the name "StructType vs MapType".
spark = (
    SparkSession.builder
    .appName("StructType vs MapType")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/26 15:25:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


24/02/26 15:26:11 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
from pyspark.sql.types import (
    StringType,
    StructType,
    IntegerType,
    StructField,
    MapType,
    ArrayType,
)

In [3]:
# Sample rows for the nested-struct example.
# Each row is ((first, middle, last), id, department): the three name parts
# are grouped into an inner tuple so they line up with a nested struct column.
structureData = [
    ((first, middle, last), emp_id, department)
    for first, middle, last, emp_id, department in [
        ("James", "Will", "Smith", 111, "HR"),
        ("Michael", "Rose", "Dan", 222, "SALES"),
        ("Robert", "Ray", "Williams", 333, "IT"),
        ("Maria", "Anne", "Jones", 444, "IT"),
        ("Jen", "Mary", "Brown", 555, "HR"),
    ]
]

In [4]:
# Nested schema: the "Name" column is itself a struct of three string fields,
# followed by the flat "ID" and "Department" columns.
# FirstName is the only field declared non-nullable.
structureSchema = StructType(
    fields=[
        StructField(
            name="Name",
            dataType=StructType(
                fields=[
                    StructField(name="FirstName", dataType=StringType(), nullable=False),
                    StructField(name="MiddleName", dataType=StringType(), nullable=True),
                    StructField(name="LastName", dataType=StringType(), nullable=True),
                ]
            ),
            nullable=True,
        ),
        StructField(name="ID", dataType=IntegerType(), nullable=True),
        StructField(name="Department", dataType=StringType(), nullable=True),
    ]
)

In [5]:
# Create a DataFrame from the sample rows using the nested StructType schema.
dfNested = spark.createDataFrame(data=structureData, schema=structureSchema)
# truncate=False prints each cell in full instead of cutting long values short.
dfNested.show(truncate=False)
# Print the schema tree to confirm the nested Name struct and field nullability.
dfNested.printSchema()

                                                                                

+-----------------------+---+----------+
|Name                   |ID |Department|
+-----------------------+---+----------+
|{James, Will, Smith}   |111|HR        |
|{Michael, Rose, Dan}   |222|SALES     |
|{Robert, Ray, Williams}|333|IT        |
|{Maria, Anne, Jones}   |444|IT        |
|{Jen, Mary, Brown}     |555|HR        |
+-----------------------+---+----------+

root
 |-- Name: struct (nullable = true)
 |    |-- FirstName: string (nullable = false)
 |    |-- MiddleName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Department: string (nullable = true)



##### MapType

In [7]:
# Sample rows for the MapType example: (id, name, utilities).
# `utilities` maps an appliance name to its brand; the set of keys differs per
# row, and the last row has no map at all (None).
# NOTE(review): "Samssung" looks like a typo for "Samsung"; it is kept as-is so
# the value stays identical to what the recorded cell output shows.
data = [
    (
        111,
        "Mike",
        {
            "TV": "LG",
            "Refrigerator": "Samssung",
            "Oven": "Philips",
            "AC": "Voltas",
        },
    ),
    (
        222,
        "David",
        {
            "AC": "Samsung",
            "Washing machine": "LG",
        },
    ),
    (
        333,
        "Thomas",
        {
            "TV": "Croma",
        },
    ),
    (
        444,
        "Williams",
        None,
    ),
]

# Schema for the MapType example: two scalar columns plus a "Utilities" column
# holding a string -> string map whose values may be null.
schema = StructType(
    fields=[
        StructField(name="ID", dataType=IntegerType(), nullable=True),
        StructField(name="Name", dataType=StringType(), nullable=True),
        StructField(
            name="Utilities",
            dataType=MapType(
                keyType=StringType(),
                valueType=StringType(),
                valueContainsNull=True,
            ),
            nullable=True,
        ),
    ]
)

In [8]:
# Create a DataFrame whose Utilities column is a MapType (string -> string).
df = spark.createDataFrame(data=data, schema=schema)
# truncate=False keeps the long Utilities map fully visible in the output.
df.show(truncate=False)
# Print the schema tree to confirm Utilities is a map with string keys and values.
df.printSchema()

+---+--------+-------------------------------------------------------------------+
|ID |Name    |Utilities                                                          |
+---+--------+-------------------------------------------------------------------+
|111|Mike    |{Refrigerator -> Samssung, AC -> Voltas, TV -> LG, Oven -> Philips}|
|222|David   |{AC -> Samsung, Washing machine -> LG}                             |
|333|Thomas  |{TV -> Croma}                                                      |
|444|Williams|NULL                                                               |
+---+--------+-------------------------------------------------------------------+

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Utilities: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

