In [None]:
'''
    - PySpark's StructType and StructField classes are used to programmatically specify the schema of a DataFrame and to create
      complex columns such as nested struct, array, and map columns.
    - StructType is a collection of StructFields, each of which defines a column name and a column data type.
'''


In [None]:
'''
# 1. StructType – Defines the structure of the DataFrame

    - PySpark provides the StructType class (from pyspark.sql.types import StructType) to define the structure of a DataFrame.
    - The printSchema() method of a DataFrame shows StructType columns as "struct".
'''

In [None]:
'''
# 2. StructField – Defines the metadata of a DataFrame column

    - PySpark provides the StructField class (from pyspark.sql.types import StructField) to define a column: its name (string),
      its type (DataType), whether it is nullable (boolean), and its metadata.
'''

In [None]:
# 3. Using PySpark StructType & StructField with DataFrame

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Start (or reuse) a local, single-threaded Spark session for the examples.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows laid out as (firstname, middlename, lastname, id, gender, salary).
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Five nullable string columns followed by one nullable integer column.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("firstname", "middlename", "lastname", "id", "gender")
    ]
    + [StructField("salary", IntegerType(), True)]
)

# Apply the explicit schema instead of letting Spark infer the column types.
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)

In [None]:
# 4. Defining Nested StructType object struct

# Each row nests the three name parts in an inner tuple, followed by id, gender and salary.
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# Build the schema field by field; the 'name' column is itself a StructType,
# which is what makes it a nested struct column.
structureSchema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('firstname', StringType(), True)
        .add('middlename', StringType(), True)
        .add('lastname', StringType(), True),
    )
    .add('id', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)

df2 = spark.createDataFrame(data=structureData, schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)

In [None]:
# 5. Adding & Changing struct of the DataFrame

from pyspark.sql.functions import col,struct,when

# Fold id, gender and salary into a single struct column named "OtherInfo",
# add a derived "Salary_Grade" field to it, then drop the original flat columns.
updatedDF = (
    df2
    .withColumn(
        "OtherInfo",
        struct(
            col("id").alias("identifier"),
            col("gender").alias("gender"),
            col("salary").alias("salary"),
            # Grade thresholds are evaluated in order: < 2000, then < 4000, else "High".
            when(col("salary").cast(IntegerType()) < 2000, "Low")
            .when(col("salary").cast(IntegerType()) < 4000, "Medium")
            .otherwise("High")
            .alias("Salary_Grade"),
        ),
    )
    .drop("id", "gender", "salary")
)

updatedDF.printSchema()
updatedDF.show(truncate=False)

In [None]:
# 6. Using SQL ArrayType and MapType

# ArrayType and MapType are imported here so this cell runs on a fresh kernel;
# the earlier import cell only brings in StructType, StructField, StringType
# and IntegerType.
from pyspark.sql.types import ArrayType, MapType

# Schema combining a nested struct ('name'), an array of strings ('hobbies')
# and a string-to-string map ('properties').
arrayStructureSchema = StructType([
    StructField('name', StructType([
       StructField('firstname', StringType(), True),
       StructField('middlename', StringType(), True),
       StructField('lastname', StringType(), True)
       ])),
       StructField('hobbies', ArrayType(StringType()), True),
       StructField('properties', MapType(StringType(),StringType()), True)
    ])

In [None]:
# 7. Creating StructType object struct from JSON file

import json

# Show the JSON representation of df2's (nested) schema.
print(df2.schema.json())

# Rebuild a StructType from that JSON. json() is a method and must be called
# to obtain the string that json.loads() expects. df2's schema is used because
# it matches the nested layout of structureData, which is loaded below.
schemaFromJson = StructType.fromJson(json.loads(df2.schema.json()))

# Create a DataFrame from the same rows using the round-tripped schema.
df3 = spark.createDataFrame(spark.sparkContext.parallelize(structureData),schemaFromJson)

df3.printSchema()

In [None]:
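# 8. Creating a StructType from a DDL string
# TODO: no example is provided for this section yet. Presumably a DDL-style string
# such as "firstname STRING, salary INT" can be passed as the `schema` argument of
# spark.createDataFrame() -- confirm against the PySpark documentation.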


In [None]:
# 9. Checking if a Column Exists in a DataFrame

# Check by name: fieldNames() is a method returning the column names as a list,
# so membership is tested with Python's `in` operator.
print("firstname" in df.schema.fieldNames())

# Check by full definition (name, data type and nullability): compare against
# the StructField objects held in the schema. StringType must be instantiated
# and the nullable flag is the Python boolean True.
print(StructField("firstname", StringType(), True) in df.schema.fields)

In [None]:
# 10. Complete Example of PySpark StructType & StructField

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,ArrayType,MapType
from pyspark.sql.functions import col,struct,when

# Obtain the local Spark session (an already running one is reused).
session_builder = SparkSession.builder.master("local[1]").appName('SparkByExamples.com')
spark = session_builder.getOrCreate()

# Flat sample records: firstname, middlename, lastname, id, gender, salary.
data = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Declare the columns one at a time; all are nullable.
schema = (
    StructType()
    .add("firstname", StringType(), True)
    .add("middlename", StringType(), True)
    .add("lastname", StringType(), True)
    .add("id", StringType(), True)
    .add("gender", StringType(), True)
    .add("salary", IntegerType(), True)
)

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)

# Rows whose first element is a (firstname, middlename, lastname) tuple,
# followed by id, gender and salary.
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# The 'name' column is a nested struct of three nullable string fields.
structureSchema = StructType([
    StructField(
        'name',
        StructType([
            StructField(name_part, StringType(), True)
            for name_part in ('firstname', 'middlename', 'lastname')
        ]),
    ),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

df2 = spark.createDataFrame(data=structureData, schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)


# Replace the flat id/gender/salary columns with one "OtherInfo" struct column
# that also carries a salary grade derived from the salary value.
updatedDF = (
    df2
    .withColumn(
        "OtherInfo",
        struct(
            col("id").alias("identifier"),
            col("gender").alias("gender"),
            col("salary").alias("salary"),
            # First matching condition wins; anything not below 4000 is "High".
            when(col("salary").cast(IntegerType()) < 2000, "Low")
            .when(col("salary").cast(IntegerType()) < 4000, "Medium")
            .otherwise("High")
            .alias("Salary_Grade"),
        ),
    )
    .drop("id", "gender", "salary")
)

updatedDF.printSchema()
updatedDF.show(truncate=False)


""" Array & Map"""


# Schema with a nested struct ('name'), an array of strings ('hobbies')
# and a string-to-string map ('properties').
arrayStructureSchema = StructType([
    StructField(
        'name',
        StructType([
            StructField(name_part, StringType(), True)
            for name_part in ('firstname', 'middlename', 'lastname')
        ]),
    ),
    StructField('hobbies', ArrayType(StringType()), True),
    StructField('properties', MapType(StringType(), StringType()), True),
])