In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql.types import StructType,IntegerType,StringType,StructField

In [4]:
spark = SparkSession.builder.appName("getdata").getOrCreate()

In [10]:
data = [(("James","","Smith"),"36363","M",3100),(("Michael","","Rose"),"40288","M",4300),(("Robert","","Williams"),"42114","M",1400)]

In [15]:
structureschema = StructType([StructField("name",StructType([StructField("firstname",StringType(),True),StructField("middlename",StringType(),True),StructField("lastname",StringType(),True)])),StructField("id",StringType(),True),StructField("gender",StringType(),True),StructField("salary",IntegerType(),True)])

In [16]:
df = spark.createDataFrame(data=data,schema=structureschema)

In [17]:
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [18]:
df.show()

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36363|     M|  3100|
|   {Michael, , Rose}|40288|     M|  4300|
|{Robert, , Williams}|42114|     M|  1400|
+--------------------+-----+------+------+



In [21]:
df.select("name.firstname","name.lastname").show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|  Michael|    Rose|
|   Robert|Williams|
+---------+--------+



In [22]:
# we can change the struct of the exisiting data frame and add a new Struct Type to it

In [25]:
from pyspark.sql.functions import struct,col,when

updatedf = df.withColumn("OtherInfo",struct(col("id").alias("identifier"),col("gender").alias("gender"),
                                            col("salary").alias("salary"),
                                            when(col("salary").cast(IntegerType()) < 2000 ,"Low")
                                           
                                           )).drop("id","gender","salary")

In [26]:
updatedf.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: integer (nullable = true)
 |    |-- col4: string (nullable = true)



In [33]:
if "firstname" in df.columns:
     print("column exists")