In [1]:
import os, sys
# Environment for the PySpark/YARN client; set before the Spark session is
# created further down. Hadoop and YARN share one configuration directory, and
# the same interpreter name is given for PYSPARK_PYTHON and the driver.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
# Spark configuration: run on YARN under the application name 'StructType'.
conf = SparkConf()
conf.setMaster('yarn')
conf.setAppName('StructType')

# Obtain a session built from that configuration (an already-running session
# is reused by getOrCreate).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import concat_ws, expr, struct, col, lit

# A struct expression over three columns. It is only a Column expression at
# this point -- no DataFrame is involved -- so printing shows its description.
struct_col = struct(col('f1'), col('f2'), col('f3'))
print(struct_col)

Column<'struct(f1, f2, f3)'>


In [4]:
# Three-row sample DataFrame with an int, a string and a boolean column.
df = spark.createDataFrame(
    data=[
        (1, 'aa', True),
        (2, 'bb', False),
        (3, 'cc', True),
    ],
    schema="f1: int, f2: string, f3: boolean",
)

# Keep every original column and append the struct built from them
# (struct_col comes from the previous cell).
df2 = df.select('*', struct_col.alias('struct_data'))
df2.show(truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+---+---+-----+--------------+
|f1 |f2 |f3   |struct_data   |
+---+---+-----+--------------+
|1  |aa |true |{1, aa, true} |
|2  |bb |false|{2, bb, false}|
|3  |cc |true |{3, cc, true} |
+---+---+-----+--------------+



                                                                                

In [5]:
# Struct fields do not have to be plain columns: here the struct mixes an
# existing column, a literal, and a SQL CASE expression, the latter two named
# via alias().
my_const = lit('const').alias('my_const')
parity = expr("case when f1%2 = 0 then 'even' else 'odd' end").alias('parity')
struct_col = struct('f1', my_const, parity)

df3 = df.withColumn('struct_data', struct_col)
df3.show(truncate=False)
# The schema shows the nested field names and their nullability.
df3.printSchema()

+---+---+-----+----------------+
|f1 |f2 |f3   |struct_data     |
+---+---+-----+----------------+
|1  |aa |true |{1, const, odd} |
|2  |bb |false|{2, const, even}|
|3  |cc |true |{3, const, odd} |
+---+---+-----+----------------+

root
 |-- f1: integer (nullable = true)
 |-- f2: string (nullable = true)
 |-- f3: boolean (nullable = true)
 |-- struct_data: struct (nullable = false)
 |    |-- f1: integer (nullable = true)
 |    |-- my_const: string (nullable = false)
 |    |-- parity: string (nullable = false)



In [6]:
# Spark 3.5+ only (named_struct was added to pyspark.sql.functions in 3.5.0)
# import pyspark.sql.functions as f
# struct_col = f.named_struct(lit('f1_renamed'), 'f1',
#                             lit('f2_renamed'), 'f2', 
#                             lit('f3_renamed'), 'f3')
# df4 = df.withColumn('struct_data', struct_col)
# df4.show(truncate=False)
# df4.printSchema()

In [7]:
struct_col = struct('f1', 'f2', 'f3')
df5 = df.withColumn('struct_data', struct_col)

# Four ways to pull a field out of a struct column:
#   1. a dotted path string (the result column is named after the field, so it
#      duplicates the top-level 'f1' name in the output),
#   2. a SQL expression,
#   3. col() with a dotted path,
#   4. item access on the DataFrame column.
extracted = [
    'struct_data.f1',
    expr('struct_data.f1').alias('f1_extracted'),
    col('struct_data.f2').alias('f2_extracted'),
    df5['struct_data']['f3'].alias('f3_extracted'),
]
df5 = df5.select('f1', 'f2', 'f3', 'struct_data', *extracted)

df5.show(truncate=False)

+---+---+-----+--------------+---+------------+------------+------------+
|f1 |f2 |f3   |struct_data   |f1 |f1_extracted|f2_extracted|f3_extracted|
+---+---+-----+--------------+---+------------+------------+------------+
|1  |aa |true |{1, aa, true} |1  |1           |aa          |true        |
|2  |bb |false|{2, bb, false}|2  |2           |bb          |false       |
|3  |cc |true |{3, cc, true} |3  |3           |cc          |true        |
+---+---+-----+--------------+---+------------+------------+------------+



In [8]:
struct_col = struct('f1', 'f2', 'f3')

# getField() works on the struct expression itself, before it is attached to a
# DataFrame; the three fields are joined with '_' into a single string column.
f_concat = concat_ws(
    '_',
    *[struct_col.getField(field_name) for field_name in ('f1', 'f2', 'f3')],
)

df6 = df.withColumn('concatenated_fields', f_concat)
df6.show(truncate=False)

+---+---+-----+-------------------+
|f1 |f2 |f3   |concatenated_fields|
+---+---+-----+-------------------+
|1  |aa |true |1_aa_true          |
|2  |bb |false|2_bb_false         |
|3  |cc |true |3_cc_true          |
+---+---+-----+-------------------+



In [9]:
struct_col = struct('f1', 'f2', 'f3')

# withField() replaces a field when the name already exists ('f2') and appends
# a new one otherwise ('f4').
f2_modified = concat_ws('_', struct_col.getField('f2'), lit('modified'))
struct_col_new = (
    struct_col
    .withField('f2', f2_modified)
    .withField('f4', lit('added'))
)

# Show the original struct and the modified struct side by side.
df7 = (
    df
    .withColumn('struct_col', struct_col)
    .withColumn('struct_col_new', struct_col_new)
)
df7.show(truncate=False)

+---+---+-----+--------------+------------------------------+
|f1 |f2 |f3   |struct_col    |struct_col_new                |
+---+---+-----+--------------+------------------------------+
|1  |aa |true |{1, aa, true} |{1, aa_modified, true, added} |
|2  |bb |false|{2, bb, false}|{2, bb_modified, false, added}|
|3  |cc |true |{3, cc, true} |{3, cc_modified, true, added} |
+---+---+-----+--------------+------------------------------+



In [11]:
# Stop the Spark session created at the top of the notebook.
spark.stop()