In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkContext


# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("Split Array Elements into Separate Columns")
    .getOrCreate()
)
# Handle to the underlying SparkContext, used below to parallelize sample rows.
sc = spark.sparkContext

- Create a sample DataFrame

In [5]:
# Demo data: a string key plus an integer array whose length varies per row
# (one array also contains a null element).
df = spark.createDataFrame(
    data=sc.parallelize(
        [
            ["ABC", [1, 2, 3]],
            ["XYZ", [2, None, 4]],
            ["KLM", [8, 7]],
            ["IJK", [5]],
        ]
    ),
    schema=["key", "value"],
)
df.show()

                                                                                

+---+------------+
|key|       value|
+---+------------+
|ABC|   [1, 2, 3]|
|XYZ|[2, NULL, 4]|
|KLM|      [8, 7]|
|IJK|         [5]|
+---+------------+



- Split array values into separate columns (manually, one index at a time)

In [6]:
# Manual approach: address each array position by index. As the output shows,
# positions past the end of a shorter array come back as NULL.
df.select(
    "key",
    df["value"][0],
    df["value"][1],
    df["value"][2],
).show()

+---+--------+--------+--------+
|key|value[0]|value[1]|value[2]|
+---+--------+--------+--------+
|ABC|       1|       2|       3|
|XYZ|       2|    NULL|       4|
|KLM|       8|       7|    NULL|
|IJK|       5|    NULL|    NULL|
+---+--------+--------+--------+



#### How to automate the solution?

- Determine the size of each array

In [7]:
from pyspark.sql.functions import size, col

In [8]:
# Keep the original columns and add the element count of each row's array.
dfSize = df.select(
    df["key"],
    df["value"],
    size(df["value"]).alias("NoOfArrayElements"),
)
dfSize.show()

+---+------------+-----------------+
|key|       value|NoOfArrayElements|
+---+------------+-----------------+
|ABC|   [1, 2, 3]|                3|
|XYZ|[2, NULL, 4]|                3|
|KLM|      [8, 7]|                2|
|IJK|         [5]|                1|
+---+------------+-----------------+



- Get the maximum size of all arrays

In [11]:
# Global (ungrouped) maximum of the per-row array sizes; the aggregate yields
# a single row with a single column, so [0][0] extracts the scalar.
max_value = (
    dfSize.groupBy()
    .max("NoOfArrayElements")
    .collect()[0][0]
)
max_value

3

- Helper function to convert array elements into columns (a plain Python function operating on the DataFrame, not a Spark UDF)

In [12]:
def arraySplitIntoCols(df, maxElements, column="value", prefix="new_col_"):
    """Expand the leading elements of an array column into separate columns.

    Args:
        df: DataFrame that contains the array column.
        maxElements: Number of leading array positions to extract, typically
            the largest array size found in ``column``. ``None`` (the result
            of a ``max`` aggregate over an empty DataFrame) and non-positive
            values add no columns.
        column: Name of the array column to split. Defaults to ``"value"``.
        prefix: Prefix for the generated column names; position ``i`` is
            written to ``f"{prefix}{i}"``. Defaults to ``"new_col_"``.

    Returns:
        ``df`` with one additional column per extracted position, or ``df``
        itself when there is nothing to extract. An existing column with the
        same name as a generated one is replaced.
    """
    # range(None) would raise TypeError; an empty input simply has no
    # positions to split out.
    if maxElements is None or maxElements <= 0:
        return df

    # Add all columns in one projection. Chaining withColumn in a loop adds a
    # projection per call and can produce very large query plans.
    element_cols = {f"{prefix}{i}": df[column][i] for i in range(maxElements)}
    return df.withColumns(element_cols)

- Call the helper function

In [13]:
# Split the arrays into as many columns as the longest array requires.
dfOut = arraySplitIntoCols(df=df, maxElements=max_value)
dfOut.show()

+---+------------+---------+---------+---------+
|key|       value|new_col_0|new_col_1|new_col_2|
+---+------------+---------+---------+---------+
|ABC|   [1, 2, 3]|        1|        2|        3|
|XYZ|[2, NULL, 4]|        2|     NULL|        4|
|KLM|      [8, 7]|        8|        7|     NULL|
|IJK|         [5]|        5|     NULL|     NULL|
+---+------------+---------+---------+---------+

