## Explode

In [0]:
# Sample rows: (student name, list of per-subject score records).
# Each record is a plain Python dict; show() below renders these as
# `key -> value` maps rather than as structs.
data = [
    ("Alice", [
        {"subject": "Math", "score": 90},
        {"subject": "English", "score": 85},
    ]),
    ("Bob", [
        {"subject": "Math", "score": 88},
        {"subject": "English", "score": 92},
    ]),
    ("Charlie", [
        {"subject": "Math", "score": 78},
    ]),
    ("David", [
        {"subject": "Math", "score": 95},
        {"subject": "English", "score": 89},
    ]),
]


In [0]:
# Column names only; no column types are given here
schema = [
    "name",
    "subjects_and_scores",
]

In [0]:
# Build the DataFrame from the sample rows and the column-name list
df = spark.createDataFrame(data=data, schema=schema)

In [0]:
# Print the DataFrame; truncate=False keeps the long nested values from being cut off
df.show(truncate=False)

+-------+-------------------------------------------------------------------+
|name   |subjects_and_scores                                                |
+-------+-------------------------------------------------------------------+
|Alice  |[{score -> 90, subject -> Math}, {score -> 85, subject -> English}]|
|Bob    |[{score -> 88, subject -> Math}, {score -> 92, subject -> English}]|
|Charlie|[{score -> 78, subject -> Math}]                                   |
|David  |[{score -> 95, subject -> Math}, {score -> 89, subject -> English}]|
+-------+-------------------------------------------------------------------+



In [0]:
# Brings explode, element_at and col (used below) into scope.
# NOTE(review): the wildcard form also rebinds Python builtins such as
# sum/min/max/round to their Spark column-function versions; prefer importing
# only the needed names once nothing else relies on the wildcard.
from pyspark.sql.functions import *

In [0]:

# explode() emits one row per element of the list column, repeating `name`
# for each element; the element is exposed as `subject_score`.
exploded_df = df.select(
    "name",
    explode("subjects_and_scores").alias("subject_score"),
)

In [0]:
# Print the exploded rows in full (one row per student/subject entry)
exploded_df.show(truncate=False)

+-------+---------------------------------+
|name   |subject_score                    |
+-------+---------------------------------+
|Alice  |{score -> 90, subject -> Math}   |
|Alice  |{score -> 85, subject -> English}|
|Bob    |{score -> 88, subject -> Math}   |
|Bob    |{score -> 92, subject -> English}|
|Charlie|{score -> 78, subject -> Math}   |
|David  |{score -> 95, subject -> Math}   |
|David  |{score -> 89, subject -> English}|
+-------+---------------------------------+



In [0]:
# Lift the `score` and `subject` entries of each exploded record into
# top-level columns using dotted paths, then print the result.
exploded_df.select(
    "name",
    "subject_score.score",
    "subject_score.subject",
).show()

+-------+-----+-------+
|   name|score|subject|
+-------+-----+-------+
|  Alice|   90|   Math|
|  Alice|   85|English|
|    Bob|   88|   Math|
|    Bob|   92|English|
|Charlie|   78|   Math|
|  David|   95|   Math|
|  David|   89|English|
+-------+-----+-------+



In [0]:
# Same projection as the previous cell, kept in a variable for reuse
df_select = exploded_df.select(
    "name",
    "subject_score.score",
    "subject_score.subject",
)

In [0]:
# Print the flattened name/score/subject rows without truncation
df_select.show(truncate=False)


+-------+-----+-------+
|name   |score|subject|
+-------+-----+-------+
|Alice  |90   |Math   |
|Alice  |85   |English|
|Bob    |88   |Math   |
|Bob    |92   |English|
|Charlie|78   |Math   |
|David  |95   |Math   |
|David  |89   |English|
+-------+-----+-------+



In [0]:
# One student whose single record holds a nested list of scores
data1 = [
    (
        "Alice",
        [{"scores": [90, 100, 120]}],
    ),
]

In [0]:
# Column names for the nested-scores example
schema1 = [
    "name",
    "scores",
]

In [0]:
# Build the nested-scores DataFrame
df1 = spark.createDataFrame(data=data1, schema=schema1)

In [0]:
# Print df1 in full so the nested list inside `scores` stays visible
df1.show(truncate=False)

+-----+----------------------------+
|name |scores                      |
+-----+----------------------------+
|Alice|[{scores -> [90, 100, 120]}]|
+-----+----------------------------+



In [0]:
# Explode the outer list. No alias is given, so the new column gets
# Spark's default name `col` (see the output below).
df2 = df1.select(
    "name",
    explode("scores"),
)

In [0]:
# Print the exploded record; the generated column is named `col`
df2.show(truncate=False)

+-----+--------------------------+
|name |col                       |
+-----+--------------------------+
|Alice|{scores -> [90, 100, 120]}|
+-----+--------------------------+



In [0]:
# element_at() counts array positions from 1, so index 2 picks the second
# score (100 in the output below), unlike Python's 0-based list indexing.
df3 = df2.select(
    element_at(col("col.scores"), 2).alias("2nd element")
)

In [0]:
# Print the single-value result of the element_at lookup
df3.show(truncate=False)

+-----------+
|2nd element|
+-----------+
|100        |
+-----------+



In [0]:
# Flatten the inner score list into one row per score.
# df2 only has the columns `name` and `col`, so the list has to be addressed
# through the dotted path `col.scores` (as the element_at cell above does);
# a bare 'scores' is not a column of df2 and fails to resolve on a fresh run.
df3 = df2.select(explode("col.scores"))

In [0]:
# Print one row per individual score
df3.show()

+---+
|col|
+---+
| 90|
|100|
|120|
+---+



In [0]:
# Plain Python lists are indexed from 0, so position 1 is the second item
py_list = [10, 20, 30]

print(py_list[1])



20


## Split

In [0]:
# Sample rows: each student's subjects packed into one comma-separated string.
# NOTE: this cell rebinds `data`, `schema` and `df` from the Explode section.
data = [
    ("Alice", "Math,English,History"),
    ("Bob", "Physics,Chemistry"),
    ("Charlie", "Biology"),
    ("David", "Math,Physics,Chemistry,History"),
]

# Column names for the DataFrame
schema = ["name", "subjects"]

df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)


+-------+------------------------------+
|name   |subjects                      |
+-------+------------------------------+
|Alice  |Math,English,History          |
|Bob    |Physics,Chemistry             |
|Charlie|Biology                       |
|David  |Math,Physics,Chemistry,History|
+-------+------------------------------+



In [0]:
# Print the comma-separated input again (same DataFrame as the previous cell)
df.show(truncate=False)

+-------+------------------------------+
|name   |subjects                      |
+-------+------------------------------+
|Alice  |Math,English,History          |
|Bob    |Physics,Chemistry             |
|Charlie|Biology                       |
|David  |Math,Physics,Chemistry,History|
+-------+------------------------------+



In [0]:
from pyspark.sql.functions import split

In [0]:
# Split the comma-separated string into an array column. No alias is given,
# so the column carries Spark's generated name (see the header below).
df2 = df.select(
    "name",
    split("subjects", ","),
)
df2.show(truncate=False)

+-------+-----------------------------------+
|name   |split(subjects, ,, -1)             |
+-------+-----------------------------------+
|Alice  |[Math, English, History]           |
|Bob    |[Physics, Chemistry]               |
|Charlie|[Biology]                          |
|David  |[Math, Physics, Chemistry, History]|
+-------+-----------------------------------+



In [0]:
# Explode the array of subjects into one row per (student, subject).
# The array is df2's second column; look its name up by position instead of
# hard-coding Spark's auto-generated label, whose exact text depends on the
# Spark version.
df3 = df2.select("name", explode(df2.columns[1]))

In [0]:
# Print one row per student/subject pair
df3.show(truncate=False)

+-------+---------+
|name   |col      |
+-------+---------+
|Alice  |Math     |
|Alice  |English  |
|Alice  |History  |
|Bob    |Physics  |
|Bob    |Chemistry|
|Charlie|Biology  |
|David  |Math     |
|David  |Physics  |
|David  |Chemistry|
|David  |History  |
+-------+---------+

