    EXPLODE OPERATIONS IN PYSPARK

In [24]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode,explode_outer,posexplode,posexplode_outer

In [4]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("explode tutorial.com")
    .getOrCreate()
)

# Sample rows: (name, list of languages, dict of properties).
# The rows deliberately mix in a None element, empty strings, a None
# list/dict and an empty dict so the explode variants below can be compared.
data = [
    ("James", ["Java", "Scala"], {"hair": "black", "eye": "brown"}),
    ("Michael", ["Spark", "Java", None], {"hair": "brown", "eye": None}),
    ("Robert", ["CSharp", ""], {"hair": "red", "eye": ""}),
    ("Washington", None, None),
    ("Jefferson", ["1", "2"], {}),
]

# Only column names are supplied; the column types are inferred from the data.
df = spark.createDataFrame(
    data=data,
    schema=["name", "knownLanguages", "properties"],
)

df.printSchema()
df.show()

24/09/09 16:38:41 WARN Utils: Your hostname, AI-CJB-LAP-459 resolves to a loopback address: 127.0.1.1; using 192.168.1.164 instead (on interface wlp0s20f3)
24/09/09 16:38:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/09 16:38:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/09 16:38:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



                                                                                

+----------+-------------------+--------------------+
|      name|     knownLanguages|          properties|
+----------+-------------------+--------------------+
|     James|      [Java, Scala]|{eye -> brown, ha...|
|   Michael|[Spark, Java, NULL]|{eye -> NULL, hai...|
|    Robert|         [CSharp, ]|{eye -> , hair ->...|
|Washington|               NULL|                NULL|
| Jefferson|             [1, 2]|                  {}|
+----------+-------------------+--------------------+



EXPLODE - ARRAY COLUMN

In [6]:
# One output row per element of `knownLanguages`; the exploded values land in
# a column named `col` by default. Rows whose array is null produce no output
# (Washington is absent from the result), while null elements inside an array
# are kept as NULL values.
df2 = df.select(
    df["name"],
    explode(df["knownLanguages"]),
)

df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)

+---------+------+
|     name|   col|
+---------+------+
|    James|  Java|
|    James| Scala|
|  Michael| Spark|
|  Michael|  Java|
|  Michael|  NULL|
|   Robert|CSharp|
|   Robert|      |
|Jefferson|     1|
|Jefferson|     2|
+---------+------+



EXPLODE - MAP COLUMN (COMBINED WITH THE ARRAY COLUMN)

In [9]:
# Exploding a map yields two columns, `key` and `value`, one row per entry.
# This select also explodes `knownLanguages`, so each person gets one row for
# every (map entry, array element) combination. People with a null/empty map
# or a null array drop out entirely (Washington and Jefferson are absent).
# NOTE(review): using two generators in a single select may be rejected by
# some Spark versions - confirm against the version this notebook targets.
df3 = df.select(
    df["name"],
    explode(df["properties"]),
    explode(df["knownLanguages"]),
)

df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)
 |-- col: string (nullable = true)

+-------+----+-----+------+
|   name| key|value|   col|
+-------+----+-----+------+
|  James| eye|brown|  Java|
|  James| eye|brown| Scala|
|  James|hair|black|  Java|
|  James|hair|black| Scala|
|Michael| eye| NULL| Spark|
|Michael| eye| NULL|  Java|
|Michael| eye| NULL|  NULL|
|Michael|hair|brown| Spark|
|Michael|hair|brown|  Java|
|Michael|hair|brown|  NULL|
| Robert| eye|     |CSharp|
| Robert| eye|     |      |
| Robert|hair|  red|CSharp|
| Robert|hair|  red|      |
+-------+----+-----+------+



EXPLODE_OUTER - ARRAY AND MAP COLUMNS (KEEPS ROWS WITH NULL OR EMPTY INPUTS)

In [16]:
# explode_outer behaves like explode but keeps rows whose array or map is
# null or empty, filling the generated columns with NULL instead of dropping
# the row (see Washington and Jefferson in the output).
df.select(
    df["name"],
    explode_outer(df["knownLanguages"]),
    explode_outer(df["properties"]),
).show()

+----------+------+----+-----+
|      name|   col| key|value|
+----------+------+----+-----+
|     James|  Java| eye|brown|
|     James|  Java|hair|black|
|     James| Scala| eye|brown|
|     James| Scala|hair|black|
|   Michael| Spark| eye| NULL|
|   Michael| Spark|hair|brown|
|   Michael|  Java| eye| NULL|
|   Michael|  Java|hair|brown|
|   Michael|  NULL| eye| NULL|
|   Michael|  NULL|hair|brown|
|    Robert|CSharp| eye|     |
|    Robert|CSharp|hair|  red|
|    Robert|      | eye|     |
|    Robert|      |hair|  red|
|Washington|  NULL|NULL| NULL|
| Jefferson|     1|NULL| NULL|
| Jefferson|     2|NULL| NULL|
+----------+------+----+-----+



POSEXPLODE - EXPLODE ARRAY OR MAP ELEMENTS TO ROWS WITH THEIR POSITION

In [30]:
# posexplode on an array emits (position, element) pairs; the alias names the
# two generated columns. Positions start at 0.
df.select(
    df["name"],
    posexplode(df["knownLanguages"]).alias("position", "languages_known"),
).show()

# posexplode on a map emits (position, key, value) triples, so the alias needs
# three names. People with a null or empty map yield no rows here.
df.select(
    df["name"],
    posexplode(df["properties"]).alias(
        "position", "properties_key", "properties_values"
    ),
).show()

+---------+--------+---------------+
|     name|position|languages_known|
+---------+--------+---------------+
|    James|       0|           Java|
|    James|       1|          Scala|
|  Michael|       0|          Spark|
|  Michael|       1|           Java|
|  Michael|       2|           NULL|
|   Robert|       0|         CSharp|
|   Robert|       1|               |
|Jefferson|       0|              1|
|Jefferson|       1|              2|
+---------+--------+---------------+

+-------+--------+--------------+-----------------+
|   name|position|properties_key|properties_values|
+-------+--------+--------------+-----------------+
|  James|       0|           eye|            brown|
|  James|       1|          hair|            black|
|Michael|       0|           eye|             NULL|
|Michael|       1|          hair|            brown|
| Robert|       0|           eye|                 |
| Robert|       1|          hair|              red|
+-------+--------+--------------+-----------------+

POSEXPLODE_OUTER - CREATE A ROW FOR EACH ELEMENT IN AN ARRAY OR MAP, KEEPING NULL OR EMPTY INPUTS

In [35]:
# posexplode_outer keeps rows whose map is null or empty: position, key and
# value are all NULL for them (see Washington and Jefferson in the output).
df.select(
    df["name"],
    posexplode_outer(df["properties"]).alias("position", "key", "value"),
).show()

+----------+--------+----+-----+
|      name|position| key|value|
+----------+--------+----+-----+
|     James|       0| eye|brown|
|     James|       1|hair|black|
|   Michael|       0| eye| NULL|
|   Michael|       1|hair|brown|
|    Robert|       0| eye|     |
|    Robert|       1|hair|  red|
|Washington|    NULL|NULL| NULL|
| Jefferson|    NULL|NULL| NULL|
+----------+--------+----+-----+

