<a href="https://colab.research.google.com/github/SREYAKUKUTAPU/Pyspark/blob/main/functions1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# explode() Function

In [23]:
#explode
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Create the SparkSession used by every cell below, or reuse one that is
# already active in this kernel (getOrCreate makes re-running this cell safe).
session_builder = SparkSession.builder.appName("show")
spark = session_builder.getOrCreate()

In [24]:
# Sample rows: a string id paired with a list of integers.
data = [
    ('abc', [1, 2]),
    ('mno', [3, 4]),
    ('xyz', [5, 6]),
]

# Explicit schema so 'numbers' is typed as array<int> rather than inferred.
schema = StructType([
    StructField('id', StringType()),
    StructField('numbers', ArrayType(IntegerType())),
])

df = spark.createDataFrame(data, schema)

# Show both the contents and the resulting column types.
df.show()
df.printSchema()

+---+-------+
| id|numbers|
+---+-------+
|abc| [1, 2]|
|mno| [3, 4]|
|xyz| [5, 6]|
+---+-------+

root
 |-- id: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [25]:
# explode() emits one output row per element of the 'numbers' array;
# only the id and the exploded value are kept in the result.
df1 = df.select(
    'id',
    explode(col('numbers')).alias('explodeCol'),
)
df1.show()

+---+----------+
| id|explodeCol|
+---+----------+
|abc|         1|
|abc|         2|
|mno|         3|
|mno|         4|
|xyz|         5|
|xyz|         6|
+---+----------+



# split() function

In [26]:
# Each row carries its skills as one comma-delimited string.
data = [
    (1, 'Sreya', 'Bigquery,python,GCP'),
    (2, 'Srija', 'Java,angular,sql'),
]
schema = ['id', 'name', 'skills']

df = spark.createDataFrame(data, schema)
df.show()
df.printSchema()

# split() replaces the delimited string with an array<string> column of the
# same name; the select keeps the original column order explicit.
skills_as_array = split(col('skills'), ',')
df1 = df.withColumn('skills', skills_as_array).select('id', 'name', 'skills')

# truncate=False so the full array contents are visible in the output.
df1.show(truncate=False)
df1.printSchema()

+---+-----+-------------------+
| id| name|             skills|
+---+-----+-------------------+
|  1|Sreya|Bigquery,python,GCP|
|  2|Srija|   Java,angular,sql|
+---+-----+-------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)

+---+-----+-----------------------+
|id |name |skills                 |
+---+-----+-----------------------+
|1  |Sreya|[Bigquery, python, GCP]|
|2  |Srija|[Java, angular, sql]   |
+---+-----+-----------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [27]:
# Each row carries its skills as one pipe-delimited string.
data = [
    (1, 'Sreya', 'Bigquery|python|GCP'),
    (2, 'Srija', 'Java|angular|sql'),
]
schema = ['id', 'name', 'skills']

df = spark.createDataFrame(data, schema)
df.show()
df.printSchema()

# split() interprets its pattern as a regular expression, where a bare '|'
# means alternation, so the pipe has to be escaped to match it literally.
# The pattern is written as a raw string: in a normal string literal '\|' is
# an invalid Python escape sequence, which triggers a DeprecationWarning
# (SyntaxWarning on Python 3.12+). The raw string passes the same two
# characters, backslash and pipe, to Spark without the warning.
df1 = df.withColumn('skills', split(col('skills'), r'\|')).select('id', 'name', 'skills')

# truncate=False so the full array contents are visible in the output.
df1.show(truncate=False)
df1.printSchema()

+---+-----+-------------------+
| id| name|             skills|
+---+-----+-------------------+
|  1|Sreya|Bigquery|python|GCP|
|  2|Srija|   Java|angular|sql|
+---+-----+-------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)

+---+-----+-----------------------+
|id |name |skills                 |
+---+-----+-----------------------+
|1  |Sreya|[Bigquery, python, GCP]|
|2  |Srija|[Java, angular, sql]   |
+---+-----+-----------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = false)



# array() function

In [28]:
# Each row holds two separate skill columns that a later cell combines.
data = [
    (1, 'Sreya', 'Bigquery', 'python'),
    (2, 'Srija', 'Java', 'angular'),
]
schema = ['id', 'name', 'primarySkill', 'secondarySkill']

df = spark.createDataFrame(data, schema)
df.show()

+---+-----+------------+--------------+
| id| name|primarySkill|secondarySkill|
+---+-----+------------+--------------+
|  1|Sreya|    Bigquery|        python|
|  2|Srija|        Java|       angular|
+---+-----+------------+--------------+



In [29]:
# array() packs the two skill columns into a single array column, in the
# order given; the source columns are kept alongside the new one.
df1 = df.withColumn(
    'skills',
    array('primarySkill', 'secondarySkill'),
)

# truncate=False so the full array contents are visible in the output.
df1.show(truncate=False)
df1.printSchema()

+---+-----+------------+--------------+------------------+
|id |name |primarySkill|secondarySkill|skills            |
+---+-----+------------+--------------+------------------+
|1  |Sreya|Bigquery    |python        |[Bigquery, python]|
|2  |Srija|Java        |angular       |[Java, angular]   |
+---+-----+------------+--------------+------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- primarySkill: string (nullable = true)
 |-- secondarySkill: string (nullable = true)
 |-- skills: array (nullable = false)
 |    |-- element: string (containsNull = true)



# array_contains() function

In [30]:
# Skills are already lists here; the last row has an empty list to show how
# array_contains() behaves when there is nothing to search.
data = [
    (1, 'Sreya', ['Bigquery', 'python']),
    (2, 'Srija', ['Java', 'angular']),
    (3, 'Minny', []),
]
schema = ['id', 'name', 'skills']

df = spark.createDataFrame(data, schema)

# Flag each row with whether 'Java' appears in its skills array.
has_java = array_contains(col('skills'), 'Java')
df1 = df.withColumn('HasJavaSkill', has_java)
df1.show(truncate=False)

+---+-----+------------------+------------+
|id |name |skills            |HasJavaSkill|
+---+-----+------------------+------------+
|1  |Sreya|[Bigquery, python]|false       |
|2  |Srija|[Java, angular]   |true        |
|3  |Minny|[]                |false       |
+---+-----+------------------+------------+

