<a href="https://colab.research.google.com/github/Vampaxx/Pyspark_basics/blob/main/ArrayType.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=a53e81191a4ff721de46ae2572702eac2cd79beddb8fe8262d8670f447868e56
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [28]:
from pyspark.sql.types import StringType,StructField,StructType,IntegerType,ArrayType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,array

In [3]:
# Create the Spark session used by every cell below, or reuse one that
# already exists in this process.
spark = (
    SparkSession.builder
    .appName('ArrayType')
    .getOrCreate()
)

In [4]:
# Each row pairs a fruit name with a Python list; the list becomes an array column.
data = [
    ('apple', [1, 2]),
    ('orange', [4, 5]),
    ('grapes', [7, 8]),
]

# Only column names are supplied here, so the column types are inferred
# from the data itself.
schema = ['fruits', 'No_of_kgs']

df = spark.createDataFrame(data, schema)
df.show()
df.printSchema()

+------+---------+
|fruits|No_of_kgs|
+------+---------+
| apple|   [1, 2]|
|orange|   [4, 5]|
|grapes|   [7, 8]|
+------+---------+

root
 |-- fruits: string (nullable = true)
 |-- No_of_kgs: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [5]:
data = [
    ('apple', [1, 2]),
    ('orange', [4, 5]),
    ('grapes', [7, 8]),
]

# Explicit schema: a string column plus an array column whose elements are
# declared as integers (ArrayType wraps the element type).
schema = StructType([
    StructField('fruits', StringType()),
    StructField('No_of_kgs', ArrayType(IntegerType())),
])

df = spark.createDataFrame(data, schema)
df.show()
df.printSchema()

+------+---------+
|fruits|No_of_kgs|
+------+---------+
| apple|   [1, 2]|
|orange|   [4, 5]|
|grapes|   [7, 8]|
+------+---------+

root
 |-- fruits: string (nullable = true)
 |-- No_of_kgs: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [6]:
# Pull a single array element out into its own column, addressed by position.
df.withColumn(
    'shop_1',
    col('No_of_kgs').getItem(0),  # first element of the array
).show()

df.withColumn(
    'shop_2',
    col('No_of_kgs').getItem(1),  # second element of the array
).show()

+------+---------+------+
|fruits|No_of_kgs|shop_1|
+------+---------+------+
| apple|   [1, 2]|     1|
|orange|   [4, 5]|     4|
|grapes|   [7, 8]|     7|
+------+---------+------+

+------+---------+------+
|fruits|No_of_kgs|shop_2|
+------+---------+------+
| apple|   [1, 2]|     2|
|orange|   [4, 5]|     5|
|grapes|   [7, 8]|     8|
+------+---------+------+



In [34]:
from pyspark.sql.functions import array

## Combine multiple columns into one new array column

data = [
    ('Apple', 12),
    ('Orange', 24),
    ('Grapes', 2),
    ('Banana', 5),
]

schema = ['fruits', 'No_of_kgs']
df = spark.createDataFrame(data, schema)

# An array column has a single element type. The numeric column is cast to
# string explicitly so the result does not rely on implicit string/long
# coercion (NOTE(review): that coercion is believed to be configuration
# dependent, e.g. under ANSI mode - confirm against the Spark docs).
#
# Method 1: reference the columns as DataFrame attributes. The derived frame
# is built once and reused for both the data and the schema printout.
fruits_and_kgs = df.withColumn(
    'Fruits_and_kgs',
    array(df.fruits, df.No_of_kgs.cast('string')),
)
fruits_and_kgs.show()
fruits_and_kgs.printSchema()

## Another method: reference the columns by name with col()

# The names match the schema's casing exactly, so this does not depend on
# case-insensitive column resolution.
df.withColumn(
    'Fruits_and_kgs',
    array(col('fruits'), col('No_of_kgs').cast('string')),
).show()

+------+---------+--------------+
|fruits|No_of_kgs|Fruits_and_kgs|
+------+---------+--------------+
| Apple|       12|   [Apple, 12]|
|Orange|       24|  [Orange, 24]|
|Grapes|        2|   [Grapes, 2]|
|Banana|        5|   [Banana, 5]|
+------+---------+--------------+

root
 |-- fruits: string (nullable = true)
 |-- No_of_kgs: long (nullable = true)
 |-- Fruits_and_kgs: array (nullable = false)
 |    |-- element: string (containsNull = true)

+------+---------+--------------+
|fruits|No_of_kgs|Fruits_and_kgs|
+------+---------+--------------+
| Apple|       12|   [Apple, 12]|
|Orange|       24|  [Orange, 24]|
|Grapes|        2|   [Grapes, 2]|
|Banana|        5|   [Banana, 5]|
+------+---------+--------------+



In [35]:
# One row per person with the list of fruits they have; the lists differ in
# length from row to row.
data = [
    ("John", ["apple", "banana", "cherry"]),
    ("Alice", ["orange", "strawberry"]),
    ("Bob", ["grape"]),
]

schema = ["name", "fruits"]
df = spark.createDataFrame(data, schema=schema)
df.show()

+-----+--------------------+
| name|              fruits|
+-----+--------------------+
| John|[apple, banana, c...|
|Alice|[orange, strawberry]|
|  Bob|             [grape]|
+-----+--------------------+



In [36]:
# Keep the name and expose the first element of the array under its own alias.
df.select(
    "name",
    col("fruits").getItem(0).alias("first_fruit"),
).show()

+-----+-----------+
| name|first_fruit|
+-----+-----------+
| John|      apple|
|Alice|     orange|
|  Bob|      grape|
+-----+-----------+



In [38]:
from pyspark.sql.functions import explode

# explode() produces one output row per array element, repeating the name
# alongside each element.
df.select(
    "name",
    explode("fruits").alias("fruit"),
).show()

+-----+----------+
| name|     fruit|
+-----+----------+
| John|     apple|
| John|    banana|
| John|    cherry|
|Alice|    orange|
|Alice|strawberry|
|  Bob|     grape|
+-----+----------+



In [40]:
from pyspark.sql.functions import array_contains

In [42]:
# Keep only the rows whose fruits array contains the value "apple".
df.where(array_contains("fruits", "apple")).show()

+----+--------------------+
|name|              fruits|
+----+--------------------+
|John|[apple, banana, c...|
+----+--------------------+



8