# SPLIT()

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (
    SparkSession.builder
    .appName("example-split")
    .getOrCreate()
)

In [2]:
data = [('James','','Smith','1991-04-01','M',3000),
        ('Michael','Rose','','2000-05-19','M',4000),
        ('Robert','','Williams','1978-09-05','M',4000),
        ('Maria','Anne','Jones','1967-12-01','F',4000),
        ('Jen','Mary','Brown','1980-02-17','F',-1)]

columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

df = spark.createDataFrame(data=data, schema = columns)
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



## Split Column using withColumn()

In [3]:
df1 = df.withColumn('year', split(df['dob'], '-').getItem(0)) \
        .withColumn('month', split(df['dob'], '-').getItem(1)) \
        .withColumn('day', split(df['dob'], '-').getItem(2))

df1.show(truncate=False)


+---------+----------+--------+----------+------+------+----+-----+---+
|firstname|middlename|lastname|dob       |gender|salary|year|month|day|
+---------+----------+--------+----------+------+------+----+-----+---+
|James    |          |Smith   |1991-04-01|M     |3000  |1991|04   |01 |
|Michael  |Rose      |        |2000-05-19|M     |4000  |2000|05   |19 |
|Robert   |          |Williams|1978-09-05|M     |4000  |1978|09   |05 |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |1967|12   |01 |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |1980|02   |17 |
+---------+----------+--------+----------+------+------+----+-----+---+



In [5]:
# Alternatively, you can do like below by creating a function variable and reusing it.
split_col = pyspark.sql.functions.split(df['dob'], '-')

df2 = df.withColumn('year', split_col.getItem(0)) \
        .withColumn('month', split_col.getItem(1)) \
        .withColumn('day', split_col.getItem(2))

df2.show(truncate=False) 


+---------+----------+--------+----------+------+------+----+-----+---+
|firstname|middlename|lastname|dob       |gender|salary|year|month|day|
+---------+----------+--------+----------+------+------+----+-----+---+
|James    |          |Smith   |1991-04-01|M     |3000  |1991|04   |01 |
|Michael  |Rose      |        |2000-05-19|M     |4000  |2000|05   |19 |
|Robert   |          |Williams|1978-09-05|M     |4000  |1978|09   |05 |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |1967|12   |01 |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |1980|02   |17 |
+---------+----------+--------+----------+------+------+----+-----+---+



## Split Column using Select()

In [7]:
split_col = pyspark.sql.functions.split(df['dob'], '-')

df3 = df.select("firstname","middlename","lastname","dob", 
                split_col.getItem(0).alias('year'),
                split_col.getItem(1).alias('month'),
                split_col.getItem(2).alias('day'))  
                 
df3.show(truncate=False)


+---------+----------+--------+----------+----+-----+---+
|firstname|middlename|lastname|dob       |year|month|day|
+---------+----------+--------+----------+----+-----+---+
|James    |          |Smith   |1991-04-01|1991|04   |01 |
|Michael  |Rose      |        |2000-05-19|2000|05   |19 |
|Robert   |          |Williams|1978-09-05|1978|09   |05 |
|Maria    |Anne      |Jones   |1967-12-01|1967|12   |01 |
|Jen      |Mary      |Brown   |1980-02-17|1980|02   |17 |
+---------+----------+--------+----------+----+-----+---+



## Split with Regular Expression

In [8]:
df4 = spark.createDataFrame([('oneAtwoBthree',)], ['str',])

df4.select(split(df4.str, '[AB]').alias('str')).show()


+-----------------+
|              str|
+-----------------+
|[one, two, three]|
+-----------------+



## Split Using limit Argument

Let’s see an example using limit option on split.

limit > 0: The resulting array’s length will not be more than `limit`, and the resulting array’s last entry will contain all input beyond the last matched pattern.

limit <= 0 will be applied as many times as possible, and the resulting array can be of any size.

In [9]:
df4.select(split(df4.str, '[AB]', 2).alias('str')).show()

+----------------+
|             str|
+----------------+
|[one, twoBthree]|
+----------------+



In [10]:

df4.select(split(df4.str, '[AB]',3).alias('str')).show()

+-----------------+
|              str|
+-----------------+
|[one, two, three]|
+-----------------+



The code block displayed below contains an error. The code block below is intended to add a column itemNameElements to DataFrame itemsDf that includes an array of all words in column itemName. Find the error.
>
Sample of DataFrame itemsDf:
>
- +——————+—————————————————————————————————-+——————————————————-+
- |itemId|itemName                          |supplier           |
- +——————+—————————————————————————————————-+——————————————————-+
- |1     |Thick Coat for Walking in the Snow|Sports Company Inc.|
- |2     |Elegant Outdoors Summer Dress     |YetiX              |
- |3     |Outdoors Backpack                 |Sports Company Inc.|
- +——————+—————————————————————————————————-+——————————————————-+
>
Code block:

- `itemsDf.withColumnRenamed(“itemNameElements”, split(“itemName”))`

In [2]:
data = [(1, 'Thick Coat for Walking in the Snow', 'Sports Company Inc.'),
        (2, 'Elegant Outdoors Summer Dress', 'YetiX'),
        (3, 'Outdoors Backpack', 'Sports Company Inc.')]

columns = ["itemId", "itemName", "supplier"]

itemsDf = spark.createDataFrame(data=data, schema = columns)

In [36]:
itemsDf.withColumn("itemNameElements", split("itemName", "")).show()

+------+--------------------+-------------------+--------------------+
|itemId|            itemName|           supplier|    itemNameElements|
+------+--------------------+-------------------+--------------------+
|     1|Thick Coat for Wa...|Sports Company Inc.|[T, h, i, c, k,  ...|
|     2|Elegant Outdoors ...|              YetiX|[E, l, e, g, a, n...|
|     3|   Outdoors Backpack|Sports Company Inc.|[O, u, t, d, o, o...|
+------+--------------------+-------------------+--------------------+



In [4]:
df1, df2 = df.randomSplit([0.70, 0.30])

In [5]:
df1.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [7]:
df2.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
+---------+----------+--------+----------+------+------+



In [12]:
df = spark.read.text("../files/sample.txt")
df.printSchema()
df.show()


root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|RED WOOLLY HOTTIE...|
+--------------------+



In [13]:
df1 = df.select(split(col("value"), " ").alias("values"))
df1.printSchema()
df1.show()

root
 |-- values: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+
|              values|
+--------------------+
|[WHITE, HANGING, ...|
|[, WHITE, METAL, ...|
|[RED, WOOLLY, HOT...|
+--------------------+

