In [0]:
from pyspark.sql.functions import *

In [0]:
help(split)

Help on function split in module pyspark.sql.functions:

split(str: 'ColumnOrName', pattern: str, limit: int = -1) -> pyspark.sql.column.Column
    Splits str around matches of the given pattern.
    
    .. versionadded:: 1.5.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    str : :class:`~pyspark.sql.Column` or str
        a string expression to split
    pattern : str
        a string representing a regular expression. The regex string should be
        a Java regular expression.
    limit : int, optional
        an integer which controls the number of times `pattern` is applied.
    
        * ``limit > 0``: The resulting array's length will not be more than `limit`, and the
                         resulting array's last entry will contain all input beyond the last
                         matched pattern.
        * ``limit <= 0``: `pattern` will be applied as many times as possible, and the resulting
                      

In [0]:
help(explode)

Help on function explode in module pyspark.sql.functions:

explode(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Returns a new row for each element in the given array or map.
    Uses the default column name `col` for elements in the array and
    `key` and `value` for elements in the map unless specified otherwise.
    
    .. versionadded:: 1.4.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        target column to work on.
    
    Returns
    -------
    :class:`~pyspark.sql.Column`
        one row per array item or map key value.
    
    See Also
    --------
    :meth:`pyspark.functions.posexplode`
    :meth:`pyspark.functions.explode_outer`
    :meth:`pyspark.functions.posexplode_outer`
    
    Examples
    --------
    >>> from pyspark.sql import Row
    >>> eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
    >>> eDF.select(explode(eDF.i

In [0]:
l = [('X',)]

In [0]:
df = spark.createDataFrame(l,"dummy string")

In [0]:
df.show()

+-----+
|dummy|
+-----+
|    X|
+-----+



In [0]:
df.select(split(lit("Hello world, how are you")," ").alias("word")).show(truncate=False)

+------------------------------+
|word                          |
+------------------------------+
|[Hello, world,, how, are, you]|
+------------------------------+



In [0]:
df.select(explode(split(lit("Hello world, how are you")," ").alias('word'))).show()

+------+
|   col|
+------+
| Hello|
|world,|
|   how|
|   are|
|   you|
+------+



In [0]:
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states", "+1 123 456 7890,+1 234 567 8901", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "india", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united kingdom", "+44 111 111 111,+44 222 222 2222", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "australia", "+61 987 654 3210,+61 876 543 2109", "789 12 6118"),
]

In [0]:
empdf = spark.createDataFrame(employees, schema = """employee_id INT, first_name STRING,
                              last_name STRING, salary FLOAT, nationality STRING,
                              phone_number STRING, ssn STRING                              
                               """)

In [0]:
empdf.show(truncate=False)

+-----------+----------+---------+------+--------------+---------------------------------+-----------+
|employee_id|first_name|last_name|salary|nationality   |phone_number                     |ssn        |
+-----------+----------+---------+------+--------------+---------------------------------+-----------+
|1          |Scott     |Tiger    |1000.0|united states |+1 123 456 7890,+1 234 567 8901  |123 45 6789|
|2          |Henry     |Ford     |1250.0|india         |+91 234 567 8901                 |456 78 9123|
|3          |Nick      |Junior   |750.0 |united kingdom|+44 111 111 111,+44 222 222 2222 |222 33 4444|
|4          |Bill      |Gomes    |1500.0|australia     |+61 987 654 3210,+61 876 543 2109|789 12 6118|
+-----------+----------+---------+------+--------------+---------------------------------+-----------+



In [0]:
empdf.select('employee_id','phone_number').show(truncate = False)

+-----------+---------------------------------+
|employee_id|phone_number                     |
+-----------+---------------------------------+
|1          |+1 123 456 7890,+1 234 567 8901  |
|2          |+91 234 567 8901                 |
|3          |+44 111 111 111,+44 222 222 2222 |
|4          |+61 987 654 3210,+61 876 543 2109|
+-----------+---------------------------------+



In [0]:
empdf. \
    select('employee_id','phone_number','ssn'). \
    withColumn('phone_numbers', explode(split('phone_number',","))).show(truncate=False)

+-----------+---------------------------------+-----------+----------------+
|employee_id|phone_number                     |ssn        |phone_numbers   |
+-----------+---------------------------------+-----------+----------------+
|1          |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 123 456 7890 |
|1          |+1 123 456 7890,+1 234 567 8901  |123 45 6789|+1 234 567 8901 |
|2          |+91 234 567 8901                 |456 78 9123|+91 234 567 8901|
|3          |+44 111 111 111,+44 222 222 2222 |222 33 4444|+44 111 111 111 |
|3          |+44 111 111 111,+44 222 222 2222 |222 33 4444|+44 222 222 2222|
|4          |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 987 654 3210|
|4          |+61 987 654 3210,+61 876 543 2109|789 12 6118|+61 876 543 2109|
+-----------+---------------------------------+-----------+----------------+



In [0]:
empdf.\
    select('employee_id','phone_number','ssn'). \
    withColumn('phone_numbers', explode(split('phone_number',","))). \
    withColumn('area_code', split('phone_numbers'," ")[1].cast('int')). \
    withColumn('phone_last4', split('phone_numbers'," ")[3].cast('int')). \
    withColumn('ssn_last4', split('ssn'," ")[2].cast('int')).\
    show()

+-----------+--------------------+-----------+----------------+---------+-----------+---------+
|employee_id|        phone_number|        ssn|   phone_numbers|area_code|phone_last4|ssn_last4|
+-----------+--------------------+-----------+----------------+---------+-----------+---------+
|          1|+1 123 456 7890,+...|123 45 6789| +1 123 456 7890|      123|       7890|     6789|
|          1|+1 123 456 7890,+...|123 45 6789| +1 234 567 8901|      234|       8901|     6789|
|          2|    +91 234 567 8901|456 78 9123|+91 234 567 8901|      234|       8901|     9123|
|          3|+44 111 111 111,+...|222 33 4444| +44 111 111 111|      111|        111|     4444|
|          3|+44 111 111 111,+...|222 33 4444|+44 222 222 2222|      222|       2222|     4444|
|          4|+61 987 654 3210,...|789 12 6118|+61 987 654 3210|      987|       3210|     6118|
|          4|+61 987 654 3210,...|789 12 6118|+61 876 543 2109|      876|       2109|     6118|
+-----------+--------------------+------