In [1]:
from pyspark.sql import SparkSession

# Obtain a SparkSession for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName('udf')
    .getOrCreate()
)

In [2]:
# Small demo DataFrame: one nursery-rhyme line per row, keyed by an integer id.
df = spark.createDataFrame(
    data=[
        (0, "Mary had a little lamb"),
        (1, "It's fleece was white as snow"),
        (2, "And everywhere Mary went"),
        (3, "The lamb was sure to go"),
    ],
    schema=["id", "Nursery Rhyme"],
)

In [3]:
# Print the DataFrame; truncate=False keeps long strings from being cut off.
df.show(truncate=False)

+---+-----------------------------+
|id |Nursery Rhyme                |
+---+-----------------------------+
|0  |Mary had a little lamb       |
|1  |It's fleece was white as snow|
|2  |And everywhere Mary went     |
|3  |The lamb was sure to go      |
+---+-----------------------------+



In [4]:
from pyspark.ml.feature import Tokenizer

# Split each "Nursery Rhyme" string into a list of word tokens stored in "words".
tokenizer = (
    Tokenizer()
    .setInputCol("Nursery Rhyme")
    .setOutputCol("words")
)
tokenized = tokenizer.transform(df)

In [5]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    This function is wrapped as a Spark UDF, and Spark passes ``None`` to a
    Python UDF for null cells. ``len(None)`` would raise ``TypeError`` and
    fail the whole job, so a null input is propagated as a null result.

    Args:
        word_list: A sized collection (e.g. a list of tokens), or ``None``.

    Returns:
        The length of ``word_list`` as an int, or ``None`` when the input
        is ``None``.
    """
    if word_list is None:
        # Null in -> null out, matching how Spark built-ins treat nulls.
        return None
    return len(word_list)

In [6]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Wrap word_list_length as a Spark UDF whose result column is an integer.
count_tokens = udf(
    word_list_length,
    returnType=IntegerType(),
)

In [7]:
# Show each rhyme, its token list, and the UDF-computed token count,
# without truncating long cell values.
tokenized.select(
    "Nursery Rhyme",
    "words",
    count_tokens(col("words")).alias("tokens"),
).show(truncate=False)

+-----------------------------+------------------------------------+------+
|Nursery Rhyme                |words                               |tokens|
+-----------------------------+------------------------------------+------+
|Mary had a little lamb       |[mary, had, a, little, lamb]        |5     |
|It's fleece was white as snow|[it's, fleece, was, white, as, snow]|6     |
|And everywhere Mary went     |[and, everywhere, mary, went]       |4     |
|The lamb was sure to go      |[the, lamb, was, sure, to, go]      |6     |
+-----------------------------+------------------------------------+------+

