In [1]:
from pyspark.sql import SparkSession

# Build the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('tokenizer')
    .getOrCreate()
)

In [2]:
from pyspark import SparkFiles

# Register the CSV with the Spark context, then load it through the path
# that SparkFiles resolves for the registered file name.
spark.sparkContext.addFile("../Resources/data.csv")
df = spark.read.csv(
    path=SparkFiles.get("data.csv"),
    sep=",",
    header=True,
)

In [3]:
# Show the dataframe without truncating long cell values
df.show(truncate=False)

+---------------------+
|Poem                 |
+---------------------+
|This Autumn midnight |
|Orion’s at my window |
|shouting for his dog.|
+---------------------+



In [4]:
from pyspark.ml.feature import Tokenizer

# Split each "Poem" string into a "words" token array and display the result
tokener = Tokenizer(
    inputCol="Poem",
    outputCol="words",
)
tokenized = tokener.transform(df)
tokenized.show(truncate=False)

+---------------------+--------------------------+
|Poem                 |words                     |
+---------------------+--------------------------+
|This Autumn midnight |[this, autumn, midnight]  |
|Orion’s at my window |[orion’s, at, my, window] |
|shouting for his dog.|[shouting, for, his, dog.]|
+---------------------+--------------------------+



In [5]:
# Create function to count vowels
def vowel_counter(words):
    """Count the vowels (a, e, i, o, u) across a collection of words.

    Args:
        words: Iterable of strings, such as the token array in the "words"
            column. May be ``None`` and may contain ``None`` entries; both
            are treated as contributing no vowels.

    Returns:
        int: Total number of vowel characters over all words, matched
        case-insensitively. ``0`` for ``None`` or empty input.
    """
    # Guard against a missing value: iterating None would raise TypeError,
    # which would fail the whole job when this runs as a UDF.
    if words is None:
        return 0

    # Match both cases so the count is right even on input that has not
    # been lowercased beforehand.
    return sum(
        1
        for word in words
        if word is not None
        for char in word
        if char in "aeiouAEIOU"
    )

In [6]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Wrap vowel_counter as a Spark UDF that returns an integer column
count_vowels = udf(
    f=vowel_counter,
    returnType=IntegerType(),
)

In [7]:
# Project the poem and its tokens alongside a "vowels" column computed by the
# UDF, then display the resulting dataframe untruncated.
tokenized.select(
    "Poem",
    "words",
    count_vowels(col("words")).alias("vowels"),
).show(truncate=False)

+---------------------+--------------------------+------+
|Poem                 |words                     |vowels|
+---------------------+--------------------------+------+
|This Autumn midnight |[this, autumn, midnight]  |6     |
|Orion’s at my window |[orion’s, at, my, window] |6     |
|shouting for his dog.|[shouting, for, his, dog.]|6     |
+---------------------+--------------------------+------+

