In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook under the app name 'stopwords'.
spark = (
    SparkSession.builder
    .appName('stopwords')
    .getOrCreate()
)

In [2]:
from pyspark import SparkFiles

# Register the hosted CSV with the Spark context, then read the added file
# (looked up by its base name) as a comma-separated table with a header row.
spark.sparkContext.addFile(
    "https://s3.amazonaws.com/zepl-trilogy-test/food_reviews.csv"
)
df = (
    spark.read
    .options(sep=",", header=True)
    .csv(SparkFiles.get("food_reviews.csv"))
)

In [3]:
# Preview the raw reviews (up to 20 rows), cutting cell text at 100 characters.
df.show(n=20, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                             Reviews|
+----------------------------------------------------------------------------------------------------+
|                                                                          The pasta was a dish I had|
|                                                                        We ate the fish it was tasty|
|                                                                My family did not like the food here|
|                                      The girl even tried to spread each half out to cover the roll.|
|this is his job and since it was probably the slowest time of the day I would at least expect him...|
|                            I'm always greeted by the  employees and they always seem eager to help.|
+----------------------------------------------------------------------------------------------------+

In [4]:
from pyspark.ml.feature import Tokenizer

# Split every review into a list of lowercase tokens stored in a new "Words" column.
# NOTE(review): the double space in the last review yields an empty-string token
# (visible in the displayed output); RegexTokenizer may be preferable - confirm.
review_data = Tokenizer().setInputCol("Reviews").setOutputCol("Words")
reviewed = review_data.transform(df)
reviewed.show(n=20, truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                           Reviews|                                             Words|
+--------------------------------------------------+--------------------------------------------------+
|                        The pasta was a dish I had|                [the, pasta, was, a, dish, i, had]|
|                      We ate the fish it was tasty|              [we, ate, the, fish, it, was, tasty]|
|              My family did not like the food here|     [my, family, did, not, like, the, food, here]|
|The girl even tried to spread each half out to ...|[the, girl, even, tried, to, spread, each, half...|
|this is his job and since it was probably the s...|[this, is, his, job, and, since, it, was, proba...|
|I'm always greeted by the  employees and they a...|[i'm, always, greeted, by, the, , employees, an...|
+--------------------------------------------------+--------------------------------------------------+

In [5]:
from pyspark.ml.feature import StopWordsRemover

# Filter the token lists through the remover's default stop-word list,
# writing the surviving tokens to a new "Filtered" column.
remover = StopWordsRemover().setInputCol("Words").setOutputCol("Filtered")
newFrame = remover.transform(reviewed)
newFrame.show(n=20, truncate=33)

+---------------------------------+---------------------------------+---------------------------------+
|                          Reviews|                            Words|                         Filtered|
+---------------------------------+---------------------------------+---------------------------------+
|       The pasta was a dish I had|[the, pasta, was, a, dish, i, ...|                    [pasta, dish]|
|     We ate the fish it was tasty|[we, ate, the, fish, it, was, ...|               [ate, fish, tasty]|
|My family did not like the foo...|[my, family, did, not, like, t...|             [family, like, food]|
|The girl even tried to spread ...|[the, girl, even, tried, to, s...|[girl, even, tried, spread, ha...|
|this is his job and since it w...|[this, is, his, job, and, sinc...|[job, since, probably, slowest...|
|I'm always greeted by the  emp...|[i'm, always, greeted, by, the...|[always, greeted, , employees,...|
+---------------------------------+---------------------------------+---------------------------------+

In [6]:
# Show only the stop-word-free tokens for each review.
# The column is addressed by its declared name ("Filtered", as set on the
# StopWordsRemover) so the lookup does not depend on Spark's case-insensitive
# column resolution (spark.sql.caseSensitive is false by default).
newFrame.select("Filtered").show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                            filtered|
+----------------------------------------------------------------------------------------------------+
|                                                                                       [pasta, dish]|
|                                                                                  [ate, fish, tasty]|
|                                                                                [family, like, food]|
|                                                     [girl, even, tried, spread, half, cover, roll.]|
|[job, since, probably, slowest, time, day, least, expect, take, order, put, sandwich, through., c...|
|                                          [always, greeted, , employees, always, seem, eager, help.]|
+----------------------------------------------------------------------------------------------------+