In [1]:
from pyspark.sql import SparkSession

# Create the Spark session used by every cell below, or reuse one that is
# already running under the same application.
spark = (
    SparkSession.builder
    .appName("NLP_Basic")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/16 11:32:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType

In [4]:
# Three example sentences: two are space-separated, the last one is
# comma-separated so the two tokenizers below behave differently on it.
sen_df = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "i wish java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    schema=["id", "sentence"],
)

In [5]:
# Preview the example sentences; with default arguments show() truncates
# long string cells.
sen_df.show()

                                                                                

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|i wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [6]:
# Basic tokenizer: lower-cases each sentence and splits it on whitespace,
# writing the resulting array to the `words` column.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [7]:
# Regex-based tokenizer: splits on any non-word character (`\W`), so the
# comma-separated sentence is broken into individual words as well.
regex_tokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

In [8]:
def count_tokens(words):
    """Return a Column with the number of elements in an array column.

    Uses Spark's built-in ``size`` function instead of a Python UDF wrapping
    ``len``, so the count is computed by Spark itself and rows do not have to
    be serialized to a Python worker.

    Args:
        words: Column (or column name) holding the token array.

    Returns:
        An integer Column with the array length.

    Note:
        The previous ``len``-based UDF raised on a null array. ``size`` does
        not raise; it yields -1 or null for a null array depending on Spark's
        ``sizeOfNull`` / ANSI configuration -- confirm against the Spark
        version in use if null arrays are possible.
    """
    return size(words)

In [9]:
# Apply the whitespace Tokenizer; the result keeps `id` and `sentence` and
# gains the `words` array column.
tokenized = tokenizer.transform(sen_df)

In [11]:
# truncate=False prints full cell contents so every token is visible.
tokenized.show(truncate=False)

+---+-----------------------------------+------------------------------------------+
|id |sentence                           |words                                     |
+---+-----------------------------------+------------------------------------------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |
|1  |i wish java could use case classes |[i, wish, java, could, use, case, classes]|
|2  |Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |
+---+-----------------------------------+------------------------------------------+



In [12]:
# Add a per-row token count for the whitespace tokenizer. The
# comma-separated sentence contains no whitespace, so it stays one token.
(
    tokenized
    .withColumn("tokens", count_tokens(col("words")))
    .show()
)

                                                                                

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|i wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [13]:
# Apply the RegexTokenizer to the same sentences; its output column is also
# named `words`.
reg_tokenized = regex_tokenizer.transform(sen_df)

In [14]:
# Same token count for the regex tokenizer: splitting on non-word characters
# breaks the comma-separated sentence into separate words.
(
    reg_tokenized
    .withColumn("tokens", count_tokens(col("words")))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|i wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [15]:
from pyspark.ml.feature import StopWordsRemover

In [19]:
# Pre-tokenized example rows used to demonstrate stop-word removal.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "tokens"],
)

In [20]:
# Stop-word filter with the default stop-word list (none is passed here):
# reads the `tokens` array and writes the remaining words to `filtered`.
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered",
)

In [21]:
# Show the original tokens next to the filtered ones, untruncated.
remover.transform(sentenceData).show(truncate=False)

+---+----------------------------+--------------------+
|id |tokens                      |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



In [22]:
from pyspark.ml.feature import NGram

In [23]:
# Pre-tokenized example rows used to demonstrate n-gram generation.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

In [24]:
# Bigram generator (n=2): each element of `grams` is two consecutive entries
# of `words` joined by a space.
ngram = NGram(
    n=2,
    inputCol="words",
    outputCol="grams",
)

In [25]:
# Generate the bigrams and display only the `grams` column, untruncated.
(
    ngram
    .transform(wordDataFrame)
    .select("grams")
    .show(truncate=False)
)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

