In [1]:
# Initialise findspark before the first pyspark import below; it is expected
# to make the local Spark installation importable -- TODO confirm for this env.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Session shared by every example in this notebook: reuse the active session
# if one exists, otherwise create one named 'NLP_tools'.
spark = (
    SparkSession.builder
    .appName('NLP_tools')
    .getOrCreate()
)

In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [6]:
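# col -> returns a Column expression that refers to a DataFrame column by name
# udf -> wraps a Python function as a user-defined function that can be applied to columns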
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [8]:
# Sample sentences for the tokenizer demo: the first two are space-separated,
# the last one is comma-separated with no spaces at all.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi, I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    # column names
    schema=['id', 'sentence'],
)

In [9]:
# Print the input rows as a text table (long strings are shortened with '...').
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi, I heard about...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [10]:
# Basic tokenizer: reads 'sentence' and writes the list of tokens to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [13]:
# Regex-based tokenizer: splits 'sentence' on every non-word character (\W),
# so punctuation such as commas acts as a delimiter too.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [15]:
# UDF returning the number of tokens in an array column.
# User-defined functions must specify a return type (IntegerType here).
# A SQL NULL reaches the Python function as None and len(None) raises
# TypeError, so a null array maps to a null count instead of failing the job.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

In [16]:
# Run the basic tokenizer over the sample sentences; the result keeps the
# original columns and adds the 'words' column.
tokenized = tokenizer.transform(sen_df)

In [17]:
# Inspect the tokens: text is lower-cased and punctuation stays attached
# to the word it follows (e.g. 'hi,').
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi, I heard about...|[hi,, i, heard, a...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [18]:
# withColumn adds a column by name; col('words') refers to the words column
# directly so it can be passed to the count_tokens UDF.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

# Tokenizer only splits on whitespace, which is not necessarily what we want:
# the comma-separated sentence is counted as a single token.

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi, I heard about...|[hi,, i, heard, a...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [19]:
# Tokenize the same sentences with the regex tokenizer for comparison.
rg_tokenized = regex_tokenizer.transform(sen_df)

In [20]:
# Same token count as before, now on the regex-tokenized data: the
# comma-separated sentence is split into its individual words.
(
    rg_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi, I heard about...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



***

In [21]:
from pyspark.ml.feature import StopWordsRemover

In [22]:
# Already-tokenized sentences used as input for the stop-word remover.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [23]:
# StopWordsRemover reads the token lists in 'tokens' and writes its result
# to a new 'filtered' column.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

# Apply it to the sample sentences and print the result.
(
    remover
    .transform(sentenceDataFrame)
    .show()
)

***

In [25]:
from pyspark.ml.feature import NGram

In [26]:
# Pre-tokenized sentences used as input for the n-gram example.
words_df = spark.createDataFrame(
    data=[
        (0, ['Hello', 'I', 'heard', 'about', 'Spark']),
        (1, ['I', 'wish', 'Java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    # column names
    schema=['id', 'words'],
)

In [27]:
# n=2 -> bigrams: every output entry joins two consecutive words from 'words'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [29]:
# N-grams are useful when you want to find the relationships between
# neighbouring words; truncate=False prints the full lists.
(
    ngram
    .transform(words_df)
    .show(truncate=False)
)

+---+------------------------------------------+------------------------------------------------------------------+
|id |words                                     |grams                                                             |
+---+------------------------------------------+------------------------------------------------------------------+
|0  |[Hello, I, heard, about, Spark]           |[Hello I, I heard, heard about, about Spark]                      |
|1  |[I, wish, Java, could, use, case, classes]|[I wish, wish Java, Java could, could use, use case, case classes]|
|2  |[Logistic, regression, models, are, neat] |[Logistic regression, regression models, models are, are neat]    |
+---+------------------------------------------+------------------------------------------------------------------+



***

In [31]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [32]:
# Labelled sentences for the TF-IDF example.
sen_data = spark.createDataFrame(
    data=[
        (0, 'Hi, I heard about Spark'),
        (0, 'I wish java could use case classes'),
        (1, 'Logistic regression models are neat'),
    ],
    # column names
    schema=['label', 'sentence'],
)

In [33]:
# Tokenizer instance for the TF-IDF example: 'sentence' in, 'words' out.
tokenizer2 = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [35]:
# Tokenize the labelled sentences with the tokenizer defined for this section
# (tokenizer2), so the TF-IDF example does not depend on the `tokenizer`
# variable left over from the first example.
words_data = tokenizer2.transform(sen_data)

In [37]:
# Print the tokenized sentences in full (truncate=False disables shortening).
words_data.show(truncate=False)

+-----+-----------------------------------+------------------------------------------+
|label|sentence                           |words                                     |
+-----+-----------------------------------+------------------------------------------+
|0    |Hi, I heard about Spark            |[hi,, i, heard, about, spark]             |
|0    |I wish java could use case classes |[i, wish, java, could, use, case, classes]|
|1    |Logistic regression models are neat|[logistic, regression, models, are, neat] |
+-----+-----------------------------------+------------------------------------------+



In [38]:
# Term-frequency stage: turns the token lists in 'words' into raw feature
# vectors stored in 'rawFeatures'.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)

In [39]:
# Compute the raw term-frequency vectors for every tokenized sentence.
featurized_data = hashing_tf.transform(words_data)

In [40]:
# IDF estimator: reads the raw term frequencies in 'rawFeatures' and writes
# the rescaled vectors to 'features'.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)

In [41]:
# IDF is an estimator: fit it on the term-frequency data to obtain a model.
idf_model = idf.fit(featurized_data)

In [42]:
# Apply the fitted IDF model to the same data, adding the 'features' column.
rescaled_data = idf_model.transform(featurized_data)

In [44]:
# Show only the label and its TF-IDF feature vector, untruncated.
(
    rescaled_data
    .select('label', 'features')
    .show(truncate=False)
)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                        |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0    |(262144,[24417,73197,83094,91137,234657],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                     |
|0    |(262144,[20719,24417,55551,116873,147765,162369,192310],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.693147180559945

***

In [45]:
from pyspark.ml.feature import CountVectorizer

In [46]:
# Two tiny "documents" for the CountVectorizer example.
# NOTE: the second string starts with a space, so split(" ") yields an
# empty-string token as its first element.
df = spark.createDataFrame(
    data=[
        (0, "a b c".split(" ")),
        (1, " a b b c a".split(" ")),
    ],
    schema=['id', 'words'],
)

In [47]:
# Print the two token lists that will be fed to the CountVectorizer.
df.show()

+---+-----------------+
| id|            words|
+---+-----------------+
|  0|        [a, b, c]|
|  1|[, a, b, b, c, a]|
+---+-----------------+



In [48]:
# vocabSize -> upper bound on the number of terms kept in the vocabulary
# minDF -> optional, specifies the minimum number of documents a term must
#  appear in; useful for dropping rare, esoteric words
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)

In [49]:
# Fit the CountVectorizer on the sample documents to obtain a model.
model = cv.fit(df)

In [50]:
# Apply the fitted model to the same documents, adding the 'features' column.
result = model.transform(df)

In [51]:
# Print the count vectors in full; each one is shown in sparse form as
# (vector size, term indices, term counts).
result.show(truncate=False)

+---+-----------------+-------------------------+
|id |words            |features                 |
+---+-----------------+-------------------------+
|0  |[a, b, c]        |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[, a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+-----------------+-------------------------+

