In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session named 'NLP' running on a local master.
spark = (
    SparkSession.builder
    .master('local')
    .appName('NLP')
    .getOrCreate()
)

# Tokenizer

In [2]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [3]:
# Three sample sentences: two are space-separated, the last one is
# comma-separated so the two tokenizers can be compared on it.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, 'Hi I heard about Spark'),
        (1, 'I wish Java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [4]:
# Preview the input rows; with the default truncation long sentences are cut
# off and end in '...'.
sentenceDataFrame.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish Java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [5]:
# Plain tokenizer: reads 'sentence' and writes the token array to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

# Regex tokenizer: same columns, but splits on the pattern '\W'
# (non-word characters).
regexTokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [6]:
# Python UDF returning the length of an array column as an integer.
countTokens = udf(lambda tokens: len(tokens), returnType=IntegerType())

In [8]:
# Apply the plain tokenizer and show each sentence with its token count.
# In the output the comma-separated sentence stays a single token.
tokenized = tokenizer.transform(sentenceDataFrame)
(
    tokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |1     |
+-----------------------------------+------------------------------------------+------+



In [9]:
# Apply the regex tokenizer to the same sentences; here the comma-separated
# sentence is split into five tokens.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
(
    regexTokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |5     |
+-----------------------------------+------------------------------------------+------+



# Stop Words Removal

In [10]:
from pyspark.ml.feature import StopWordsRemover

In [12]:
# Two pre-tokenized sentences used as input for stop-word removal.
sentenceData = spark.createDataFrame(
    [
        (0, ['I', 'saw', 'the', 'red', 'balloon']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'raw'],
)

In [13]:
# Filter the 'raw' token arrays into 'filtered' using the remover's default
# stop-word list, then display the untruncated result.
remover = StopWordsRemover(
    inputCol='raw',
    outputCol='filtered',
)
remover.transform(sentenceData).show(truncate=False)

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



# n-grams

In [16]:
from pyspark.ml.feature import NGram
# Pre-tokenized sentences used as input for n-gram generation.
wordDataFrame = spark.createDataFrame(
    [
        (0, ['hi', 'i', 'heard', 'about', 'spark']),
        (1, ['I', 'wish', 'Java', 'could', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [17]:
# Build bigrams (n=2) from the 'words' arrays into a new 'ngrams' column.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='ngrams',
)
ngramDataFrame = ngram.transform(wordDataFrame)

In [18]:
# Display only the generated bigrams, without truncating the arrays.
(
    ngramDataFrame
    .select('ngrams')
    .show(truncate=False)
)

+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[hi i, i heard, heard about, about spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



# TF-IDF

In [19]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# Labelled sentences for the TF-IDF example (this rebinds `sentenceData`).
sentenceData = spark.createDataFrame(
    [
        (0.0, 'Hi I heard about Spark'),
        (0.0, 'I wish Java could use case classes'),
        (1.0, 'Logistic,regression,models,are,neat'),
    ],
    schema=['label', 'sentence'],
)
sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic,regressi...|
+-----+--------------------+



In [20]:
# Tokenize the labelled sentences for the TF-IDF example.
# The tokenizer is bound to `tokenizer`; the original cell misspelled the
# name as `tockenizer`, so the new instance was never used and the transform
# below relied on the `tokenizer` left over from the earlier Tokenizer cell.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
wordsData = tokenizer.transform(sentenceData)
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic,regressi...|[logistic,regress...|
+-----+--------------------+--------------------+



In [21]:
# Hash each token array in 'words' into a 20-dimensional term-frequency
# vector stored in 'rawFeatures'.
hashingTF = HashingTF(
    numFeatures=20,
    inputCol='words',
    outputCol='rawFeatures',
)
feturizedData = hashingTF.transform(wordsData)

In [22]:
# Fit IDF on the raw term-frequency vectors, then apply the fitted model to
# rescale 'rawFeatures' into 'features'.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idfModel = idf.fit(feturizedData)
rescaledData = idfModel.transform(feturizedData)

# Show the label next to its rescaled feature vector.
rescaledData.select('label', 'features').show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[0,5,9,17],[0...|
|  0.0|(20,[2,7,9,13,15]...|
|  1.0|(20,[14],[0.69314...|
+-----+--------------------+



In [23]:
from pyspark.ml.feature import CountVectorizer
# Two tiny token lists over the vocabulary {a, b, c} for CountVectorizer.
df = spark.createDataFrame(
    [
        (0, 'a b c'.split(' ')),
        (1, 'a b b c a'.split(' ')),
    ],
    schema=['id', 'words'],
)

In [24]:
# Preview the CountVectorizer input rows.
df.show(n=20, truncate=True)

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [25]:
# Fit a CountVectorizer with a vocabulary capped at 3 terms and minDF=2.0
# (NOTE(review): per the CountVectorizer docs a value >= 1 is a minimum
# document count — confirm), then turn each token list into a count vector.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)
model = cv.fit(df)
result = model.transform(df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

