In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an existing session when there is one instead of building a second one.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting --
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark NLP demo")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Load data

In [2]:
# Two-row toy corpus: a sentence, an id (the person's name) and a float label.
twitter = spark.createDataFrame(
    [
        ("George is a spark expert", "George", 1.0),
        ("Jack is learning spark", "Jack", 0.0),
    ],
    schema=["text", "id", "label"],
)

In [8]:
# Display the raw corpus; truncate=False keeps the full sentence visible.
twitter.show(truncate=False)

+------------------------+------+-----+
|text                    |id    |label|
+------------------------+------+-----+
|George is a spark expert|George|1.0  |
|Jack is learning spark  |Jack  |0.0  |
+------------------------+------+-----+



# Tokenizer

In [9]:
from pyspark.ml.feature import Tokenizer

In [10]:
# Split each sentence in `text` into a list of words stored in a new `tokens`
# column (the displayed result shows the words lower-cased).
tokenizer_mod = Tokenizer(
    inputCol="text",
    outputCol="tokens",
)
twitter_tokens = tokenizer_mod.transform(twitter)
twitter_tokens.show(truncate=False)

+------------------------+------+-----+------------------------------+
|text                    |id    |label|tokens                        |
+------------------------+------+-----+------------------------------+
|George is a spark expert|George|1.0  |[george, is, a, spark, expert]|
|Jack is learning spark  |Jack  |0.0  |[jack, is, learning, spark]   |
+------------------------+------+-----+------------------------------+



# HashingTF

In [19]:
from pyspark.ml.feature import HashingTF
# Map every token to one of 2**4 = 16 hash buckets and count the hits per
# bucket. The output column name doubles as a legend for reading the sparse
# vectors: (vector size, [bucket indices], [term frequencies]).
# NOTE(review): the column name is missing its closing ")" (compare the alias
# used in the Stack Overflow demo cell); it is left untouched so the recorded
# output keeps matching the code.
hashingTF_mod = HashingTF(
    inputCol="tokens",
    outputCol="Features(vocab_size,[index],[tf]",
    numFeatures=2 ** 4,
)
hashingTF_twitter = hashingTF_mod.transform(twitter_tokens)

In [20]:
# Display the hashed term-frequency vectors alongside the tokens, untruncated.
hashingTF_twitter.show(truncate=False)

+------------------------+------+-----+------------------------------+--------------------------------+
|text                    |id    |label|tokens                        |Features(vocab_size,[index],[tf]|
+------------------------+------+-----+------------------------------+--------------------------------+
|George is a spark expert|George|1.0  |[george, is, a, spark, expert]|(16,[1,2,3,9],[2.0,1.0,1.0,1.0])|
|Jack is learning spark  |Jack  |0.0  |[jack, is, learning, spark]   |(16,[0,1,3],[1.0,2.0,1.0])      |
+------------------------+------+-----+------------------------------+--------------------------------+



With HashingTF you cannot recover the vocabulary: hashing is non-injective (different tokens can land on the same index), so there is no inverse function from an index back to a word.

# CountVectorizer

In [31]:
from pyspark.ml.feature import CountVectorizer
# Unlike HashingTF above, CountVectorizer is fitted first: fit() produces a
# model holding a vocabulary (capped here at 2**4 = 16 terms), and that model
# then encodes each token list as a sparse vector of term counts.
count_vectorizer = CountVectorizer(
    inputCol="tokens",
    outputCol="features",
    vocabSize=2 ** 4,
)
countVectorizer_mod = count_vectorizer.fit(twitter_tokens)
countVectorizer_twitter = countVectorizer_mod.transform(twitter_tokens)

In [32]:
# Display the count vectors alongside the tokens, untruncated.
countVectorizer_twitter.show(truncate=False)

+------------------------+------+-----+------------------------------+-------------------------------------+
|text                    |id    |label|tokens                        |features                             |
+------------------------+------+-----+------------------------------+-------------------------------------+
|George is a spark expert|George|1.0  |[george, is, a, spark, expert]|(7,[0,1,3,4,5],[1.0,1.0,1.0,1.0,1.0])|
|Jack is learning spark  |Jack  |0.0  |[jack, is, learning, spark]   |(7,[0,1,2,6],[1.0,1.0,1.0,1.0])      |
+------------------------+------+-----+------------------------------+-------------------------------------+



In [34]:
# The fitted model exposes the vocabulary it learned. A term's position in
# this list is the index used for it in the `features` vectors shown above
# (e.g. index 0 is 'is', index 1 is 'spark').
countVectorizer_mod.vocabulary

['is', 'spark', 'learning', 'a', 'expert', 'george', 'jack']

# Another good demo from Stack Overflow

In [27]:
## http://stackoverflow.com/questions/35205865/what-is-the-difference-between-hashingtf-and-countvectorizer-in-spark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import CountVectorizer

# Three labelled example sentences.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

# Break every sentence into a list of words.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
wordsData = tokenizer.transform(sentenceData)

# --- HashingTF: applied directly with transform(), there is no fit step ---
hashingTF = HashingTF(
    inputCol="words",
    outputCol="Features",
    numFeatures=100,
)
# Despite its name, `hashingTF_model` is the transformed DataFrame.
hashingTF_model = hashingTF.transform(wordsData)
print("Out of hashingTF function")
hashingTF_model.select(
    "words",
    hashingTF_model["Features"].alias("Features(vocab_size,[index],[tf])"),
).show(truncate=False)

# --- CountVectorizer: fit a CountVectorizerModel on the corpus, then apply it ---
cv = CountVectorizer(
    inputCol="words",
    outputCol="Features",
    vocabSize=20,
)
cv_model = cv.fit(wordsData)
cv_result = cv_model.transform(wordsData)
print("Out of CountVectorizer function")
cv_result.select(
    "words",
    cv_result["Features"].alias("Features(vocab_size,[index],[tf])"),
).show(truncate=False)

# Only the fitted CountVectorizerModel carries a vocabulary to print.
print("Vocabulary from CountVectorizerModel is \n", cv_model.vocabulary, sep="")

Out of hashingTF function
+------------------------------------------+---------------------------------------------------------+
|words                                     |Features(vocab_size,[index],[tf])                        |
+------------------------------------------+---------------------------------------------------------+
|[hi, i, heard, about, spark]              |(100,[5,29,57,60,77],[1.0,1.0,1.0,1.0,1.0])              |
|[i, wish, java, could, use, case, classes]|(100,[9,13,29,42,67,89,95],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[logistic, regression, models, are, neat] |(100,[4,38,86,93,95],[1.0,1.0,1.0,1.0,1.0])              |
+------------------------------------------+---------------------------------------------------------+

Out of CountVectorizer function
+------------------------------------------+-----------------------------------------------------+
|words                                     |Features(vocab_size,[index],[tf])                    |
+---------------------

In [37]:
# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    [
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

# Fit a CountVectorizerModel on the corpus, keeping at most 3 terms.
# NOTE(review): minDF=2.0 presumably means a term must occur in at least two
# documents to enter the vocabulary -- confirm against the CountVectorizer docs.
# This rebinds `cv`, which the previous demo cell also assigned.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)
model = cv.fit(df)

# Encode the same rows with the fitted model and display them untruncated.
result = model.transform(df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

