In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing session is reused if one
# is already running), registered under the application name 'hashing'.
spark = (
    SparkSession.builder
    .appName('hashing')
    .getOrCreate()
)

In [2]:
# Toy corpus: three short sentences containing repeated words, each paired
# with an integer id.
df = spark.createDataFrame(
    [
        (0, "The cow cow jumped and jumped cow"),
        (1, "then the cow said"),
        (2, "I am a cow that jumped"),
    ],
    schema=["id", "words"],
)

# Print every row without truncating the sentence column.
df.show(truncate=False)

+---+---------------------------------+
|id |words                            |
+---+---------------------------------+
|0  |The cow cow jumped and jumped cow|
|1  |then the cow said                |
|2  |I am a cow that jumped           |
+---+---------------------------------+



In [3]:
from pyspark.ml.feature import Tokenizer

# Split each sentence in "words" into a list of tokens and store the result
# in a new "tokens" column.
tokenizer = Tokenizer(
    inputCol="words",
    outputCol="tokens",
)
tokened_df = tokenizer.transform(df)

# Display the sentences next to their token lists, untruncated.
tokened_df.show(truncate=False)

+---+---------------------------------+-----------------------------------------+
|id |words                            |tokens                                   |
+---+---------------------------------+-----------------------------------------+
|0  |The cow cow jumped and jumped cow|[the, cow, cow, jumped, and, jumped, cow]|
|1  |then the cow said                |[then, the, cow, said]                   |
|2  |I am a cow that jumped           |[i, am, a, cow, that, jumped]            |
+---+---------------------------------+-----------------------------------------+



In [4]:
from pyspark.ml.feature import StopWordsRemover

# Drop stop words from "tokens" (no custom list is supplied, so the remover's
# default list applies) and keep the remaining tokens in "filtered".
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered",
)
stripped_df = remover.transform(tokened_df)

# Display the token lists before and after stop-word removal, untruncated.
stripped_df.show(truncate=False)

+---+---------------------------------+-----------------------------------------+-------------------------------+
|id |words                            |tokens                                   |filtered                       |
+---+---------------------------------+-----------------------------------------+-------------------------------+
|0  |The cow cow jumped and jumped cow|[the, cow, cow, jumped, and, jumped, cow]|[cow, cow, jumped, jumped, cow]|
|1  |then the cow said                |[then, the, cow, said]                   |[cow, said]                    |
|2  |I am a cow that jumped           |[i, am, a, cow, that, jumped]            |[cow, jumped]                  |
+---+---------------------------------+-----------------------------------------+-------------------------------+



In [5]:
from pyspark.ml.feature import HashingTF

# Hash every filtered token into one of 2**4 = 16 buckets and count the
# occurrences per bucket, producing a sparse term-frequency vector in "hashed".
hashing = HashingTF(
    inputCol="filtered",
    outputCol="hashed",
    numFeatures=2 ** 4,
)
hashed_df = hashing.transform(stripped_df)

# Show only the token lists alongside their term-frequency vectors.
hashed_df.select('filtered', 'hashed').show(truncate=False)

+-------------------------------+----------------------+
|filtered                       |hashed                |
+-------------------------------+----------------------+
|[cow, cow, jumped, jumped, cow]|(16,[11,15],[2.0,3.0])|
|[cow, said]                    |(16,[0,15],[1.0,1.0]) |
|[cow, jumped]                  |(16,[11,15],[1.0,1.0])|
+-------------------------------+----------------------+



In [6]:
from pyspark.ml.feature import IDF

# Learn inverse-document-frequency weights from the term-frequency vectors,
# then rescale those same vectors into the "features" column.
idf = IDF(
    inputCol="hashed",
    outputCol="features",
)
idf_model = idf.fit(hashed_df)
idf_df = idf_model.transform(hashed_df)

# Show each original sentence alongside its TF-IDF vector.
idf_df.select('words', 'features').show(truncate=False)

+---------------------------------+--------------------------------------+
|words                            |features                              |
+---------------------------------+--------------------------------------+
|The cow cow jumped and jumped cow|(16,[11,15],[0.5753641449035617,0.0]) |
|then the cow said                |(16,[0,15],[0.6931471805599453,0.0])  |
|I am a cow that jumped           |(16,[11,15],[0.28768207245178085,0.0])|
+---------------------------------+--------------------------------------+

