In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [2]:
# Obtain the notebook's SparkSession: reuse an already-running one if present,
# otherwise create a new one.
from pyspark.sql import SparkSession

# "spark.driver.memory" is requested as "10g" on the builder before the
# session is materialised by getOrCreate().
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

24/07/30 09:23:39 WARN Utils: Your hostname, ubuntu20 resolves to a loopback address: 127.0.1.1; using 192.168.0.234 instead (on interface wlp0s20f3)
24/07/30 09:23:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/30 09:23:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/07/30 09:23:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Tiny in-memory corpus: each row is (label, sentence). The column names are
# supplied through the schema list, in the same order as the tuple fields.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [4]:
# Stage 1 of the TF-IDF pipeline: turn the raw "sentence" string column into
# a "words" column holding the list of tokens for each row.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [8]:
# Apply the tokenizer; the result keeps the input columns and appends "words".
wordsData = tokenizer.transform(dataset=sentenceData)

In [10]:
# Preview at most 5 rows (long cell values are truncated by default).
wordsData.show(n=5)

                                                                                

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [11]:
# Stage 2: hash each token into one of `numFeatures` buckets and count
# occurrences per bucket, producing a sparse term-frequency vector.
# NOTE: 20 buckets is deliberately tiny for this demo, so distinct words can
# land in the same bucket and have their counts merged.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)

In [12]:
# Append the "rawFeatures" term-frequency vectors to the tokenized rows.
featurizedData = hashingTF.transform(dataset=wordsData)

In [14]:
# Show up to 5 rows with full, untruncated cell contents so the sparse
# vectors (size, indices, values) are readable.
featurizedData.show(n=5, truncate=False)

+-----+-----------------------------------+------------------------------------------+-----------------------------------------------+
|label|sentence                           |words                                     |rawFeatures                                    |
+-----+-----------------------------------+------------------------------------------+-----------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |(20,[6,8,13,16],[1.0,1.0,1.0,2.0])             |
|0.0  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|(20,[0,2,7,13,15,16],[1.0,1.0,2.0,1.0,1.0,1.0])|
|1.0  |Logistic regression models are neat|[logistic, regression, models, are, neat] |(20,[3,4,6,11,19],[1.0,1.0,1.0,1.0,1.0])       |
+-----+-----------------------------------+------------------------------------------+-----------------------------------------------+



In [15]:
# Stage 3: inverse-document-frequency estimator. It reads the raw term
# counts and, once fitted, writes the rescaled vectors to "features".
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [16]:
# Fit the IDF estimator on the term-frequency vectors; this is the step that
# scans the corpus and yields a reusable model.
idfModel = idf.fit(dataset=featurizedData)

                                                                                

In [17]:
# Rescale the raw term counts with the fitted IDF weights, adding "features".
rescaledData = idfModel.transform(dataset=featurizedData)

In [19]:
# Inspect up to 5 rows in full width; the "features" column is very wide
# because the TF-IDF weights are printed at full precision.
rescaledData.show(n=5, truncate=False)

+-----+-----------------------------------+------------------------------------------+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
|label|sentence                           |words                                     |rawFeatures                                    |features                                                                                                                                   |
+-----+-----------------------------------+------------------------------------------+-----------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |(20,[6,8,13,16],[1.0,1.0,1.0,2.0])             |(20,[6,8,13,16],[0.28768207245178085,0.69

In [20]:
# Compact view of just the columns a downstream estimator would consume.
(
    rescaledData
    .select("label", "features")
    .show()
)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[6,8,13,16],[...|
|  0.0|(20,[0,2,7,13,15,...|
|  1.0|(20,[3,4,6,11,19]...|
+-----+--------------------+

