In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession in local mode; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SparkMLLib_TFIDF")
    .master("local[*]")
    .getOrCreate()
)

24/11/15 14:30:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Toy corpus: every row pairs a numeric label with one raw sentence.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

# Declare the three stages up front; each one reads the column written by the
# stage before it: sentence -> words -> rawFeatures -> features.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Stage 1: split each sentence into its words.
wordsData = tokenizer.transform(sentenceData)

# Stage 2: hash the words into term-frequency vectors of size 20.
# (CountVectorizer is an alternative way to obtain term-frequency vectors.)
featurizedData = hashingTF.transform(wordsData)

# Stage 3: fit IDF on the term-frequency column, then rescale that same data
# with the fitted model.
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Print each label next to its resulting feature vector.
(
    rescaledData
    .select("label", "features")
    .show()
)

                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[6,8,13,16],[...|
|  0.0|(20,[0,2,7,13,15,...|
|  1.0|(20,[3,4,6,11,19]...|
+-----+--------------------+



### Đầu tiên, tạo DataFrame sentenceData chứa các label và câu. Tokenizer là một lớp trong PySpark dùng để phân tách chuỗi văn bản thành các từ riêng biệt: nó lấy dữ liệu đầu vào từ cột "sentence" của DataFrame, tiến hành phân tách và lưu kết quả vào cột "words" của wordsData.
### HashingTF là kỹ thuật ánh xạ các từ thành các chỉ số trong không gian vector có kích thước cố định thông qua một hàm băm. Nó lấy dữ liệu từ cột "words" đã tách ở trên, với số chiều của vector là 20 (tham số numFeatures). Sau khi áp dụng HashingTF lên dữ liệu, kết quả (cột "rawFeatures") được lưu vào featurizedData.
### IDF là kỹ thuật giúp giảm trọng số của các từ xuất hiện phổ biến trong bộ dữ liệu và tăng trọng số của những từ ít xuất hiện để cải thiện features. Trước tiên, IDF được huấn luyện (fit) trên dữ liệu sau khi hashing (featurizedData) để tạo ra idfModel; sau đó mô hình này được áp dụng (transform) lên chính bộ dữ liệu featurizedData để tạo ra rescaledData. Cuối cùng, in ra giá trị của các cột "label" và "features" sau khi đã áp dụng IDF.