# Feature Extractors

# 目录
- TF-IDF
- Word2Vec
- CountVectorizer
- FeatureHasher
- 参考

In [8]:
sc

In [None]:
spark

# TF-IDF

# Word2Vec

## 示例1

In [3]:
from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.08242294350638986,-0.005830296874046326,-0.05622698366641998]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [0.03722663476530994,-0.0021124311855861117,-0.013124272493379456]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.08365558385848999,0.00822269544005394,-0.040985722094774246]



## 示例2

In [16]:
from pyspark.ml.feature import Word2VecModel

temp_path='.'

sent = ("a b " * 100 + "a c " * 10).split(" ")
doc = spark.createDataFrame([(sent,), (sent,)], ["sentence"])
word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model")
model = word2Vec.fit(doc)
model.getVectors().show()
model.findSynonymsArray("a", 2)

from pyspark.sql.functions import format_number as fmt

model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show()
model.transform(doc).head().model
word2vecPath = temp_path + "/word2vec"
word2Vec.save(word2vecPath)

+----+--------------------+
|word|              vector|
+----+--------------------+
|   a|[0.09461779892444...|
|   b|[1.15474212169647...|
|   c|[-0.3794820010662...|
+----+--------------------+

+----+----------+
|word|similarity|
+----+----------+
|   b|   0.25053|
|   c|  -0.69805|
+----+----------+



In [17]:
loadedWord2Vec = Word2Vec.load(word2vecPath)
loadedWord2Vec.getVectorSize() == word2Vec.getVectorSize()
loadedWord2Vec.getNumPartitions() == word2Vec.getNumPartitions()
loadedWord2Vec.getMinCount() == word2Vec.getMinCount()
modelPath = temp_path + "/word2vec-model"
model.save(modelPath)
loadedModel = Word2VecModel.load(modelPath)
loadedModel.getVectors().first().word == model.getVectors().first().word
loadedModel.getVectors().first().vector == model.getVectors().first().vector

True

# CountVectorizer

# FeatureHasher

# 参考

- [ml-features](http://spark.apache.org/docs/latest/ml-features.html)
- [word2vec](http://spark.apache.org/docs/latest/ml-features.html#word2vec)
- [word2vec API](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec)