In [1]:
from pyspark.sql import SparkSession 

In [2]:
spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

In [6]:
documentDF = spark.createDataFrame([
    ("Word2Vec is an Estimator which takes sequences of words representing".split(" "), ),
    ("The Word2VecModel transforms each document into a vector using".split(" "), ),
    ("his vector can then be used as features for prediction".split(" "), )
], ["text"])  #创建测试DataFrame

In [7]:
from pyspark.ml.feature import Word2Vec

In [8]:
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")

In [9]:
model = word2Vec.fit(documentDF) 

In [10]:
result = model.transform(documentDF) 

In [13]:
for row in result.collect():                #查看转换的特征
    text, vector = row
print(u"文本: [%s] => \n特征向量: %s\n" % (", ".join(text), str(vector)))


文本: [his, vector, can, then, be, used, as, features, for, prediction] => 
特征向量: [0.0018979262560606003,-0.034886953281238677,-0.03560755136422813]



In [15]:
from pyspark.ml.feature import FeatureHasher
dataset = spark.createDataFrame([
    (2.2, True, "1", "foo"),
    (3.3, False, "2", "bar"),
    (4.4, False, "3", "baz"),
    (5.5, False, "4", "foo")
], ["real", "bool", "stringNum", "string"])
hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],outputCol="features")

In [16]:
featurized = hasher.transform(dataset)
featurized.show(truncate=False)

+----+-----+---------+------+--------------------------------------------------------+
|real|bool |stringNum|string|features                                                |
+----+-----+---------+------+--------------------------------------------------------+
|2.2 |true |1        |foo   |(262144,[174475,247670,257907,262126],[2.2,1.0,1.0,1.0])|
|3.3 |false|2        |bar   |(262144,[70644,89673,173866,174475],[1.0,1.0,1.0,3.3])  |
|4.4 |false|3        |baz   |(262144,[22406,70644,174475,187923],[1.0,1.0,4.4,1.0])  |
|5.5 |false|4        |foo   |(262144,[70644,101499,174475,257907],[1.0,1.0,5.5,1.0]) |
+----+-----+---------+------+--------------------------------------------------------+



In [23]:
from pyspark.ml import Transformer
from pyspark.ml import Model
from pyspark.ml import Estimator
isinstance(hasher,Transformer)

False

In [24]:
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



In [25]:
indexed.printSchema()

root
 |-- id: long (nullable = true)
 |-- category: string (nullable = true)
 |-- categoryIndex: double (nullable = false)



In [26]:
isinstance(indexer,Estimator)

True

In [27]:
from pyspark.ml.feature import IndexToString, StringIndexer

In [28]:
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

In [29]:
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

In [30]:
print("Transformed string column '%s' to indexed column '%s'"
      % (indexer.getInputCol(), indexer.getOutputCol()))
indexed.show()

Transformed string column 'category' to indexed column 'categoryIndex'
+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



In [31]:
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed)


In [32]:
print("Transformed indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()

Transformed indexed column 'categoryIndex' back to original string column 'originalCategory' using labels in metadata
+---+-------------+----------------+
| id|categoryIndex|originalCategory|
+---+-------------+----------------+
|  0|          0.0|               a|
|  1|          2.0|               b|
|  2|          1.0|               c|
|  3|          0.0|               a|
|  4|          0.0|               a|
|  5|          1.0|               c|
+---+-------------+----------------+



In [33]:
converted.printSchema()

root
 |-- id: long (nullable = true)
 |-- category: string (nullable = true)
 |-- categoryIndex: double (nullable = false)
 |-- originalCategory: string (nullable = true)



In [37]:
isinstance(converter,Transformer)

True

In [38]:
from pyspark.ml.feature import OneHotEncoderEstimator

In [49]:
df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (3.0, 2.0),
    (2.0, 1.0),
    (3.0, 0.0)
], ["category1", "category2"])

In [50]:
encoder = OneHotEncoderEstimator(inputCols=["category1", "category2"],outputCols=["Vec1", "Vec2"])

In [51]:
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+---------+---------+-------------+-------------+
|category1|category2|         Vec1|         Vec2|
+---------+---------+-------------+-------------+
|      0.0|      1.0|(3,[0],[1.0])|(2,[1],[1.0])|
|      1.0|      0.0|(3,[1],[1.0])|(2,[0],[1.0])|
|      2.0|      1.0|(3,[2],[1.0])|(2,[1],[1.0])|
|      3.0|      2.0|    (3,[],[])|    (2,[],[])|
|      2.0|      1.0|(3,[2],[1.0])|(2,[1],[1.0])|
|      3.0|      0.0|    (3,[],[])|(2,[0],[1.0])|
+---------+---------+-------------+-------------+



In [66]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors


In [75]:
df = spark.createDataFrame([
    (1, Vectors.dense([2.0, 3.0, 10.0, 1.0]), 1.0,),
    (2, Vectors.dense([0.0, 1.0, 7.0, 0.0]), 0.0,),
    (3, Vectors.dense([1.0, 2.0, 3.0, 0.1]), 0.0,)], ["id", "features", "resulted"])


In [76]:
selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="resulted")

In [77]:
result = selector.fit(df).transform(df)

In [78]:
print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()

ChiSqSelector output with top 1 features selected
+---+------------------+--------+----------------+
| id|          features|resulted|selectedFeatures|
+---+------------------+--------+----------------+
|  1|[2.0,3.0,10.0,1.0]|     1.0|           [2.0]|
|  2| [0.0,1.0,7.0,0.0]|     0.0|           [0.0]|
|  3| [1.0,2.0,3.0,0.1]|     0.0|           [1.0]|
+---+------------------+--------+----------------+



In [79]:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

In [80]:
df = spark.createDataFrame([
    Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3})),
    Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))])

In [81]:
slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

In [82]:
output = slicer.transform(df)

In [83]:
output.select("userFeatures", "features").show()

+--------------------+-------------+
|        userFeatures|     features|
+--------------------+-------------+
|(3,[0,1],[-2.0,2.3])|(1,[0],[2.3])|
|      [-2.0,2.3,0.0]|        [2.3]|
+--------------------+-------------+



In [84]:
df = spark.createDataFrame([(Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
                            (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
                            (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),)], ["features"])

In [85]:
vs=VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])

In [90]:
data=vs.transform(df)

In [95]:
data.show(truncate=False)

+-----------------------+----------+
|features               |sliced    |
+-----------------------+----------+
|[-2.0,2.3,0.0,0.0,1.0] |[2.3,1.0] |
|[0.0,0.0,0.0,0.0,0.0]  |[0.0,0.0] |
|[0.6,-1.1,-3.0,4.5,3.3]|[-1.1,3.3]|
+-----------------------+----------+



In [99]:
from pyspark.ml.feature import RFormula
df = spark.createDataFrame([(1.0, 1.0, "a"),(0.0, 2.0, "b"),(0.0, 0.0, "a")], ["y", "x", "s"])

In [100]:
rf = RFormula(formula="y ~ x + s")
data=model = rf.fit(df).transform(df)
data.show()

+---+---+---+---------+-----+
|  y|  x|  s| features|label|
+---+---+---+---------+-----+
|1.0|1.0|  a|[1.0,1.0]|  1.0|
|0.0|2.0|  b|[2.0,0.0]|  0.0|
|0.0|0.0|  a|[0.0,1.0]|  0.0|
+---+---+---+---------+-----+

