* Local vectors

In [1]:
from pyspark.ml.linalg import Vectors

# Dense vector: every component is stored, zeros included.
dv = Vectors.dense([1.0, 0.0, 3.0])
# Sparse vector of size 3: only the non-zero components are given,
# here as an {index: value} mapping.
sv = Vectors.sparse(3, {0: 1.0, 2: 3.0})
print(dv, sv, sep="\n")

[1.0,0.0,3.0]
(3,[0,2],[1.0,3.0])


* Labeled points

In [2]:
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# A LabeledPoint pairs a label (a double) with a feature vector.
# `neg` gets its features from a plain list, `pos` from a sparse vector.
neg = LabeledPoint(label=0.0, features=[2.0, 5.0, 3.0])
pos = LabeledPoint(label=1.0, features=SparseVector(3, {0: 1.0, 2: 3.0}))
print(neg, pos, sep="\n")

(0.0,[2.0,5.0,3.0])
(1.0,(3,[0,2],[1.0,3.0]))


* Local matrix

In [3]:
from pyspark.ml.linalg import Matrices

# Dense 3x2 matrix. The values are listed column by column:
# first column 9, 0, 0 and then second column 0, 8, 6.
dm = Matrices.dense(numRows=3, numCols=2, values=[9, 0, 0, 0, 8, 6])

# Sparse 3x2 matrix in compressed-sparse-column (CSC) layout:
#   colPtrs    - column j owns the stored entries colPtrs[j] .. colPtrs[j+1]-1
#   rowIndices - row index of each stored entry
#   values     - the stored entries themselves
# Here column 0 holds 9 at row 0; column 1 holds 6 at row 2 and 8 at row 1.
sm = Matrices.sparse(
    numRows=3,
    numCols=2,
    colPtrs=[0, 1, 3],
    rowIndices=[0, 2, 1],
    values=[9, 6, 8],
)

print(dm)
print(sm)


DenseMatrix([[9., 0.],
             [0., 8.],
             [0., 6.]])
3 X 2 CSCMatrix
(0,0) 9.0
(2,1) 6.0
(1,1) 8.0


* Sparse labeled data (LIBSVM format)

Each line has the format: label index1:value1 index2:value2 ...
(Indices start from 1 in the file; after loading, each index is reduced by 1, so they become zero-based.)

In [4]:
# Disabled example: read a LIBSVM-formatted file through the DataFrame reader.
# Uncomment once sample_libsvm_data.txt is available at this (relative) path.
# myDF= spark.read.format("libsvm").load("sample_libsvm_data.txt")

* Pipeline for estimators

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

# `spark` is used from this cell onwards but is never created in the notebook;
# it only exists when the kernel pre-defines it. getOrCreate() returns the
# active session if there is one and starts one otherwise, so this cell also
# works after Restart Kernel -> Run All on a plain Python kernel.
spark = SparkSession.builder.getOrCreate()

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
], ["id", "text", "label"])

# Configure an ML pipeline with three stages: tokenizer -> hashingTF -> lr.
# Each stage reads the column written by the previous one:
# "text" -> "words" -> "features".
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the whole pipeline to the training documents.
model = pipeline.fit(training)

In [8]:
# Unlabeled test documents: (id, text) only, no "label" column.
test = spark.createDataFrame(
    [
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

# Run the fitted pipeline over the test documents; show() prints the input
# columns together with every column the pipeline stages appended.
prediction = model.transform(test)
prediction.show()

+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+
| id|              text|               words|            features|       rawPrediction|         probability|prediction|
+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  4|       spark i j k|    [spark, i, j, k]|(262144,[20197,24...|[-1.6609033227472...|[0.15964077387874...|       1.0|
|  5|             l m n|           [l, m, n]|(262144,[18910,10...|[1.64218895265644...|[0.83783256854767...|       0.0|
|  6|spark hadoop spark|[spark, hadoop, s...|(262144,[155117,2...|[-2.5980142174393...|[0.06926633132976...|       1.0|
|  7|     apache hadoop|    [apache, hadoop]|(262144,[66695,15...|[4.00817033336812...|[0.98215753334442...|       0.0|
+---+------------------+--------------------+--------------------+--------------------+--------------------+----------+



* Vector Assembler (a transformer)

In [18]:
from pyspark.sql import SparkSession

# Obtain a session handle via the builder (an already-running session is reused).
spark2 = SparkSession.builder.getOrCreate()

# One tuple per row: (colA: int, colB: float, colC: bool).
profilesList = [
    (1, 4.5, True),
    (2, 0.6, True),
    (3, 1.5, False),
    (4, 12.1, True),
    (5, 0.0, True),
]
inputDF = spark2.createDataFrame(profilesList, schema=['colA', 'colB', 'colC'])
inputDF.show()

+----+----+-----+
|colA|colB| colC|
+----+----+-----+
|   1| 4.5| true|
|   2| 0.6| true|
|   3| 1.5|false|
|   4|12.1| true|
|   5| 0.0| true|
+----+----+-----+



In [19]:
from pyspark.ml.feature import VectorAssembler

# Columns to merge, in the order they should appear inside the vector.
feat_cols = ['colA', 'colB', 'colC']

# The assembler packs the listed columns into a single 'features' vector
# column; the boolean colC shows up as 1.0 / 0.0 in the output.
vectorAssembler = VectorAssembler(inputCols=feat_cols, outputCol='features')
transformedDF = vectorAssembler.transform(inputDF)
transformedDF.show()

+----+----+-----+--------------+
|colA|colB| colC|      features|
+----+----+-----+--------------+
|   1| 4.5| true| [1.0,4.5,1.0]|
|   2| 0.6| true| [2.0,0.6,1.0]|
|   3| 1.5|false| [3.0,1.5,0.0]|
|   4|12.1| true|[4.0,12.1,1.0]|
|   5| 0.0| true| [5.0,0.0,1.0]|
+----+----+-----+--------------+



* Standard scaler (Estimator)

Transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation and/or zero mean.

In [20]:
from pyspark.ml.feature import StandardScaler

# withMean=True centers each feature, withStd=True scales it to unit
# standard deviation.
scaler = StandardScaler(
    inputCol='features',
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)

# StandardScaler is an estimator: fit() produces the model, and the model's
# transform() then appends the "scaledFeatures" column.
scalerModel = scaler.fit(transformedDF)
scaledDF = scalerModel.transform(transformedDF)
scaledDF.show()

+----+----+-----+--------------+--------------------+
|colA|colB| colC|      features|      scaledFeatures|
+----+----+-----+--------------+--------------------+
|   1| 4.5| true| [1.0,4.5,1.0]|[-1.2649110640673...|
|   2| 0.6| true| [2.0,0.6,1.0]|[-0.6324555320336...|
|   3| 1.5|false| [3.0,1.5,0.0]|[0.0,-0.449503858...|
|   4|12.1| true|[4.0,12.1,1.0]|[0.63245553203367...|
|   5| 0.0| true| [5.0,0.0,1.0]|[1.26491106406735...|
+----+----+-----+--------------+--------------------+



* Tokenizer (transformer)
Takes text (such as a sentence) and breaks it into individual terms (usually words).

In [21]:
from pyspark.ml.feature import Tokenizer

# Two sample sentences keyed by id.
sentenceDF = spark.createDataFrame(
    [
        (0, "Hi I heard about Spark"),
        (1, "I wish we can have more Spark classes"),
    ],
    schema=["id", "sentence"],
)

# Tokenizer is a plain transformer (no fit step): it adds a "words" column
# holding the terms of each sentence, lower-cased as the output shows.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenizedDF = tokenizer.transform(sentenceDF)
tokenizedDF.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish we can hav...|[i, wish, we, can...|
+---+--------------------+--------------------+



* StringIndexer (estimator) / IndexToString (transformer)

StringIndexer transforms a categorical (string) class label into a numerical index; IndexToString maps the index back to the original label.

In [30]:
from pyspark.ml.feature import StringIndexer

# Each row: (string class label, list of feature values).
inputDF = spark.createDataFrame(
    [
        ("Positive", [0.0, 1.1, 0.1]),
        ("Negative", [0.1, 1.2, 0.2]),
        ("Positive", [0.0, 1.3, 0.3]),
        ("Negative", [0.3, 1.4, 0.4]),
        ("Positive", [0.0, 1.4, 0.5]),
    ],
    schema=["categoricalLabel", "features"],
)

# StringIndexer is an estimator: fit() learns the string -> index mapping,
# and the resulting model writes the numeric index into the "label" column.
indexer = StringIndexer(inputCol="categoricalLabel", outputCol="label")
indexerModel = indexer.fit(inputDF)
indexedDF = indexerModel.transform(inputDF)
indexedDF.show()

+----------------+---------------+-----+
|categoricalLabel|       features|label|
+----------------+---------------+-----+
|        Positive|[0.0, 1.1, 0.1]|  0.0|
|        Negative|[0.1, 1.2, 0.2]|  1.0|
|        Positive|[0.0, 1.3, 0.3]|  0.0|
|        Negative|[0.3, 1.4, 0.4]|  1.0|
|        Positive|[0.0, 1.4, 0.5]|  0.0|
+----------------+---------------+-----+



In [29]:
from pyspark.ml.feature import IndexToString

# No labels= argument is passed, so the converter has to recover the original
# strings from what StringIndexer attached to the "label" column.
converter = IndexToString(inputCol="label", outputCol="originalLabel")

# Drop the original string column so only the numeric label remains.
indexedDF2 = indexedDF.select("features", "label")
indexedDF2.show()

# Map the numeric label back to its string form in "originalLabel".
convertedDF2 = converter.transform(indexedDF2)
convertedDF2.show()

+---------------+-----+
|       features|label|
+---------------+-----+
|[0.0, 1.1, 0.1]|  0.0|
|[0.1, 1.2, 0.2]|  1.0|
|[0.0, 1.3, 0.3]|  0.0|
|[0.3, 1.4, 0.4]|  1.0|
|[0.0, 1.4, 0.5]|  0.0|
+---------------+-----+

+---------------+-----+-------------+
|       features|label|originalLabel|
+---------------+-----+-------------+
|[0.0, 1.1, 0.1]|  0.0|     Positive|
|[0.1, 1.2, 0.2]|  1.0|     Negative|
|[0.0, 1.3, 0.3]|  0.0|     Positive|
|[0.3, 1.4, 0.4]|  1.0|     Negative|
|[0.0, 1.4, 0.5]|  0.0|     Positive|
+---------------+-----+-------------+



* OneHotEncoderEstimator

In [3]:
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    # Spark 3.0 renamed OneHotEncoderEstimator to OneHotEncoder (same
    # inputCols/outputCols API), so fall back to the new name there.
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame([
    ('Fog', 30),
    ('Rain', 25),
    ('Sun', 36),
], ["Weather", "Temperature"])

# Step 1: turn the string category into a numeric index, which is what the
# one-hot encoder takes as input.
indexer = StringIndexer(inputCol="Weather", outputCol="WeatherIndex")
indexerModel = indexer.fit(df)
indexedDF = indexerModel.transform(df)

# Step 2: one-hot encode the index. The encoder is an estimator, so it is
# fitted first. In the output the vectors have size 2 for three categories and
# the last index encodes as all zeros (the encoder's dropLast default).
encoder = OneHotEncoderEstimator(inputCols=["WeatherIndex"], outputCols=["WeatherOneHot"])
model = encoder.fit(indexedDF)
encodedDF = model.transform(indexedDF)
encodedDF.show()

+-------+-----------+------------+-------------+
|Weather|Temperature|WeatherIndex|WeatherOneHot|
+-------+-----------+------------+-------------+
|    Fog|         30|         1.0|(2,[1],[1.0])|
|   Rain|         25|         0.0|(2,[0],[1.0])|
|    Sun|         36|         2.0|    (2,[],[])|
+-------+-----------+------------+-------------+

