## Demo_3: Machine Learning Pipeline

Source: https://spark.apache.org/docs/latest/ml-pipeline.html


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Toy training corpus. Each row is an (id, text, label) tuple; in this sample
# the label is 1.0 exactly for the texts that contain the word "spark".
training_rows = [
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
]
training_columns = ["id", "text", "label"]
training = spark.createDataFrame(training_rows, training_columns)

In [0]:
# Assemble a three-stage ML pipeline:
#   1. tokenizer - reads the "text" column and writes "words"
#   2. hashingTF - reads the tokenizer's output column and writes "features"
#   3. lr        - logistic regression with maxIter=10 and regParam=0.001
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline_stages = [tokenizer, hashingTF, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# Train the whole pipeline on the training DataFrame and keep the fitted
# result in `model`, which the prediction cells use to score documents.
model = pipeline.fit(dataset=training)

In [0]:
# Score the training documents with the fitted model, then display every
# column except 'rawPrediction' without truncating the cell contents.
pred_train = model.transform(training)
pred_train_display = pred_train.drop("rawPrediction")
pred_train_display.show(truncate=False)

+---+----------------+-----+----------------------+----------------------------------------------------------------------------+------------------------------------------+----------+
|id |text            |label|words                 |features                                                                    |probability                               |prediction|
+---+----------------+-----+----------------------+----------------------------------------------------------------------------+------------------------------------------+----------+
|0  |a b c d e spark |1.0  |[a, b, c, d, e, spark]|(262144,[74920,89530,107107,148981,167694,173558],[1.0,1.0,1.0,1.0,1.0,1.0])|[0.002628213496942035,0.9973717865030579] |1.0       |
|1  |b d             |0.0  |[b, d]                |(262144,[89530,148981],[1.0,1.0])                                           |[0.9963902711801113,0.0036097288198887467]|0.0       |
|2  |spark f g h     |1.0  |[spark, f, g, h]      |(262144,[36803,173558,209078,22815

In [0]:
# Test corpus with the same (id, text, label) layout as the training data;
# the labels are kept so that accuracy can be computed on the predictions.
test_rows = [
    (4, "spark i j k", 1.0),
    (5, "l m n", 0.0),
    (6, "spark hadoop spark", 1.0),
    (7, "apache hadoop", 0.0),
]
test_columns = ["id", "text", "label"]
test = spark.createDataFrame(test_rows, test_columns)

In [0]:
# Score the test documents with the fitted model, then display every column
# except 'rawPrediction' without truncating the cell contents.
pred_test = model.transform(test)
pred_test_display = pred_test.drop("rawPrediction")
pred_test_display.show(truncate=False)

+---+------------------+-----+----------------------+------------------------------------------------------+----------------------------------------+----------+
|id |text              |label|words                 |features                                              |probability                             |prediction|
+---+------------------+-----+----------------------+------------------------------------------------------+----------------------------------------+----------+
|4  |spark i j k       |1.0  |[spark, i, j, k]      |(262144,[19036,68693,173558,213660],[1.0,1.0,1.0,1.0])|[0.6292098489668488,0.37079015103315116]|0.0       |
|5  |l m n             |0.0  |[l, m, n]             |(262144,[1303,52644,248090],[1.0,1.0,1.0])            |[0.984770006762304,0.015229993237696027]|0.0       |
|6  |spark hadoop spark|1.0  |[spark, hadoop, spark]|(262144,[173558,198017],[2.0,1.0])                    |[0.13412348342566147,0.8658765165743385]|1.0       |
|7  |apache hadoop     |0.0  |[apa

In [0]:
# Compute accuracy on the test set: narrow the scored frame to the
# "prediction" and "label" columns and evaluate with the "accuracy" metric.
predictionAndLabels = pred_test.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print(f"Test set accuracy = {test_accuracy}")

Test set accuracy = 0.75
