## Demo_3: Machine Learning Pipeline

Source: https://spark.apache.org/docs/latest/ml-pipeline.html

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Toy training corpus: one (id, text, label) row per document.
# In this sample, the documents whose text contains "spark" carry label 1.0
# and the others carry label 0.0.
training_rows = [
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
]
training = spark.createDataFrame(training_rows, schema=["id", "text", "label"])

In [0]:
# Assemble the ML pipeline from three stages, applied in this order:
#   1. tokenizer: reads the "text" column and writes "words"
#   2. hashingTF: reads the tokenizer's output column and writes "features"
#   3. lr:        logistic regression (maxIter=10, regParam=0.001)
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
hashingTF = HashingTF(
    # Wire the stages together via the tokenizer's configured output column
    # instead of repeating the column name.
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
lr = LogisticRegression(
    maxIter=10,
    regParam=0.001,
)
pipeline_stages = [tokenizer, hashingTF, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [0]:
# Fit the whole pipeline (tokenizer -> hashingTF -> lr) on the training documents.
# The fitted result is kept in `model` and is used below to transform the test documents.
model = pipeline.fit(training)

In [0]:
# Held-out test documents, with the same (id, text, label) layout as the
# training set. The labels are kept so the predictions can be scored later.
test_rows = [
    (4, "spark i j k", 1.0),
    (5, "l m n", 0.0),
    (6, "spark hadoop spark", 1.0),
    (7, "apache hadoop", 0.0),
]
test = spark.createDataFrame(test_rows, schema=["id", "text", "label"])

In [0]:
# Run the fitted pipeline on the test documents and display the result.
# show() is called without selecting columns, so every column is printed: the
# original id/text/label plus the columns added by the pipeline stages
# (words, features, rawPrediction, probability, prediction).
pred_test = model.transform(test)
pred_test.show()

+---+------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
| id|              text|label|               words|            features|       rawPrediction|         probability|prediction|
+---+------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|  4|       spark i j k|  1.0|    [spark, i, j, k]|(262144,[19036,68...|[0.52882855227968...|[0.62920984896684...|       0.0|
|  5|             l m n|  0.0|           [l, m, n]|(262144,[1303,526...|[4.16914139534005...|[0.98477000676230...|       0.0|
|  6|spark hadoop spark|  1.0|[spark, hadoop, s...|(262144,[173558,1...|[-1.8649814141188...|[0.13412348342566...|       1.0|
|  7|     apache hadoop|  0.0|    [apache, hadoop]|(262144,[68303,19...|[5.41564427200184...|[0.99557321143985...|       0.0|
+---+------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+

In [0]:
# Score the test-set predictions against the true labels using accuracy.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# Only the predicted and true label columns are passed to the evaluator.
predictionAndLabels = pred_test.select("prediction", "label")
test_accuracy = evaluator.evaluate(predictionAndLabels)
print(f"Test set accuracy = {test_accuracy}")

Test set accuracy = 0.75
