Source: adapted from the Apache Spark 2.3.0 ML Pipelines guide, https://spark.apache.org/docs/2.3.0/ml-pipeline.html

In [1]:
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer


# Cluster sizing note: (8 cores, 16gb per machine) x 5 = 40 cores

# Build (or reuse) a session through the SparkSession builder API.
# Dynamic allocation is enabled together with the shuffle service,
# the executor idle timeout is set to 30s, and executor cores to 4.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

In [2]:
# Toy text-classification example: the label is 1.0 for the documents that mention "spark".

# Training set, one (id, text, label) tuple per document.
training_rows = [
    (0, "a b c d e spark spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
]
training_df = spark_session.createDataFrame(training_rows, ["id", "text", "label"])

# The ML pipeline is made of three stages: tokenizer -> hashingTF -> lr.

# Stage 1 (transformer): split the "text" column into a "words" column.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
words_df = tokenizer.transform(training_df)
print(words_df.collect())
print('\n\n')

# Stage 2 (transformer): turn the words into a term-frequency "features" vector.
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
example_df = hashingTF.transform(words_df)
print(example_df.collect())
print('\n\n')

# Stage 3 (model): logistic regression with maxIter=10.
lr = LogisticRegression(maxIter=10)

# Chain the three stages into a single pipeline.
pipeline_stages = [tokenizer, hashingTF, lr]
pipeline = Pipeline(stages=pipeline_stages)

[Row(id=0, text='a b c d e spark spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark', 'spark']), Row(id=1, text='b d', label=0.0, words=['b', 'd']), Row(id=2, text='spark f f g h', label=1.0, words=['spark', 'f', 'f', 'g', 'h']), Row(id=3, text='hadoop mapreduce', label=0.0, words=['hadoop', 'mapreduce'])]



[Row(id=0, text='a b c d e spark spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark', 'spark'], features=SparseVector(262144, {17222: 1.0, 27526: 1.0, 28698: 1.0, 30913: 1.0, 227410: 1.0, 234657: 2.0})), Row(id=1, text='b d', label=0.0, words=['b', 'd'], features=SparseVector(262144, {27526: 1.0, 30913: 1.0})), Row(id=2, text='spark f f g h', label=1.0, words=['spark', 'f', 'f', 'g', 'h'], features=SparseVector(262144, {15554: 1.0, 24152: 2.0, 51505: 1.0, 234657: 1.0})), Row(id=3, text='hadoop mapreduce', label=0.0, words=['hadoop', 'mapreduce'], features=SparseVector(262144, {42633: 1.0, 155117: 1.0}))]





In [3]:
# Train the whole pipeline on the labelled training documents.
model = pipeline.fit(training_df)

# Unlabelled test documents, one (id, text) tuple each.
test_rows = [
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop"),
]
test_df = spark_session.createDataFrame(test_rows, ["id", "text"])

# Score the test documents, keep only the columns of interest,
# and print one summary line per document.
report_columns = ["id", "text", "probability", "prediction"]
prediction_df = model.transform(test_df)
selected_df = prediction_df.select(*report_columns)
for rid, text, prob, prediction in selected_df.collect():
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), prediction))

(4, spark i j k) --> prob=[0.22303410097016266,0.7769658990298373], prediction=1.000000
(5, l m n) --> prob=[0.7809592403359353,0.21904075966406467], prediction=0.000000
(6, spark hadoop spark) --> prob=[0.4167570490507484,0.5832429509492517], prediction=1.000000
(7, apache hadoop) --> prob=[0.9910096584033983,0.008990341596601767], prediction=0.000000
