From: https://spark.apache.org/docs/2.3.0/ml-pipeline.html

In [1]:
from pyspark.sql import SparkSession

# Cluster sizing: (8 cores, 16gb per machine) x 5 = 40 cores

# New API: build (or reuse) the SparkSession for this notebook.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.87:7077")
    .appName("machine_learning")
    # Dynamic allocation is enabled together with the shuffle service;
    # executors that sit idle for 30s are handed back to the cluster.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Each executor is given 4 cores.
    .config("spark.executor.cores", 4)
    .getOrCreate()
)

In [2]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression Estimator.
# (Uses regression to predict categorical data.
# In this case there are 2 categories, 1 and 0, i.e. a binomial problem.)
# maxIter=10 limits the fit to at most 10 iterations.

lr = LogisticRegression(maxIter=10)

In [3]:
# Training data: one (label, feature vector) tuple per row.
training_rows = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]
training_df = spark_session.createDataFrame(training_rows, ["label", "features"])

# Fitting the estimator on the training frame returns a model (a transformer).
model1 = lr.fit(training_df)

# Show the fitted coefficients as the cell's output.
model1.coefficients

DenseVector([-7.9428, 6.8073, -1.0613])

In [4]:
# Test data: one (label, feature vector) tuple per row.
test_rows = [
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5])),
]
test_df = spark_session.createDataFrame(test_rows, ["label", "features"])

# Predict on the test data with the Transformer.transform() method, which
# returns a data frame. LogisticRegression.transform will only use the
# 'features' column.
prediction_df = model1.transform(test_df)

# Pull the columns of interest back to the driver.
shown_columns = ["features", "label", "probability", "prediction"]
result = prediction_df.select(*shown_columns).collect()

# Print one summary line per test row.
line_template = "features=%s, label=%s -> prob=%s, prediction=%s"
for row in result:
    print(line_template % (row.features, row.label, row.probability, row.prediction))

features=[-1.0,1.5,1.3], label=1.0 -> prob=[4.509635008134821e-08,0.9999999549036498], prediction=1.0
features=[3.0,2.0,-0.1], label=0.0 -> prob=[0.9999530918356334,4.690816436651706e-05], prediction=0.0
features=[0.0,2.2,-1.5], label=1.0 -> prob=[5.5414858687235356e-08,0.9999999445851413], prediction=1.0


In [5]:
# Uncomment to stop the Spark session once you are finished with the notebook.
#spark_session.stop()