# Train Logistic Regression classifier with Apache SparkML


In [None]:
# Install the pinned PySpark release and the pyspark2pmml wrapper imported below
# (the mleap requirement is left commented out).
!pip install pyspark==2.4 pyspark2pmml==0.5.1 #mleap==0.15

In [None]:
# Download the JPMML-SparkML executable jar into the working directory; the same
# file name is passed to Spark through "spark.jars" when the context is created.
!wget https://github.com/jpmml/jpmml-sparkml/releases/download/1.5.12/jpmml-sparkml-executable-1.5.12.jar

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark2pmml import PMMLBuilder

In [None]:
# @param data_parquet file name of the input data (parquet)
# @param master Spark master
# @param model_target file name of model file (PMML)
# @returns model as PMML

In [None]:
# Runtime configuration: every setting is looked up in the environment variable
# of the same name and falls back to a default when that variable is unset.
data_parquet = os.getenv('data_parquet', 'data.parquet')  # input file name
master = os.getenv('master', "local[*]")  # Spark master URL
model_target = os.getenv('model_target', "model.pmml")  # PMML output file name
data_dir = os.getenv('data_dir', '../../data/')  # prefix prepended to data_parquet

In [None]:
# Assemble the Spark configuration step by step: target master plus the
# JPMML-SparkML jar that is registered through "spark.jars".
spark_conf = SparkConf()
spark_conf.setMaster(master)
spark_conf.set("spark.jars", 'jpmml-sparkml-executable-1.5.12.jar')

# Reuse a running context if there is one, otherwise start it with the conf above.
sc = SparkContext.getOrCreate(spark_conf)

# SQL/DataFrame entry point.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the input data set. The path is built with os.path.join instead of plain
# string concatenation so that a data_dir without a trailing separator still
# yields "<data_dir>/<data_parquet>" rather than a fused, non-existent path.
df = spark.read.parquet(os.path.join(data_dir, data_parquet))

In [None]:
# Register the DataFrame as a temporary view named 'df' (replacing any existing
# view of that name) so it can be referenced from Spark SQL queries.
df.createOrReplaceTempView('df')

In [None]:
# Random 80/20 partition of the data; no seed is given, so the split differs
# from run to run.
splits = df.randomSplit([0.8, 0.2])
df_train, df_test = splits

In [None]:
from pyspark.ml.feature import MinMaxScaler, StringIndexer, VectorAssembler

# Map the string column "class" to a numeric "label" column.
indexer = StringIndexer(
    inputCol="class",
    outputCol="label",
)

# Collect the three input columns into a single "features" vector column.
vectorAssembler = VectorAssembler(
    inputCols=["x", "y", "z"],
    outputCol="features",
)

# Rescale the assembled vector and write the result to "features_norm".
normalizer = MinMaxScaler(
    inputCol="features",
    outputCol="features_norm",
)

In [None]:
# Elastic-net regularised logistic regression. featuresCol is set explicitly to
# "features_norm": the estimator otherwise reads the default "features" column,
# which would bypass a scaler that writes its output to "features_norm".
lr = LogisticRegression(
    featuresCol="features_norm",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [None]:
# Stage order matters: label indexing, feature assembly, scaling, then the classifier.
pipeline_stages = [indexer, vectorAssembler, normalizer, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit all pipeline stages on the training split; the result is the fitted
# pipeline model used for prediction and PMML export below.
model = pipeline.fit(df_train)

In [None]:
# Score the held-out split rather than the training data, so the accuracy
# computed from `prediction` is not inflated by rows the model was fitted on.
# NOTE(review): the label indexer is fitted on df_train only; a class value that
# occurs solely in df_test would make this transform fail — confirm acceptable.
prediction = model.transform(df_test)

In [None]:
# Accuracy evaluator comparing the "prediction" column against the "label" column.
binEval = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

# Left as a bare expression so the notebook shows the score as the cell output.
binEval.evaluate(prediction)

In [None]:
# Convert the fitted pipeline model to PMML. The builder receives the Spark
# context, the training DataFrame and the fitted model, and writes the result
# to the file named by model_target.
pmmlBuilder = PMMLBuilder(sc, df_train, model)
pmmlBuilder.buildFile(model_target)