# Train Logistic Regression classifier with Apache SparkML


In [1]:
!pip install pyspark==2.4 pyspark2pmml==0.5.1 #mleap==0.15



In [2]:
!wget https://github.com/jpmml/jpmml-sparkml/releases/download/1.5.12/jpmml-sparkml-executable-1.5.12.jar

--2021-05-12 20:20:03--  https://github.com/jpmml/jpmml-sparkml/releases/download/1.5.12/jpmml-sparkml-executable-1.5.12.jar
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/58119316/8010a780-8fc0-11eb-9a54-04ad5b612228?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210512%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210512T202003Z&X-Amz-Expires=300&X-Amz-Signature=67abfb42c219cd7f0fa0dd98639bc5030ba5490a236e61769bec6e5db4aba59e&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=58119316&response-content-disposition=attachment%3B%20filename%3Djpmml-sparkml-executable-1.5.12.jar&response-content-type=application%2Foctet-stream [following]
--2021-05-12 20:20:03--  https://github-releases.githubusercontent.com/58119316/8010a780-8fc0-11eb-9a54-04ad5b612228?X-Amz-Algorithm=AWS4-HMAC-SHA2

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark2pmml import PMMLBuilder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

In [4]:
# @param data_parquet
# @param master Spark master
# @param model_target file name of model file (PMML)
# @returns model as PMML

In [5]:
data_parquet = os.environ.get('data_parquet', 'data.parquet')
master = os.environ.get('master', "local[*]")
model_target = os.environ.get('model_target', "model.pmml")
data_dir = os.environ.get('data_dir', '../../data/')

In [6]:
sc = SparkContext.getOrCreate(
    SparkConf().
    setMaster(master).
    set("spark.jars", 'jpmml-sparkml-executable-1.5.12.jar')
)

spark = SparkSession \
    .builder \
    .getOrCreate()

In [7]:
df = spark.read.parquet(data_dir+data_parquet)

In [8]:
# register a corresponding query table
df.createOrReplaceTempView('df')

In [9]:
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [10]:
indexer = StringIndexer(inputCol="class", outputCol="label")

vectorAssembler = VectorAssembler(inputCols=["x", "y", "z"],
                                  outputCol="features")

normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")

In [11]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [12]:
pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer, lr])

In [13]:
model = pipeline.fit(df_train)

In [14]:
prediction = model.transform(df_train)

In [15]:
binEval = MulticlassClassificationEvaluator(). \
    setMetricName("accuracy"). \
    setPredictionCol("prediction"). \
    setLabelCol("label")

binEval.evaluate(prediction)

0.20668373693116238

In [16]:
pmmlBuilder = PMMLBuilder(sc, df_train, model)
pmmlBuilder.buildFile(model_target)

'/resources/component-library/train/model.pmml'