## Generate Spark ML Decision Tree

### Step 0: Load Libraries, Data, and SparkSession

In [1]:
# You may need to Reconnect (more than Restart) the Kernel to pick up changes to these settings
import os

# Each spark-submit option group lives in its own variable so it can be tuned independently.
master = '--master spark://apachespark-master-2-1-0:7077'
conf = '--conf spark.cores.max=1 --conf spark.executor.memory=512m'
packages = '--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1'
jars = '--jars /root/lib/jpmml-sparkml-package-1.0-SNAPSHOT.jar'
py_files = '--py-files /root/lib/jpmml.py'

# Join the groups with single spaces; 'pyspark-shell' is always the final token.
submit_args = [master, conf, packages, jars, py_files, 'pyspark-shell']
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(submit_args)

# Echo the final value so the effective settings are visible in the cell output.
print(os.environ['PYSPARK_SUBMIT_ARGS'])

--master spark://apachespark-master-2-1-0:7077 --conf spark.cores.max=1 --conf spark.executor.memory=512m --packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1 --jars /root/lib/jpmml-sparkml-package-1.0-SNAPSHOT.jar --py-files /root/lib/jpmml.py pyspark-shell


In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import DecisionTreeClassifier

In [3]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-active session if one exists, otherwise it starts a new one.
sessionBuilder = SparkSession.builder
sparkSession = sessionBuilder.getOrCreate()

In [5]:
# Configure a CSV reader that treats the first row as column names and
# infers each column's type from the file contents.
csvReader = (
    sparkSession.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
)
data = csvReader.load("s3a://datapalooza/R/census.csv")

# Display the first row as a quick check of the parsed columns.
data.head()

Row(age=39, workclass='State-gov', education='Bachelors', education_num=13, marital_status='Never-married', occupation='Adm-clerical', relationship='Not-in-family', race='White', sex='Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native_country='United-States', income='<=50K')

### Step 2: Build Spark ML Pipeline with Decision Tree Classifier

In [6]:
# R-style formula: `income` is the label, `.` selects every remaining column as a feature.
formula = RFormula(formula="income ~ .")
classifier = DecisionTreeClassifier()

# Stage order matters: the formula stage feeds the classifier.
pipelineStages = [formula, classifier]
pipeline = Pipeline(stages=pipelineStages)

# Fit both stages on the full data set.
pipelineModel = pipeline.fit(data)

print(pipelineModel)

PipelineModel_4a9fbe91e53dae9042fd


In [7]:
# Stage index 1 is the fitted decision tree (index 0 is the fitted RFormula stage).
decisionTreeModel = pipelineModel.stages[1]
print(decisionTreeModel.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_42ba9a283ddea90a3c60) of depth 5 with 57 nodes
  If (feature 23 in {0.0})
   If (feature 52 <= 7443.0)
    If (feature 22 <= 13.0)
     If (feature 54 <= 42.0)
      If (feature 53 <= 1876.0)
       Predict: 0.0
      Else (feature 53 > 1876.0)
       Predict: 0.0
     Else (feature 54 > 42.0)
      If (feature 9 in {0.0})
       Predict: 0.0
      Else (feature 9 not in {0.0})
       Predict: 0.0
    Else (feature 22 > 13.0)
     If (feature 54 <= 43.0)
      If (feature 0 <= 32.0)
       Predict: 0.0
      Else (feature 0 > 32.0)
       Predict: 0.0
     Else (feature 54 > 43.0)
      If (feature 0 <= 32.0)
       Predict: 0.0
      Else (feature 0 > 32.0)
       Predict: 0.0
   Else (feature 52 > 7443.0)
    If (feature 0 <= 20.0)
     If (feature 8 in {0.0})
      Predict: 0.0
     Else (feature 8 not in {0.0})
      Predict: 1.0
    Else (feature 0 > 20.0)
     If (feature 0 <= 64.0)
      If (feature 2 in {1.0})
       P

### Step 3: Convert Spark ML Pipeline to PMML

In [11]:
from jpmml import toPMMLBytes

# Convert the fitted pipeline to PMML; the result is raw bytes that later cells send and save as-is.
pmmlBytes = toPMMLBytes(sparkSession, data, pipelineModel)

# Decode only for display; pmmlBytes itself is left untouched.
pmmlText = pmmlBytes.decode("utf-8")
print(pmmlText)

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
	<Header>
		<Application/>
		<Timestamp>2017-04-15T17:02:23Z</Timestamp>
	</Header>
	<DataDictionary>
		<DataField name="income" optype="categorical" dataType="string">
			<Value value="&lt;=50K"/>
			<Value value="&gt;50K"/>
		</DataField>
		<DataField name="workclass" optype="categorical" dataType="string">
			<Value value="Private"/>
			<Value value="Self-emp-not-inc"/>
			<Value value="Local-gov"/>
			<Value value="State-gov"/>
			<Value value="Self-emp-inc"/>
			<Value value="Federal-gov"/>
			<Value value="Without-pay"/>
		</DataField>
		<DataField name="education" optype="categorical" dataType="string">
			<Value value="HS-grad"/>
			<Value value="Some-college"/>
			<Value value="Bachelors"/>
			<Value value="Masters"/>
			<Value value="Assoc-voc"/>
			<Value value="11th"/>
			<Value value="Assoc-acdm"/>
			<Value value="10th"/>
			<Value value="7th-8th"/>
			<Value v

### Deployment Option 1:  Mutable Model Deployment

In [12]:
from urllib import request

# Endpoint that replaces the deployed model with the PMML document in the request body.
update_url = 'http://prediction-pmml.demo.pipeline.io/update-pmml/default/pmml_census/v0'

update_headers = {}
update_headers['Content-type'] = 'application/xml'

# Supplying `data` makes urllib issue a POST with the PMML bytes as the body.
req = request.Request(update_url, headers=update_headers, data=pmmlBytes)

# Use the response as a context manager so the HTTP connection is always closed,
# even if printing fails. urlopen() itself raises on HTTP error statuses.
with request.urlopen(req) as resp:
    print(resp.status) # Should return HTTP status 200

200


In [13]:
from urllib import request

# Endpoint that scores one JSON-encoded record against the deployed PMML model.
evaluate_url = 'http://prediction-pmml.demo.pipeline.io/evaluate-pmml/default/pmml_census/v0'

evaluate_headers = {}
evaluate_headers['Content-type'] = 'application/json'
# Same field values as the first census row shown by data.head(), minus the `income` label.
input_params = '{"age":39,"workclass":"State-gov","education":"Bachelors","education_num":13,"marital_status":"Never-married","occupation":"Adm-clerical","relationship":"Not-in-family","race":"White","sex":"Male","capital_gain":2174,"capital_loss":0,"hours_per_week":40,"native_country":"United-States"}'
encoded_input_params = input_params.encode('utf-8')

# Supplying `data` makes urllib issue a POST with the JSON bytes as the body.
req = request.Request(evaluate_url, headers=evaluate_headers, data=encoded_input_params)

# Use the response as a context manager so the HTTP connection is always closed
# once the body has been read.
with request.urlopen(req) as resp:
    print(resp.read()) # Should return valid classification with probabilities

b'{"results":[[{\'income\': \'NodeScoreDistribution{result=<=50K, probability_entries=[<=50K=0.9791280929136509, >50K=0.0208719070863491], entityId=6, confidence_entries=[]}\'}]]}'


### Model Server Dashboard
Fill in `<your-ip>` below, then copy/paste the URL into your browser
```
http://<your-ip>:47979/hystrix-dashboard/monitor/monitor.html?streams=%5B%7B%22name%22%3A%22%22%2C%22stream%22%3A%22http%3A%2F%2F<your-ip>%3A39043%2Fhystrix.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%2C%7B%22name%22%3A%22%22%2C%22stream%22%3A%22http%3A%2F%2F<your-ip>%3A39042%2Fhystrix.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%2C%7B%22name%22%3A%22%22%2C%22stream%22%3A%22http%3A%2F%2F<your-ip>%3A39041%2Fhystrix.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%2C%7B%22name%22%3A%22%22%2C%22stream%22%3A%22http%3A%2F%2F<your-ip>%3A39040%2Fhystrix.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%5D
```

### TODO:  Deployment Option 2:  Immutable Model Deployment

Save Model to Disk

In [None]:
!mkdir -p /root/src/pmml/census/

with open('/root/src/pmml/census/pmml_census.pmml', 'wb') as f:
  f.write(pmmlBytes)

!ls /root/src/pmml/census/pmml_census.pmml

TODO:  Trigger Airflow to Build New Docker Image (e.g. via GitHub commit)

### Load Test

In [None]:
# Run the load-test script against the census JMX test plan.
# NOTE(review): assumes start-loadtest.sh is on PATH and SOURCE_HOME is set — confirm.
!start-loadtest.sh $SOURCE_HOME/loadtest/RecommendationServiceStressTest-local-census.jmx