## Explore the Environment

### Setup Environment

In [1]:
!cd ~ && git clone --recursive https://github.com/fluxcapacitor/pipeline.io

fatal: destination path 'pipeline.io' already exists and is not an empty directory.


### Explore Environment

In [2]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-training.sh

switched to context "aws_k8s_training".


In [3]:
!kubectl get pod

NAME                                 READY     STATUS    RESTARTS   AGE
airflow-180xs                        1/1       Running   0          12d
airpal-2h0l7                         1/1       Running   0          12d
cassandra-de9oe                      1/1       Running   0          12d
clustered-tensorflow-master-xsqbd    1/1       Running   0          1d
clustered-tensorflow-ps0-rlihr       1/1       Running   0          1d
clustered-tensorflow-ps1-2l181       1/1       Running   0          1d
clustered-tensorflow-worker0-3o5yv   1/1       Running   0          1d
clustered-tensorflow-worker1-duuf8   1/1       Running   0          1d
elasticsearch-2-3-0-er0i5            1/1       Running   0          12d
hystrix-xlbp6                        1/1       Running   0          2h
jupyterhub-master-26yr8              1/1       Running   0          1d
kafka-0-8-e653j                      1/1       Running   0          12d
kibana-4-5-0-crrbl                   1/1       Running   0

### [Training Cluster](http://kubernetes.demo.pipeline.io/)

In [2]:
from IPython.display import display, HTML

html = '<iframe width=100% height=500px src="http://kubernetes.demo.pipeline.io">'
display(HTML(html))

In [5]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-predictions.sh

switched to context "aws_k8s_predictions".


In [6]:
!kubectl get pod

NAME                       READY     STATUS    RESTARTS   AGE
prediction-codegen-wiu69   1/1       Running   0          55m
prediction-codegen-y7ovp   1/1       Running   0          6m
prediction-pmml-fwric      1/1       Running   0          1h
prediction-pmml-ndwai      1/1       Running   0          6m
turbine-g4j7q              1/1       Running   0          1h
weavescope-app-oh0mh       1/1       Running   0          1h
weavescope-probe-jwdo4     1/1       Running   0          1h


### [Prediction Cluster - AWS](http://kubernetes-aws.demo.pipeline.io/)

In [7]:
from IPython.display import display, HTML

html = '<iframe width=100% height=500px src="http://kubernetes-aws.demo.pipeline.io">'
display(HTML(html))

In [8]:
!/root/pipeline.io/bin/pipeline-context-switch-gcp-predictions.sh

switched to context "gcp_k8s_predictions".


In [9]:
!kubectl get pod

NAME                       READY     STATUS    RESTARTS   AGE
prediction-codegen-gvmko   1/1       Running   0          56m
prediction-codegen-z8puw   1/1       Running   0          6m
prediction-pmml-jz21l      1/1       Running   0          1h
prediction-pmml-mbwy1      1/1       Running   0          6m
turbine-u6588              1/1       Running   0          1h
weavescope-app-nrc6o       1/1       Running   0          1h
weavescope-probe-5rogi     1/1       Running   0          1h


### [Prediction Cluster - GCP](http://kubernetes-gcp.demo.pipeline.io/)

In [1]:
from IPython.display import display, HTML

html = '<iframe width=100% height=500px src="http://kubernetes-gcp.demo.pipeline.io">'
display(HTML(html))

## Generate Spark ML Decision Tree

### Scale Out Spark Cluster

In [11]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-training.sh

switched to context "aws_k8s_training".


In [12]:
!/root/pipeline.io/bin/pipeline-deployment-scale-out-training.sh

replicationcontroller "spark-worker-2-0-1" scaled


In [13]:
!kubectl get pod

NAME                                 READY     STATUS              RESTARTS   AGE
airflow-180xs                        1/1       Running             0          12d
airpal-2h0l7                         1/1       Running             0          12d
cassandra-de9oe                      1/1       Running             0          12d
clustered-tensorflow-master-xsqbd    1/1       Running             0          1d
clustered-tensorflow-ps0-rlihr       1/1       Running             0          1d
clustered-tensorflow-ps1-2l181       1/1       Running             0          1d
clustered-tensorflow-worker0-3o5yv   1/1       Running             0          1d
clustered-tensorflow-worker1-duuf8   1/1       Running             0          1d
elasticsearch-2-3-0-er0i5            1/1       Running             0          12d
hystrix-xlbp6                        1/1       Running             0          2h
jupyterhub-master-26yr8              1/1       Running             0          1d
kafka-0-8-e

### [Spark Admin](http://spark.demo.pipeline.io)

### [Training Cluster](http://kubernetes.demo.pipeline.io/)

In [14]:
from IPython.display import display, HTML

html = '<iframe width=100% height=500px src="http://kubernetes.demo.pipeline.io">'
display(HTML(html))

### Setup SparkSession

In [15]:
from pyspark.sql import SparkSession

sparkSession = SparkSession.builder.getOrCreate()

### Load Training Dataset from S3 into Spark

In [16]:
data = sparkSession.read.format("csv") \
  .option("inferSchema", "true").option("header", "true") \
  .load("s3a://datapalooza/R/census.csv")

data.head()

Row(age=39, workclass='State-gov', education='Bachelors', education_num=13, marital_status='Never-married', occupation='Adm-clerical', relationship='Not-in-family', race='White', sex='Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native_country='United-States', income='<=50K')

### Build Spark ML Pipeline with Decision Tree Classifier

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import DecisionTreeClassifier

formula = RFormula(formula = "income ~ .")
classifier = DecisionTreeClassifier()

pipeline = Pipeline(stages = [formula, classifier])

pipelineModel = pipeline.fit(data)

print(pipelineModel)

PipelineModel_48228db25059e10b95f0


In [18]:
print(pipelineModel.stages[1].toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_43d0b800f7ece489f7a8) of depth 5 with 59 nodes
  If (feature 23 in {0.0})
   If (feature 52 <= 7443.0)
    If (feature 22 <= 13.0)
     If (feature 54 <= 42.0)
      If (feature 0 <= 33.0)
       Predict: 0.0
      Else (feature 0 > 33.0)
       Predict: 0.0
     Else (feature 54 > 42.0)
      If (feature 9 in {0.0})
       Predict: 0.0
      Else (feature 9 not in {0.0})
       Predict: 0.0
    Else (feature 22 > 13.0)
     If (feature 54 <= 43.0)
      If (feature 0 <= 32.0)
       Predict: 0.0
      Else (feature 0 > 32.0)
       Predict: 0.0
     Else (feature 54 > 43.0)
      If (feature 0 <= 32.0)
       Predict: 0.0
      Else (feature 0 > 32.0)
       Predict: 0.0
   Else (feature 52 > 7443.0)
    If (feature 0 <= 20.0)
     If (feature 8 in {0.0})
      Predict: 0.0
     Else (feature 8 not in {0.0})
      Predict: 1.0
    Else (feature 0 > 20.0)
     If (feature 40 in {1.0})
      If (feature 0 <= 37.0)
       Predic

## Convert Spark ML Pipeline to PMML

In [19]:
from jpmml import toPMMLBytes

pmmlBytes = toPMMLBytes(sparkSession, data, pipelineModel)

pmmlBytes.decode("utf-8")

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">\n\t<Header>\n\t\t<Application/>\n\t\t<Timestamp>2016-11-24T17:55:43Z</Timestamp>\n\t</Header>\n\t<DataDictionary>\n\t\t<DataField name="income" optype="categorical" dataType="string">\n\t\t\t<Value value="&lt;=50K"/>\n\t\t\t<Value value="&gt;50K"/>\n\t\t</DataField>\n\t\t<DataField name="workclass" optype="categorical" dataType="string">\n\t\t\t<Value value="Private"/>\n\t\t\t<Value value="Self-emp-not-inc"/>\n\t\t\t<Value value="Local-gov"/>\n\t\t\t<Value value="State-gov"/>\n\t\t\t<Value value="Self-emp-inc"/>\n\t\t\t<Value value="Federal-gov"/>\n\t\t\t<Value value="Without-pay"/>\n\t\t</DataField>\n\t\t<DataField name="education" optype="categorical" dataType="string">\n\t\t\t<Value value="HS-grad"/>\n\t\t\t<Value value="Some-college"/>\n\t\t\t<Value value="Bachelors"/>\n\t\t\t<Value value="Masters"/>\n\t\t\t<Value value="Assoc-voc"/>\n\t\t\t<Value value="11th"/>\n\t\t\

## Deployment Option 1:  Mutable Model Deployment

### Deploy New Model to Live, Running Model Server

In [59]:
from urllib import request

update_url = 'http://prediction-pmml-aws.demo.pipeline.io/update-pmml/census'

update_headers = {}
update_headers['Content-type'] = 'application/xml'

req = request.Request(update_url, headers=update_headers, data=pmmlBytes)
resp = request.urlopen(req)

print(resp.status) # Should return Http Status 200 

200


In [60]:
from urllib import request

update_url = 'http://prediction-pmml-gcp.demo.pipeline.io/update-pmml/census'

update_headers = {}
update_headers['Content-type'] = 'application/xml'

req = request.Request(update_url, headers=update_headers, data=pmmlBytes)
resp = request.urlopen(req)

print(resp.status) # Should return Http Status 200 

200


### Test New Model on Live, Running Model Server

In [22]:
from urllib import request

evaluate_url = 'http://prediction-pmml-aws.demo.pipeline.io/evaluate-pmml/census'

evaluate_headers = {}
evaluate_headers['Content-type'] = 'application/json'
input_params = '{"age":39,"workclass":"State-gov","education":"Bachelors","education_num":13,"marital_status":"Never-married","occupation":"Adm-clerical","relationship":"Not-in-family","race":"White","sex":"Male","capital_gain":2174,"capital_loss":0,"hours_per_week":40,"native_country":"United-States"}' 
encoded_input_params = input_params.encode('utf-8')

req = request.Request(evaluate_url, headers=evaluate_headers, data=encoded_input_params)
resp = request.urlopen(req)

print(resp.read()) # Should return valid classification with probabilities

b'{"results":[[{\'income\': \'NodeScoreDistribution{result=<=50K, probability_entries=[<=50K=0.9564524694636218, >50K=0.04354753053637812], entityId=7, confidence_entries=[]}\'}]]'


In [23]:
from urllib import request
import json

evaluate_url = 'http://prediction-pmml-gcp.demo.pipeline.io/evaluate-pmml/census'

evaluate_headers = {}
evaluate_headers['Content-type'] = 'application/json'
input_params = '{"age":39,"workclass":"State-gov","education":"Bachelors","education_num":13,"marital_status":"Never-married","occupation":"Adm-clerical","relationship":"Not-in-family","race":"White","sex":"Male","capital_gain":2174,"capital_loss":0,"hours_per_week":40,"native_country":"United-States"}' 
encoded_input_params = input_params.encode('utf-8')

req = request.Request(evaluate_url, headers=evaluate_headers, data=encoded_input_params)
resp = request.urlopen(req)

print(resp.read()) # Should return valid classification with probabilities

b'{"results":[[{\'income\': \'NodeScoreDistribution{result=<=50K, probability_entries=[<=50K=0.9564177517937815, >50K=0.04358224820621844], entityId=7, confidence_entries=[]}\'}]]'


## Deployment Option 2:  Immutable Model Deployment

### Save Model to Disk

In [24]:
with open('/root/census.pmml', 'wb') as f:
  f.write(pmmlBytes)

!cat /root/census.pmml

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
	<Header>
		<Application/>
		<Timestamp>2016-11-24T17:55:43Z</Timestamp>
	</Header>
	<DataDictionary>
		<DataField name="income" optype="categorical" dataType="string">
			<Value value="&lt;=50K"/>
			<Value value="&gt;50K"/>
		</DataField>
		<DataField name="workclass" optype="categorical" dataType="string">
			<Value value="Private"/>
			<Value value="Self-emp-not-inc"/>
			<Value value="Local-gov"/>
			<Value value="State-gov"/>
			<Value value="Self-emp-inc"/>
			<Value value="Federal-gov"/>
			<Value value="Without-pay"/>
		</DataField>
		<DataField name="education" optype="categorical" dataType="string">
			<Value value="HS-grad"/>
			<Value value="Some-college"/>
			<Value value="Bachelors"/>
			<Value value="Masters"/>
			<Value value="Assoc-voc"/>
			<Value value="11th"/>
			<Value value="Assoc-acdm"/>
			<Value value="10th"/>
			<Value 

### Commit to Github

In [25]:
# Note:  You may need to run this from a terminal in order to set creds
#!/root/datasticks-github-push.sh

### Deploy New Model Server with New Model from Github

In [26]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-predictions.sh

switched to context "aws_k8s_predictions".


In [27]:
!kubectl get pod

NAME                       READY     STATUS    RESTARTS   AGE
prediction-codegen-wiu69   1/1       Running   0          56m
prediction-codegen-y7ovp   1/1       Running   0          7m
prediction-pmml-fwric      1/1       Running   0          1h
prediction-pmml-ndwai      1/1       Running   0          7m
turbine-g4j7q              1/1       Running   0          1h
weavescope-app-oh0mh       1/1       Running   0          1h
weavescope-probe-jwdo4     1/1       Running   0          1h


## Load Test Predictions Across AWS and Google Cloud

### [Prediction Services Dashboard](http://hystrix.demo.pipeline.io/hystrix-dashboard/monitor/monitor.html?streams=%5B%7B%22name%22%3A%22Predictions%20-%20AWS%22%2C%22stream%22%3A%22http%3A%2F%2Fturbine-aws.demo.pipeline.io%2Fturbine.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%2C%7B%22name%22%3A%22Predictions%20-%20GCP%22%2C%22stream%22%3A%22http%3A%2F%2Fturbine-gcp.demo.pipeline.io%2Fturbine.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%5D)

In [28]:
from IPython.display import display, HTML

html = '<iframe width=100% height=500px src="http://hystrix.demo.pipeline.io/hystrix-dashboard/monitor/monitor.html?streams=%5B%7B%22name%22%3A%22Predictions%20-%20AWS%22%2C%22stream%22%3A%22http%3A%2F%2Fturbine-aws.demo.pipeline.io%2Fturbine.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%2C%7B%22name%22%3A%22Predictions%20-%20GCP%22%2C%22stream%22%3A%22http%3A%2F%2Fturbine-gcp.demo.pipeline.io%2Fturbine.stream%22%2C%22auth%22%3A%22%22%2C%22delay%22%3A%22%22%7D%5D">'
display(HTML(html))

In [29]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-training.sh

switched to context "aws_k8s_training".


## Scale Out Model Servers

### AWS

In [30]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-predictions.sh

switched to context "aws_k8s_predictions".


In [31]:
!kubectl get pod

NAME                       READY     STATUS    RESTARTS   AGE
prediction-codegen-wiu69   1/1       Running   0          56m
prediction-codegen-y7ovp   1/1       Running   0          7m
prediction-pmml-fwric      1/1       Running   0          1h
prediction-pmml-ndwai      1/1       Running   0          7m
turbine-g4j7q              1/1       Running   0          1h
weavescope-app-oh0mh       1/1       Running   0          1h
weavescope-probe-jwdo4     1/1       Running   0          1h


In [32]:
!/root/pipeline.io/bin/pipeline-deployment-scale-out-predictions.sh

replicationcontroller "prediction-pmml" scaled


In [33]:
!kubectl get pod

NAME                       READY     STATUS              RESTARTS   AGE
prediction-codegen-wiu69   1/1       Running             0          56m
prediction-codegen-y7ovp   1/1       Running             0          7m
prediction-pmml-14s4b      0/1       ContainerCreating   0          0s
prediction-pmml-fwric      1/1       Running             0          1h
prediction-pmml-jwh9q      0/1       ContainerCreating   0          0s
prediction-pmml-l9ksy      0/1       ContainerCreating   0          0s
prediction-pmml-ndwai      1/1       Running             0          7m
turbine-g4j7q              1/1       Running             0          1h
weavescope-app-oh0mh       1/1       Running             0          1h
weavescope-probe-jwdo4     1/1       Running             0          1h


### Google

In [34]:
!/root/pipeline.io/bin/pipeline-context-switch-gcp-predictions.sh

switched to context "gcp_k8s_predictions".


In [35]:
!/root/pipeline.io/bin/pipeline-deployment-scale-out-predictions.sh

replicationcontroller "prediction-pmml" scaled


In [36]:
!kubectl get pod

NAME                       READY     STATUS              RESTARTS   AGE
prediction-codegen-gvmko   1/1       Running             0          57m
prediction-codegen-z8puw   1/1       Running             0          7m
prediction-pmml-ap34w      0/1       ContainerCreating   0          0s
prediction-pmml-j08ot      0/1       ContainerCreating   0          0s
prediction-pmml-jz21l      1/1       Running             0          1h
prediction-pmml-mbwy1      1/1       Running             0          7m
prediction-pmml-w7niw      0/1       ContainerCreating   0          0s
turbine-u6588              1/1       Running             0          1h
weavescope-app-nrc6o       1/1       Running             0          1h
weavescope-probe-5rogi     1/1       Running             0          1h


## Scale In and Cleanup

### Training - Spark

In [37]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-training.sh

switched to context "aws_k8s_training".


In [38]:
!kubectl get pod

NAME                                 READY     STATUS              RESTARTS   AGE
airflow-180xs                        1/1       Running             0          12d
airpal-2h0l7                         1/1       Running             0          12d
cassandra-de9oe                      1/1       Running             0          12d
clustered-tensorflow-master-xsqbd    1/1       Running             0          1d
clustered-tensorflow-ps0-rlihr       1/1       Running             0          1d
clustered-tensorflow-ps1-2l181       1/1       Running             0          1d
clustered-tensorflow-worker0-3o5yv   1/1       Running             0          1d
clustered-tensorflow-worker1-duuf8   1/1       Running             0          1d
elasticsearch-2-3-0-er0i5            1/1       Running             0          12d
hystrix-xlbp6                        1/1       Running             0          2h
jupyterhub-master-26yr8              1/1       Running             0          1d
kafka-0-8-e

In [39]:
!/root/pipeline.io/bin/pipeline-deployment-scale-in-training.sh

replicationcontroller "spark-worker-2-0-1" scaled


In [40]:
!kubectl get pod

NAME                                 READY     STATUS        RESTARTS   AGE
airflow-180xs                        1/1       Running       0          12d
airpal-2h0l7                         1/1       Running       0          12d
cassandra-de9oe                      1/1       Running       0          12d
clustered-tensorflow-master-xsqbd    1/1       Running       0          1d
clustered-tensorflow-ps0-rlihr       1/1       Running       0          1d
clustered-tensorflow-ps1-2l181       1/1       Running       0          1d
clustered-tensorflow-worker0-3o5yv   1/1       Running       0          1d
clustered-tensorflow-worker1-duuf8   1/1       Running       0          1d
elasticsearch-2-3-0-er0i5            1/1       Running       0          12d
hystrix-xlbp6                        1/1       Running       0          2h
jupyterhub-master-26yr8              1/1       Running       0          1d
kafka-0-8-e653j                      1/1       Running       0          12d
kibana

### AWS

In [41]:
!/root/pipeline.io/bin/pipeline-context-switch-aws-predictions.sh

switched to context "aws_k8s_predictions".


In [42]:
!kubectl get pod

NAME                       READY     STATUS              RESTARTS   AGE
prediction-codegen-wiu69   1/1       Running             0          56m
prediction-codegen-y7ovp   1/1       Running             0          7m
prediction-pmml-14s4b      0/1       ContainerCreating   0          3s
prediction-pmml-fwric      1/1       Running             0          1h
prediction-pmml-jwh9q      0/1       ContainerCreating   0          3s
prediction-pmml-l9ksy      0/1       ContainerCreating   0          3s
prediction-pmml-ndwai      1/1       Running             0          7m
turbine-g4j7q              1/1       Running             0          1h
weavescope-app-oh0mh       1/1       Running             0          1h
weavescope-probe-jwdo4     1/1       Running             0          1h


In [43]:
!/root/pipeline.io/bin/datasticks-deployment-scale-in-predictions.sh

/bin/sh: 1: /root/pipeline.io/bin/datasticks-deployment-scale-in-predictions.sh: not found


In [44]:
!kubectl get pod

NAME                       READY     STATUS              RESTARTS   AGE
prediction-codegen-wiu69   1/1       Running             0          56m
prediction-codegen-y7ovp   1/1       Running             0          7m
prediction-pmml-14s4b      0/1       ContainerCreating   0          3s
prediction-pmml-fwric      1/1       Running             0          1h
prediction-pmml-jwh9q      0/1       ContainerCreating   0          3s
prediction-pmml-l9ksy      0/1       ContainerCreating   0          3s
prediction-pmml-ndwai      1/1       Running             0          7m
turbine-g4j7q              1/1       Running             0          1h
weavescope-app-oh0mh       1/1       Running             0          1h
weavescope-probe-jwdo4     1/1       Running             0          1h


### Google

In [45]:
!/root/pipeline.io/bin/pipeline-context-switch-gcp-predictions.sh

switched to context "gcp_k8s_predictions".


In [46]:
!kubectl get pod

NAME                       READY     STATUS              RESTARTS   AGE
prediction-codegen-gvmko   1/1       Running             0          57m
prediction-codegen-z8puw   1/1       Running             0          7m
prediction-pmml-ap34w      0/1       ContainerCreating   0          3s
prediction-pmml-j08ot      0/1       ContainerCreating   0          3s
prediction-pmml-jz21l      1/1       Running             0          1h
prediction-pmml-mbwy1      1/1       Running             0          7m
prediction-pmml-w7niw      0/1       ContainerCreating   0          3s
turbine-u6588              1/1       Running             0          1h
weavescope-app-nrc6o       1/1       Running             0          1h
weavescope-probe-5rogi     1/1       Running             0          1h


In [47]:
!/root/pipeline.io/bin/pipeline-deployment-scale-in-predictions.sh

replicationcontroller "prediction-pmml" scaled


In [48]:
!kubectl get pod

NAME                       READY     STATUS        RESTARTS   AGE
prediction-codegen-gvmko   1/1       Running       0          57m
prediction-codegen-z8puw   1/1       Running       0          7m
prediction-pmml-ap34w      0/1       Terminating   0          4s
prediction-pmml-j08ot      0/1       Terminating   0          4s
prediction-pmml-jz21l      1/1       Running       0          1h
prediction-pmml-mbwy1      1/1       Terminating   0          7m
prediction-pmml-w7niw      0/1       Terminating   0          4s
turbine-u6588              1/1       Running       0          1h
weavescope-app-nrc6o       1/1       Running       0          1h
weavescope-probe-5rogi     1/1       Running       0          1h


In [3]:
from IPython.display import display, HTML

html = '<iframe width=100% height=500px src="http://airflow.demo.pipeline.io">'
display(HTML(html))