## Display Environment Vars

In [1]:
!export

export AIRFLOW_HOME='/root/airflow'
export AKKA_VERSION='2.3.11'
export ALGEBIRD_VERSION='0.11.0'
export ANKUR_PART_VERSION='0.1'
export ATLAS_HOME='/root/atlas-1.4.5'
export ATLAS_VERSION='1.4.5'
export BAZEL_HOME='/root/bazel-0.3.0'
export BAZEL_VERSION='0.3.0'
export BETTER_FILES_VERSION='2.14.0'
export CASSANDRA_HOME='/root/apache-cassandra-2.2.6'
export CASSANDRA_VERSION='2.2.6'
export CLICOLOR='1'
export CODAHALE_METRICS_VERSION='3.1.2'
export COMMONS_DAEMON_VERSION='1.0.15'
export CONFIG_HOME='/root/pipeline/config'
export CONFLUENT_HOME='/root/confluent-3.0.0'
export CONFLUENT_VERSION='3.0.0'
export DATASETS_HOME='/root/pipeline/datasets'
export DEV_INSTALL_HOME='/root'
export DYNOMITE_HOME='/root/dynomite'
export DYNO_VERSION='1.4.6'
export ELASTICSEARCH_HOME='/root/elasticsearch-2.3.0'
export ELASTICSEARCH_VERSION='2.3.0'
export FINAGLE_VERSION='6.34.0'
export FLINK_HOME='/root/flink-1.0.0'
export FLINK_VERSION='1.0.0'
export GENSORT_VERSION='1.5'
e

## Setup Spark and SQL Contexts

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

sparkContext = SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext)

sqlContext

<pyspark.sql.context.SQLContext at 0x7f0850f142b0>

## Setup S3 Credentials

In [3]:
hadoopConf = sparkContext._jsc.hadoopConfiguration()
# Set your AWS Credentials here
myAccessKey = ""
mySecretKey = ""
hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoopConf.set("fs.s3.awsAccessKeyId", myAccessKey)
hadoopConf.set("fs.s3.awsSecretAccessKey", mySecretKey)

## Load Dataset into Spark Cluster

In [4]:
data = sqlContext.read.csv("s3://fluxcapacitor.com/datasets/R/wine.csv", header=True, inferSchema=True)
data.take(10)

AttributeError: 'DataFrameReader' object has no attribute 'csv'

## Build Decision Tree (Regression) with Spark ML Pipeline

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import DecisionTreeRegressor


formula = RFormula(formula = "quality ~ .")
regressor = DecisionTreeRegressor()
pipeline = Pipeline(stages = [formula, regressor])
pipelineModel = pipeline.fit(data)

pipelineModel

## Convert Spark ML Model and Pipeline to PMML

In [None]:
from jpmml import toPMMLBytes

pmmlBytes = toPMMLBytes(sparkContext, data, pipelineModel)

str(pmmlBytes)