In [34]:
# Spark entry points used by the rest of the notebook.
# SQLContext is imported explicitly so this cell does not depend on the name
# having been pre-defined by whatever environment launched the kernel.
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
sc = SparkContext.getOrCreate()  # reuse a running context if one already exists
sqlContext = SQLContext(sc)

In [35]:
# Read the pen-digit file into an RDD of float lists (16 pixels and label).
# Each text line is split on ", " and every resulting token is passed to float().
pen_lines = sc.textFile("../Data/penbased.dat", 4)  # 4 = requested minimum number of partitions
pen_tokens = pen_lines.map(lambda line: line.split(", "))
pen_raw = pen_tokens.map(lambda tokens: [float(tok) for tok in tokens])

In [36]:
#Create a DataFrame
from pyspark.sql.types import *
from pyspark.sql import Row

# Column layout: pix1 .. pix16 followed by the label, every column a nullable double.
# The names are generated rather than spelled out so the schema and the row
# construction below cannot drift apart.
NUM_PIXELS = 16
pen_columns = ["pix%d" % i for i in range(1, NUM_PIXELS + 1)] + ["label"]
penschema = StructType([StructField(name, DoubleType(), True) for name in pen_columns])

# Only the leading values that have a schema column are kept (the first 17),
# matching the hand-indexed x[0]..x[16] this replaces.
num_columns = len(pen_columns)
dfpen = sqlContext.createDataFrame(pen_raw.map(lambda x: Row(*x[:num_columns])), penschema)

In [37]:
# Create Training and Test data.
# randomSplit is stochastic: without a seed every run yields a different 80/20
# partition, so the error rate and confusion counts computed further down could
# not be reproduced. A fixed seed pins the split.
SPLIT_SEED = 42
pendtsets = dfpen.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
pendttrain = pendtsets[0].cache()  # 80% share, cached because it is reused for fitting
pendtvalid = pendtsets[1].cache()  # 20% share, cached because it is reused for evaluation

In [38]:
# Transformer - Vector Assembler.
# Packs every DataFrame column except the last one into a single "features" vector column.
from pyspark.ml.feature import VectorAssembler
feature_cols = dfpen.columns[:-1]
va = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [39]:
# Estimator - DecisionTreeClassifier; fitting it yields the trained tree model (a transformer).
# Input/label column names are not set here, so the estimator's defaults apply.
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(
    maxDepth=20,
    maxBins=32,
    minInstancesPerNode=1,
    minInfoGain=0,
)

In [40]:
# Chain the assembler and the tree into one pipeline and fit it on the training split.
from pyspark.ml import Pipeline
pipeline_stages = [va, dt]
pipeline = Pipeline(stages=pipeline_stages)
dtmodel = pipeline.fit(pendttrain)

In [44]:
# Run the fitted pipeline on the held-out split; the resulting DataFrame is what
# the evaluation cells read their 'label' and 'prediction' columns from.
dtpredicts = dtmodel.transform(pendtvalid)

In [42]:
# Score the predictions with the accuracy metric and report its complement as the test error.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(dtpredicts)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)

Test Error = 0.0364635


In [43]:
# Tally how often each (label, prediction) pair occurs: a dictionary-form confusion matrix.
label_vs_pred = dtpredicts.select('label', 'prediction').rdd
label_vs_pred.countByValue()

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 218,
             Row(label=0.0, prediction=1.0): 1,
             Row(label=0.0, prediction=3.0): 1,
             Row(label=0.0, prediction=4.0): 1,
             Row(label=0.0, prediction=8.0): 2,
             Row(label=0.0, prediction=9.0): 1,
             Row(label=1.0, prediction=1.0): 198,
             Row(label=1.0, prediction=2.0): 11,
             Row(label=1.0, prediction=3.0): 1,
             Row(label=1.0, prediction=4.0): 1,
             Row(label=1.0, prediction=5.0): 1,
             Row(label=1.0, prediction=7.0): 3,
             Row(label=1.0, prediction=9.0): 1,
             Row(label=2.0, prediction=1.0): 7,
             Row(label=2.0, prediction=2.0): 182,
             Row(label=2.0, prediction=3.0): 1,
             Row(label=2.0, prediction=7.0): 1,
             Row(label=3.0, prediction=2.0): 2,
             Row(label=3.0, prediction=3.0): 207,
             Row(label=3.0, prediction=5.0): 2,
             R