In [2]:
from pyspark import SparkContext
from pyspark.sql.types import *
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [91]:
#Load the data and create an RDD (16 pixels and label)
pen_raw = sc.textFile("../Data/penbased.dat", 4).map(lambda x:  x.split(", ")).map(lambda row: [float(x) for x in row])
pen_raw.take(1)

[[47.0,
  100.0,
  27.0,
  81.0,
  57.0,
  37.0,
  26.0,
  0.0,
  0.0,
  23.0,
  56.0,
  53.0,
  100.0,
  90.0,
  40.0,
  98.0,
  8.0]]

In [7]:
#Create a DataFrame
from pyspark.sql.types import *
from pyspark.sql import Row
penschema = StructType([
    StructField("pix1",DoubleType(),True),
    StructField("pix2",DoubleType(),True),
    StructField("pix3",DoubleType(),True),
    StructField("pix4",DoubleType(),True),
    StructField("pix5",DoubleType(),True),
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])

dfpen = sqlContext.createDataFrame(pen_raw.map(lambda x : Row(x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15],x[16])), penschema)

In [8]:
dfpen.show()

+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8| pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0|  0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
|  0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0| 15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
|  0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0| 76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
|  0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0| 86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
|  0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0| 60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
|100.0|100.0|88.0| 99.0| 49.0| 74.0| 17.0| 47.0|  0.0| 16.0| 37.0|  0.0| 73.0| 16.0| 20.0| 20.0|  6.0|
|  0.0|100.0| 3.0| 72.0| 26.0| 35.0| 85.0| 35.0|100.0| 71.0| 73.0| 97.0| 

In [9]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1]) #except the last col.
penlpoints = va.transform(dfpen).select("features", "label")

In [10]:
penlpoints.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.0,100.0,27.0,...|  8.0|
|[0.0,89.0,27.0,10...|  2.0|
|[0.0,57.0,31.0,68...|  1.0|
|[0.0,100.0,7.0,92...|  4.0|
|[0.0,67.0,49.0,83...|  1.0|
|[100.0,100.0,88.0...|  6.0|
|[0.0,100.0,3.0,72...|  4.0|
|[0.0,39.0,2.0,62....|  0.0|
|[13.0,89.0,12.0,5...|  5.0|
|[74.0,87.0,31.0,1...|  9.0|
|[48.0,96.0,62.0,6...|  8.0|
|[100.0,100.0,72.0...|  5.0|
|[91.0,74.0,54.0,1...|  9.0|
|[0.0,85.0,38.0,10...|  7.0|
|[35.0,76.0,57.0,1...|  3.0|
|[50.0,84.0,66.0,1...|  3.0|
|[99.0,80.0,63.0,1...|  9.0|
|[24.0,66.0,43.0,1...|  2.0|
|[0.0,73.0,19.0,99...|  2.0|
|[12.0,77.0,20.0,6...|  5.0|
+--------------------+-----+
only showing top 20 rows



In [77]:
# Create Training and Test data.
pendtsets = penlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

In [145]:
# Train the data.
from pyspark.ml.classification import DecisionTreeClassifier
# Paramenters
#maxDepth : maximum tree depth (default : 5).
#maxBins : maximum number of bins when binning continuous features (default : 32).
#minInstancesPerNode : minimum number of dataset samples each branch needs to have after a split (default : 1).
#minInfoGain : minimum information gain for a split (default : 0).
dt = DecisionTreeClassifier(maxDepth=20, maxBins= 32, minInstancesPerNode=1, minInfoGain = 0)
dtmodel = dt.fit(pendttrain)

In [153]:
print dtmodel._call_java('toDebugString')

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4d259a68cb46f11069b6) of depth 20 with 609 nodes
  If (feature 13 <= 57.0)
   If (feature 4 <= 41.0)
    If (feature 9 <= 20.0)
     If (feature 14 <= 62.0)
      If (feature 10 <= 24.0)
       If (feature 1 <= 96.0)
        Predict: 8.0
       Else (feature 1 > 96.0)
        Predict: 6.0
      Else (feature 10 > 24.0)
       If (feature 5 <= 31.0)
        Predict: 4.0
       Else (feature 5 > 31.0)
        Predict: 6.0
     Else (feature 14 > 62.0)
      If (feature 15 <= 26.0)
       If (feature 10 <= 12.0)
        Predict: 1.0
       Else (feature 10 > 12.0)
        If (feature 10 <= 45.0)
         Predict: 2.0
        Else (feature 10 > 45.0)
         Predict: 4.0
      Else (feature 15 > 26.0)
       If (feature 12 <= 68.0)
        If (feature 0 <= 0.0)
         Predict: 7.0
        Else (feature 0 > 0.0)
         Predict: 8.0
       Else (feature 12 > 68.0)
        If (feature 10 <= 37.0)
         Predict: 6.0
        El

In [79]:
#Test data.
dtpredicts = dtmodel.transform(pendtvalid)

In [80]:
dtpredicts.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.0,42.0,18....|  9.0|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       9.0|
|[0.0,17.0,0.0,41....|  1.0|[0.0,28.0,0.0,0.0...|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,34.0,43.0,67...|  1.0|[0.0,107.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,38.0,31.0,57...|  1.0|[0.0,449.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,40.0,29.0,56...|  1.0|[0.0,449.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,41.0,47.0,58...|  1.0|[0.0,449.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,43.0,26.0,65...|  1.0|[0.0,449.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,43.0,35.0,60...|  1.0|[0.0,449.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,45.0,38.0,54...|  1.0|[0.0,449.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,47.0,7.0,5

In [125]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dtpredicts)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.0394867


In [127]:
dtpredicts.select('label','prediction').rdd.map(lambda x : (x,1)).countByKey() 

defaultdict(int,
            {Row(label=0.0, prediction=0.0): 207,
             Row(label=0.0, prediction=4.0): 1,
             Row(label=0.0, prediction=5.0): 1,
             Row(label=0.0, prediction=6.0): 1,
             Row(label=0.0, prediction=8.0): 1,
             Row(label=0.0, prediction=9.0): 1,
             Row(label=1.0, prediction=0.0): 1,
             Row(label=1.0, prediction=1.0): 179,
             Row(label=1.0, prediction=2.0): 5,
             Row(label=1.0, prediction=3.0): 1,
             Row(label=1.0, prediction=4.0): 1,
             Row(label=1.0, prediction=6.0): 3,
             Row(label=1.0, prediction=7.0): 2,
             Row(label=2.0, prediction=1.0): 6,
             Row(label=2.0, prediction=2.0): 193,
             Row(label=2.0, prediction=3.0): 1,
             Row(label=2.0, prediction=7.0): 2,
             Row(label=3.0, prediction=1.0): 3,
             Row(label=3.0, prediction=3.0): 175,
             Row(label=3.0, prediction=7.0): 2,
             Ro

In [128]:
#Depreciated in Spark 2.0 -- Use accuracy
from pyspark.mllib.evaluation import MulticlassMetrics
dtresrdd = dtpredicts.select("prediction", "label").rdd #convert DataFrame to RDD.
dtmm = MulticlassMetrics(dtresrdd) 
print dtmm.precision() 
print(dtmm.confusionMatrix())

0.960513326752
DenseMatrix([[ 207.,    0.,    0.,    0.,    1.,    1.,    1.,    0.,    1.,
                 1.],
             [   1.,  179.,    5.,    1.,    1.,    0.,    3.,    2.,    0.,
                 0.],
             [   0.,    6.,  193.,    1.,    0.,    0.,    0.,    2.,    0.,
                 0.],
             [   0.,    3.,    0.,  175.,    0.,    0.,    0.,    2.,    0.,
                 1.],
             [   1.,    1.,    0.,    1.,  206.,    2.,    1.,    0.,    1.,
                 2.],
             [   0.,    0.,    0.,    2.,    0.,  201.,    1.,    1.,    1.,
                 5.],
             [   1.,    0.,    0.,    0.,    0.,    0.,  188.,    0.,    0.,
                 0.],
             [   0.,    2.,    2.,    1.,    0.,    0.,    0.,  223.,    2.,
                 2.],
             [   0.,    0.,    1.,    0.,    1.,    2.,    0.,    1.,  201.,
                 1.],
             [   1.,    2.,    0.,    1.,    1.,    4.,    1.,    2.,    0.,
               17

In [141]:
# n-fold validation and the results.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(dt).setEvaluator(evaluator).setNumFolds(5)
#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5,10,15,20,25,30]).build()
#setEstimatorParamMaps() takes ParamGridBuilder().
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(pendttrain)
print cvmodel.bestModel._java_obj.getMaxDepth()
print "Accuracy : " +  str(MulticlassClassificationEvaluator().evaluate(cvmodel.bestModel.transform(pendtvalid)))

15
Accuracy : 0.960509288336


AttributeError: 'DecisionTreeClassificationModel' object has no attribute 'rdd'