In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

## Create Data Frame

In [2]:
#Load the data and create an RDD (16 pixels and label)
pen_raw = sc.textFile("../Data/penbased.dat", 4).map(lambda x:  x.split(", ")).map(lambda row: [float(x) for x in row])
pen_raw.take(1)

[[47.0,
  100.0,
  27.0,
  81.0,
  57.0,
  37.0,
  26.0,
  0.0,
  0.0,
  23.0,
  56.0,
  53.0,
  100.0,
  90.0,
  40.0,
  98.0,
  8.0]]

In [3]:
#Create a DataFrame
from pyspark.sql.types import *
from pyspark.sql import Row
penschema = StructType([
    StructField("pix1",DoubleType(),True),
    StructField("pix2",DoubleType(),True),
    StructField("pix3",DoubleType(),True),
    StructField("pix4",DoubleType(),True),
    StructField("pix5",DoubleType(),True),
    StructField("pix6",DoubleType(),True),
    StructField("pix7",DoubleType(),True),
    StructField("pix8",DoubleType(),True),
    StructField("pix9",DoubleType(),True),
    StructField("pix10",DoubleType(),True),
    StructField("pix11",DoubleType(),True),
    StructField("pix12",DoubleType(),True),
    StructField("pix13",DoubleType(),True),
    StructField("pix14",DoubleType(),True),
    StructField("pix15",DoubleType(),True),
    StructField("pix16",DoubleType(),True),
    StructField("label",DoubleType(),True)
])

dfpen = ss.createDataFrame(pen_raw.map(lambda x : Row(x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15],x[16])), penschema)

In [4]:
dfpen.show()

+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| pix1| pix2|pix3| pix4| pix5| pix6| pix7| pix8| pix9|pix10|pix11|pix12|pix13|pix14|pix15|pix16|label|
+-----+-----+----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
| 47.0|100.0|27.0| 81.0| 57.0| 37.0| 26.0|  0.0|  0.0| 23.0| 56.0| 53.0|100.0| 90.0| 40.0| 98.0|  8.0|
|  0.0| 89.0|27.0|100.0| 42.0| 75.0| 29.0| 45.0| 15.0| 15.0| 37.0|  0.0| 69.0|  2.0|100.0|  6.0|  2.0|
|  0.0| 57.0|31.0| 68.0| 72.0| 90.0|100.0|100.0| 76.0| 75.0| 50.0| 51.0| 28.0| 25.0| 16.0|  0.0|  1.0|
|  0.0|100.0| 7.0| 92.0|  5.0| 68.0| 19.0| 45.0| 86.0| 34.0|100.0| 45.0| 74.0| 23.0| 67.0|  0.0|  4.0|
|  0.0| 67.0|49.0| 83.0|100.0|100.0| 81.0| 80.0| 60.0| 60.0| 40.0| 40.0| 33.0| 20.0| 47.0|  0.0|  1.0|
|100.0|100.0|88.0| 99.0| 49.0| 74.0| 17.0| 47.0|  0.0| 16.0| 37.0|  0.0| 73.0| 16.0| 20.0| 20.0|  6.0|
|  0.0|100.0| 3.0| 72.0| 26.0| 35.0| 85.0| 35.0|100.0| 71.0| 73.0| 97.0| 

## Create a data frame includes "feature" and "label"

In [5]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(outputCol="features", inputCols=dfpen.columns[0:-1]) #except the last col.
penlpoints = va.transform(dfpen).select("features", "label")

In [6]:
penlpoints.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[47.0,100.0,27.0,...|  8.0|
|[0.0,89.0,27.0,10...|  2.0|
|[0.0,57.0,31.0,68...|  1.0|
|[0.0,100.0,7.0,92...|  4.0|
|[0.0,67.0,49.0,83...|  1.0|
|[100.0,100.0,88.0...|  6.0|
|[0.0,100.0,3.0,72...|  4.0|
|[0.0,39.0,2.0,62....|  0.0|
|[13.0,89.0,12.0,5...|  5.0|
|[74.0,87.0,31.0,1...|  9.0|
|[48.0,96.0,62.0,6...|  8.0|
|[100.0,100.0,72.0...|  5.0|
|[91.0,74.0,54.0,1...|  9.0|
|[0.0,85.0,38.0,10...|  7.0|
|[35.0,76.0,57.0,1...|  3.0|
|[50.0,84.0,66.0,1...|  3.0|
|[99.0,80.0,63.0,1...|  9.0|
|[24.0,66.0,43.0,1...|  2.0|
|[0.0,73.0,19.0,99...|  2.0|
|[12.0,77.0,20.0,6...|  5.0|
+--------------------+-----+
only showing top 20 rows



## Create Training and Test data.

In [7]:
# Create Training and Test data.
pendtsets = penlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

## Train the decision tree model

In [8]:
# Train the data.
from pyspark.ml.classification import DecisionTreeClassifier
# Paramenters
#maxDepth : maximum tree depth (default : 5).
#maxBins : maximum number of bins when binning continuous features (default : 32).
#minInstancesPerNode : minimum number of dataset samples each branch needs to have after a split (default : 1).
#minInfoGain : minimum information gain for a split (default : 0).
dt = DecisionTreeClassifier(maxDepth=20, maxBins= 32, minInstancesPerNode=1, minInfoGain = 0)
dtmodel = dt.fit(pendttrain)

In [9]:
print(dtmodel._call_java('toDebugString'))

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_58fa3e97326b) of depth 18 with 589 nodes
  If (feature 15 <= 51.5)
   If (feature 4 <= 41.5)
    If (feature 9 <= 17.5)
     If (feature 14 <= 63.5)
      If (feature 0 <= 16.5)
       Predict: 1.0
      Else (feature 0 > 16.5)
       If (feature 5 <= 31.5)
        Predict: 4.0
       Else (feature 5 > 31.5)
        Predict: 6.0
     Else (feature 14 > 63.5)
      If (feature 12 <= 55.5)
       If (feature 0 <= 38.5)
        Predict: 1.0
       Else (feature 0 > 38.5)
        Predict: 8.0
      Else (feature 12 > 55.5)
       If (feature 13 <= 8.5)
        Predict: 2.0
       Else (feature 13 > 8.5)
        If (feature 4 <= 31.5)
         Predict: 4.0
        Else (feature 4 > 31.5)
         If (feature 0 <= 0.5)
          Predict: 7.0
         Else (feature 0 > 0.5)
          Predict: 6.0
    Else (feature 9 > 17.5)
     If (feature 1 <= 99.5)
      If (feature 13 <= 59.5)
       If (feature 9 <= 62.5)
        If (feature 14 

## Test the model

In [10]:
#Test data.
dtpredicts = dtmodel.transform(pendtvalid)

In [11]:
dtpredicts.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.0,18.0,27....|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,34.0,25.0,52...|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,39.0,42.0,52...|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,42.0,10.0,60...|  1.0|[0.0,16.0,0.0,0.0...|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,44.0,36.0,58...|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,45.0,27.0,61...|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,48.0,28.0,64...|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,51.0,31.0,60...|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,52.0,33.0,67...|  1.0|[0.0,405.0,0.0,0....|[0.0,1.0,0.0,0.0,...|       1.0|
|[0.0,53.0,36.0,

## Evaluate the model
available metrics : https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html

In [12]:
from pyspark.mllib.evaluation import MulticlassMetrics

#prediction and label
prediction_label = dtpredicts.select("prediction", "label").rdd

metrics = MulticlassMetrics(prediction_label)

precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
confusionMetrics = metrics.confusionMatrix()

print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
print("Confusion Metrics = \n%s" % confusionMetrics)

Summary Stats
Precision = 0.9596814335490294
Recall = 0.9596814335490294
F1 Score = 0.9596814335490294
Weighted recall = 0.9596814335490295
Weighted precision = 0.9598341699466577
Weighted F(1) Score = 0.9596961487461645
Weighted F(0.5) Score = 0.9597641689474713
Weighted false positive rate = 0.00445923225347843
Confusion Metrics = 
DenseMatrix([[202.,   0.,   0.,   0.,   3.,   0.,   1.,   0.,   4.,   0.],
             [  1., 196.,   3.,   0.,   0.,   2.,   0.,   3.,   0.,   1.],
             [  0.,   7., 206.,   0.,   2.,   0.,   1.,   0.,   0.,   0.],
             [  0.,   2.,   0., 178.,   0.,   5.,   0.,   0.,   0.,   1.],
             [  0.,   0.,   0.,   0., 197.,   0.,   0.,   0.,   0.,   4.],
             [  0.,   0.,   0.,   2.,   1., 178.,   0.,   1.,   0.,   5.],
             [  0.,   3.,   0.,   0.,   0.,   0., 202.,   0.,   0.,   0.],
             [  0.,   1.,   1.,   2.,   0.,   0.,   1., 192.,   1.,   1.],
             [  4.,   0.,   0.,   0.,   0.,   0.,   0.,   2., 18

## N-fold validation
cross-validation : https://spark.apache.org/docs/latest/ml-tuning.html#cross-validation


In [13]:
# n-fold validation and the results.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()
#ParamGridBuilder() – combinations of parameters and their values.
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5,10,15,20,25,30]).build()
cv = CrossValidator(estimator=dt, evaluator=evaluator, numFolds=5, estimatorParamMaps=paramGrid)

cvmodel = cv.fit(pendttrain)
print("Best Max Depth : %s" % cvmodel.bestModel._java_obj.getMaxDepth())
print("Accuracy : %s" % MulticlassClassificationEvaluator().evaluate(cvmodel.bestModel.transform(pendtvalid)))

Best Max Depth : 20
Accuracy : 0.9596961487461645
