In [2]:
import findspark
findspark.init()

In [3]:
import pyspark
import random
sc = pyspark.SparkContext(appName="xyz")

In [4]:
rawData = sc.textFile('C:\\Pulkit\\Learning\\Spark\\Spark ML\\datasets\\wine.data')

In [5]:
rawData.take(5)

['1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065',
 '1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050',
 '1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185',
 '1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480',
 '1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735']

In [7]:
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

In [8]:
parsedData = rawData.map(parsePoint)
parsedData

PythonRDD[3] at RDD at PythonRDD.scala:52

In [9]:
parsedData.take(5)

[LabeledPoint(1.0, [14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0]),
 LabeledPoint(1.0, [13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0]),
 LabeledPoint(1.0, [13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0]),
 LabeledPoint(1.0, [14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0]),
 LabeledPoint(1.0, [13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0])]

In [11]:
(trainingData, testData) = parsedData.randomSplit([0.7,0.3] )

In [12]:
trainingData

PythonRDD[5] at RDD at PythonRDD.scala:52

In [13]:
trainingData.take(5)

[LabeledPoint(1.0, [14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0]),
 LabeledPoint(1.0, [13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0]),
 LabeledPoint(1.0, [14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0]),
 LabeledPoint(1.0, [13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0]),
 LabeledPoint(1.0, [14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0])]

In [14]:
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainingData,
                                    numClasses=4,
                                    categoricalFeaturesInfo={},
                                    impurity='gini',
                                    maxDepth=3,
                                    maxBins=32)

In [15]:
predictions = model.predict(testData.map(lambda x: x.features))
predictions.take(5)

[1.0, 1.0, 1.0, 2.0, 2.0]

In [16]:
labelsAndPredictions = testData.map(lambda lp : lp.label).zip(predictions)
labelsAndPredictions.take(5)

[(1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (1.0, 2.0), (1.0, 2.0)]

In [20]:
testAcc = labelsAndPredictions.filter(
lambda lp: lp[0] == lp[1]).count() / float(testData.count())

print('Test Accuracy = ', testAcc)

Test Accuracy =  0.9074074074074074


In [22]:
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(labelsAndPredictions)

In [24]:
metrics.accuracy

0.9074074074074074

In [25]:
metrics.precision(1.0)

0.8

In [26]:
metrics.precision(3.0)

1.0

In [28]:
metrics.confusionMatrix().toArray()

array([[12.,  0.,  0.],
       [ 3., 17.,  0.],
       [ 0.,  2., 20.]])

In [30]:
print(model.toDebugString())

DecisionTreeModel classifier of depth 3 with 15 nodes
  If (feature 12 <= 745.0)
   If (feature 11 <= 2.19)
    If (feature 3 <= 16.05)
     Predict: 2.0
    Else (feature 3 > 16.05)
     Predict: 3.0
   Else (feature 11 > 2.19)
    If (feature 6 <= 0.51)
     Predict: 3.0
    Else (feature 6 > 0.51)
     Predict: 2.0
  Else (feature 12 > 745.0)
   If (feature 6 <= 2.1550000000000002)
    If (feature 1 <= 1.62)
     Predict: 2.0
    Else (feature 1 > 1.62)
     Predict: 3.0
   Else (feature 6 > 2.1550000000000002)
    If (feature 0 <= 13.04)
     Predict: 2.0
    Else (feature 0 > 13.04)
     Predict: 1.0

