In [1]:
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
# Wrap the SparkContext (`sc`, expected to be supplied by the PySpark kernel)
# in a SQLContext so that we can work with DataFrames.
sqlContext = SQLContext(sc)

# Read the saved predictions CSV through the spark-csv data source. The first
# line of the file is used as the header and column types are inferred.
predictions = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('file:///home/cloudera/Downloads/big-data-4/predictions.csv')
)

In [4]:
# Score the predictions with a MulticlassClassificationEvaluator, comparing
# the 'prediction' column against the 'label' column.
#
# NOTE(review): the metric requested is 'precision' but the result is reported
# as accuracy. In the Spark 1.x evaluator this metric appears to be the overall
# precision, which coincides with accuracy; later Spark releases expose it as
# metricName='accuracy' instead -- confirm against the Spark version in use.

evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='precision'
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % accuracy)

Accuracy = 0.809524 


In [5]:
# A confusion matrix for the classifier can be produced with the
# MulticlassMetrics class. Unlike MulticlassClassificationEvaluator, it
# operates on an RDD of numbers rather than on a DataFrame, so the
# predictions DataFrame has to be converted to an RDD first.

# The DataFrame's rdd attribute gives an RDD whose elements are Row objects:

prediction_rows = predictions.rdd
prediction_rows.take(2)

[Row(prediction=1.0, label=1.0), Row(prediction=1.0, label=1.0)]

In [6]:
# Applying tuple to every Row turns the RDD of Rows into an RDD of plain
# numeric tuples:

prediction_tuples = predictions.rdd.map(tuple)
prediction_tuples.take(2)

[(1.0, 1.0), (1.0, 1.0)]

In [7]:
# Let's create an instance of MulticlassMetrics from the predictions.
#
# MulticlassMetrics expects an RDD of (prediction, label) pairs. The two
# columns are selected by name, in that order, so the result does not depend
# on the column order of the CSV file or on any extra columns it may contain.

metrics = MulticlassMetrics(
    predictions.select('prediction', 'label').rdd.map(tuple))

In [8]:
# confusionMatrix() returns a Spark Matrix; toArray() converts it to a NumPy
# array, which is then transposed for viewing.
# NOTE(review): per the Spark API docs the untransposed matrix has predicted
# classes in columns, so after transposing each row should correspond to a
# predicted class and each column to an actual class -- confirm.

confusion = metrics.confusionMatrix()
confusion.toArray().T

array([[ 87.,  26.],
       [ 14.,  83.]])