# Custom Evaluators

Let's create a Mean Absolute Error (MAE) evaluator from scratch by subclassing `Evaluator` and overriding `_evaluate`:

In [2]:
from pyspark.sql.functions import * 
from pyspark.ml.evaluation import Evaluator
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredictionCol, \
    HasFeaturesCol
  
class MAEEvaluator(Evaluator, HasLabelCol, HasPredictionCol):
  """
  Evaluator that scores predictions by Mean Absolute Error (MAE):
  the average of |prediction - label| over all rows of the dataset.

  The label and prediction column names are read from the ``labelCol`` and
  ``predictionCol`` params provided by the HasLabelCol / HasPredictionCol mixins.
  """

  def setLabelCol(self, value):
    """
    Sets the name of the column that holds the true labels.

    Defined here explicitly because PySpark 3.0+ no longer generates
    setters on the shared param mixins (only the getters remain).

    :param value: name of the label column
    :return: this evaluator, so calls can be chained
    """
    return self._set(labelCol=value)

  def setPredictionCol(self, value):
    """
    Sets the name of the column that holds the model's predictions.

    :param value: name of the prediction column
    :return: this evaluator, so calls can be chained
    """
    return self._set(predictionCol=value)

  def _evaluate(self, dataset):
    """
    Evaluates the output.

    :param dataset: a dataset that contains labels/observations and predictions
    :return: metric (the mean absolute error); presumably None when the
             dataset has no rows, since the aggregate is then null -- confirm
    """
    # Imported locally under an alias so Spark's column functions are used
    # explicitly instead of depending on a wildcard import that shadows the
    # Python builtin abs().
    from pyspark.sql import functions as F

    absoluteError = F.abs(F.col(self.getPredictionCol()) - F.col(self.getLabelCol()))
    # The select yields a single row with a single aggregated column.
    return dataset.select(F.avg(absoluteError)).first()[0]

  def isLargerBetter(self):
    """
    Indicates whether the metric returned should be maximized
    (True, default) or minimized (False).

    MAE is an error measure, so smaller values are better and this returns False.
    """
    return False

Now create and train a model:

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

# Read the batting CSV; the first line is a header and column types are inferred.
baseballDF = spark.read.csv(
    "dbfs:/mnt/training/301/batting.csv",
    header=True,
    inferSchema=True,
)

# Keep only the label ("r") and the six feature columns, discard rows with
# missing values, then split 20% / 80% into test / training with a fixed seed.
testDF, trainingDF = (
    baseballDF
    .select("r", "h", "double", "triple", "hr", "rbi", "bb")
    .dropna()
    .randomSplit([0.20, 0.80], seed=42)
)
testDF.cache()
trainingDF.cache()

# Combine the six feature columns into a single "features" vector column.
vecAssembler = VectorAssembler(
    inputCols=["h", "double", "triple", "hr", "rbi", "bb"],
    outputCol="features",
)

# Linear regression with "r" as the label column.
lr = LinearRegression(labelCol="r")

# Assemble features, then fit the regression, as one pipeline on the training split.
lrPipeline = Pipeline(stages=[vecAssembler, lr])
lrPipelineModel = lrPipeline.fit(trainingDF)

# Apply the fitted pipeline to the held-out test split.
predictionsDF = lrPipelineModel.transform(testDF)

...and evaluate the results:

In [6]:
# Build the custom MAE evaluator with "r" as the label column.
# NOTE(review): the name `eval` shadows Python's builtin eval(); it is kept
# because other cells may refer to it.
eval = MAEEvaluator().setLabelCol("r")

# Last expression of the cell: the metric computed on the test-set predictions.
eval.evaluate(predictionsDF)