In [89]:
import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, 
                                           DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.feature.{VectorAssembler, ChiSqSelector, ChiSqSelectorModel}

## Загрузка данных

In [2]:
val df = spark
    .read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../data/ml_dataset.csv")
val Array(train, test) = df.randomSplit(Array(0.7, 0.3))

df = [feature_0: double, feature_1: double ... 99 more fields]
train = [feature_0: double, feature_1: double ... 99 more fields]
test = [feature_0: double, feature_1: double ... 99 more fields]


[feature_0: double, feature_1: double ... 99 more fields]

## Построение _Pipeline_

In [95]:
// Объединение всех столбцов (кроме label) в один столбец features
val assembler = new VectorAssembler()
  .setInputCols(df.columns.slice(0, df.columns.size - 1))
  .setOutputCol("features")

// Выбор наиболее релевантных признаков (столбец bestFeatures)
val selector = new ChiSqSelector()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("bestFeatures")

// Логистическая регрессия
val lr = new LogisticRegression()
    .setFeaturesCol("bestFeatures")
    .setLabelCol("label")

// Дерево принятия решений
val dt = new DecisionTreeClassifier()
    .setFeaturesCol("bestFeatures")
    .setLabelCol("label")

// Объявим пустой pipeline, поскольку параметр stages будет
// добавлен как часть param grid. 
val pipeline = new Pipeline()

// Посртроим массив stages и param grid для каждого pipeline 
val dtPipeline = Array[PipelineStage](assembler, selector, dt)
val lrPipeline = Array[PipelineStage](assembler, selector, lr)

val lrParamGrid = new ParamGridBuilder()
    .baseOn(pipeline.stages -> lrPipeline)
    .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))
    .build()

val dtParamGrid = new ParamGridBuilder()
    .baseOn(pipeline.stages -> dtPipeline)
    .addGrid(selector.numTopFeatures, (5 to 100 by 5).toArray)
    .addGrid(dt.maxDepth, Array(5, 10, 20, 25, 30))
    .build()

// Поскольку ParamGridBuilder строит массив со всеми комбинациями 
// значений параметров, то можно объединить получившиеся массивы,
// чтобы использовать param grid в кросс-валидации.
val paramGrid = lrParamGrid ++ dtParamGrid

// Кросс-валидация. В качестве Evaluator используем 
// BinaryClassificationEvaluator, чтобы сравнивать алгоритмы
// с помощью метрики качества AUC.
val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(5)
  .setParallelism(2)

// Обучим pipeline на тренировочных данных
val bestModel = cv.fit(train)

assembler = vecAssembler_3e4ce6015641
selector = chiSqSelector_0dc70a08c59b
lr = logreg_93aac47a6711
dt = dtc_d55867357d5c
pipeline = pipeline_914e39562cf3
dtPipeline = Array(vecAssembler_3e4ce6015641, chiSqSelector_0dc70a08c59b, dtc_d55867357d5c)
lrPipeline = Array(vecAssembler_3e4ce6015641, chiSqSelector_0dc70a08c59b, logreg_93aac47a6711)
lrParamGrid = 


Array({
	chiSqSelector_0dc70a08c59b-numTopFeatures: 5,
	logreg_93aac47a6711-regPa...


### Параметры после кросс-валидации
Лучшим в данном случае оказалось Дерево принятия решений

In [97]:
val bestPipelineModel = bestModel.bestModel.asInstanceOf[PipelineModel]
println("numTopFeatures = " + bestPipelineModel.stages(1)
        .asInstanceOf[ChiSqSelectorModel].getNumTopFeatures)
println("maxDepth = " + bestPipelineModel.stages(2)
        .asInstanceOf[DecisionTreeClassificationModel].getMaxDepth)

numTopFeatures = 100
maxDepth = 20


bestPipelineModel = pipeline_914e39562cf3


lastException: Throwable = null


pipeline_914e39562cf3

### Значение AUC

In [98]:
println("cross-validated areaUnderROC: " + bestModel.avgMetrics.max)
println("test areaUnderROC: " + eval.evaluate(bestModel.transform(test)))

cross-validated areaUnderROC: 0.9015878663014567
test areaUnderROC: 0.8874074084056929


## Кросс-валидация для каждого классификатора
Запустим кросс-валидацию для каждого pipeline в отдельности и сравним результаты

In [30]:
val tree = new DecisionTreeClassifier()
    .setFeaturesCol("bestFeatures")
    .setLabelCol("label")

val treeParamGrid = new ParamGridBuilder()
    .addGrid(selector.numTopFeatures, (5 to 100 by 5).toArray)
    .addGrid(tree.maxDepth, Array(5, 10, 20, 25, 30))
    .build()

val treeCV = new CrossValidator()
  .setEstimator(new Pipeline().setStages(Array(assembler, selector, tree)))
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(treeParamGrid)
  .setNumFolds(5)

val treeCVModel = treeCV.fit(train)

println("cross-validated areaUnderROC: " + treeCVModel.avgMetrics.max)
println("test areaUnderROC: " + eval.evaluate(treeCVModel.transform(test)))

cross-validated areaUnderROC: 0.9015878663014567
test areaUnderROC: 0.8874074084056929


tree = dtc_45c25a976099
treeParamGrid = 


lastException: Throwable = null
Array({
	dtc_45c25a976099-maxDepth: 5,
	chiSqSelector_6792f86bd77b-numTopFeatures: 5
}, {
	dtc_45c25a976099-maxDepth: 10,
	chiSqSelector_6792f86bd77b-numTopFeatures: 5
}, {
	dtc_45c25a976099-maxDepth: 20,
	chiSqSelector_6792f86bd77b-numTopFeatures: 5
}, {
	dtc_45c25a976099-maxDepth: 25,
	chiSqSelector_6792f86bd77b-numTopFeatures: 5
}, {
	dtc_45c25a976099-maxDepth: 30,
	chiSqSelector_6792f86bd77b-numTopFeatures: 5
}, {
	dtc_45c25a976099-maxDepth: 5,
	chiSqSelector_6792f86bd77b-numTopFeatures: 10
}, {
	dtc_45c25a976099-maxDepth: 10,
	chiSqSelector_6792f86bd77b-numTopFeatures: 10
}, {
	dtc_45c25a976099-maxDepth: 20,
	chiSqSelector_6792f86bd77b-numTop...


In [92]:
val treePipelineModel = treeCVModel.bestModel.asInstanceOf[PipelineModel]
println("numTopFeatures = " + treePipelineModel.stages(1)
        .asInstanceOf[ChiSqSelectorModel].getNumTopFeatures)
println("maxDepth = " + treePipelineModel.stages(2)
        .asInstanceOf[DecisionTreeClassificationModel].getMaxDepth)

numTopFeatures = 100
maxDepth = 20


treePipelineModel = pipeline_07a0f0470f33


pipeline_07a0f0470f33

In [103]:
val lr1 = new LogisticRegression()
    .setFeaturesCol("bestFeatures")
    .setLabelCol("label")

val lrParamGrid1 = new ParamGridBuilder()
    .addGrid(selector.numTopFeatures, (5 to 100 by 5).toArray)
    .addGrid(lr1.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))
    .build()

val lrCV = new CrossValidator()
    .setEstimator(new Pipeline().setStages(Array(assembler, selector, lr1)))
    .setEvaluator(new BinaryClassificationEvaluator)
    .setEstimatorParamMaps(lrParamGrid1)
    .setNumFolds(5)

val lrCVModel = lrCV.fit(train)

println("cross-validated areaUnderROC: " + lrCVModel.avgMetrics.max)
println("test areaUnderROC: " + eval.evaluate(lrCVModel.transform(test)))

cross-validated areaUnderROC: 0.8654721967204286
test areaUnderROC: 0.8582410281221541


lr1 = logreg_9f24d207d879
lrParamGrid1 = 


Array({
	chiSqSelector_0dc70a08c59b-numTopFeatures: 5,
	logreg_9f24d207d879-regParam: 0.01
}, {
	chiSqSelector_0dc70a08c59b-numTopFeatures: 5,
	logreg_9f24d207d879-regParam: 0.005
}, {
	chiSqSelector_0dc70a08c59b-numTopFeatures: 5,
	logreg_9f24d207d879-regParam: 0.001
}, {
	chiSqSelector_0dc70a08c59b-numTopFeatures: 5,
	logreg_9f24d207d879-regParam: 5.0E-4
}, {
	chiSqSelector_0dc70a08c59b-numTopFeatures: 5,
	logreg_9f24d207d879-regParam: 1.0E-4
}, {
	chiSqSelector_0dc70a08c59b-numTopFeatures: 10,
	logreg_9f24d207d879-regParam: 0.01
}, {
	chiSqSelector_0dc70a08c59b-numTopFeatures: 10,
	logreg_9f24d207d879-regParam: 0.005
}, {
	chiSqSelector_0dc70a08c5...


In [104]:
val lrPipelineModel = lrCVModel.bestModel.asInstanceOf[PipelineModel]
println("numTopFeatures = " + lrPipelineModel.stages(1)
        .asInstanceOf[ChiSqSelectorModel].getNumTopFeatures)
println("regParam = " + lrPipelineModel.stages(2)
        .asInstanceOf[LogisticRegressionModel].getRegParam)

numTopFeatures = 40
regParam = 0.005


pipeline_34b0a0d5f944

lrPipelineModel = pipeline_34b0a0d5f944


Видно, что у дерева принятия решений значение _AUC_ на тестовой выборке больше, чем у линейной регрессии. Но разница со значением _AUC_ на тренировочных данных может говорить о переобученности дерева.