In [239]:
import org.apache.spark.ml.{Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, CrossValidatorModel}
import org.apache.spark.ml.feature.{VectorAssembler, Imputer, OneHotEncoderEstimator, StringIndexer}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.functions.{sum, col, round}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.SaveMode

## Считывание данных

In [174]:
// Shared CSV reader: the first line is treated as a header and column types are inferred.
def readCsv(path: String): DataFrame =
    spark.read
        .options(Map("header" -> "true", "inferSchema" -> "true"))
        .csv(path)

// Labelled training set and unlabelled test set.
val trainDf = readCsv("../data/train.csv")
val testDf = readCsv("../data/test.csv")

trainDf = [PassengerId: int, Survived: int ... 10 more fields]
testDf = [PassengerId: int, Pclass: int ... 9 more fields]


[PassengerId: int, Pclass: int ... 9 more fields]

## Обзор данных

In [175]:
/** Prints a quick overview of a DataFrame.
  *
  * Output, in order: the row count, the schema, the first rows (default `show()`),
  * and a one-row table holding the number of null values in each column.
  *
  * @param df the DataFrame to inspect
  */
def viewDf(df: DataFrame): Unit = {
    println(s"Size = ${df.count()}")
    df.printSchema()
    df.show()
    println("Missing data:")
    // One aggregate per column: cast the isNull flag to int and sum it up.
    val nullCounts = df.columns.map(c => sum(col(c).isNull.cast("int")).alias(c))
    df.select(nullCounts: _*).show()
}

viewDf: (df: org.apache.spark.sql.DataFrame)Unit


### Тренировочные данные

In [176]:
// Overview of the training set: size, schema, sample rows and null counts per column.
viewDf(trainDf)

Size = 891
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    

### Тестовые данные

In [177]:
// Overview of the test set: size, schema, sample rows and null counts per column.
viewDf(testDf)

Size = 418
root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|34.5|    0|    0|          330911| 7.8292| null|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0|          363272|    7.0| null|       S|
|        894|     2|Myles, Mr. Thomas

Видно, что в обеих выборках имеются пропущенные данные, причем значений признака _Cabin_ существенно не хватает, поэтому в дальнейшем исключим его из выборок. Признаки _Age_ и _Fare_ являются числовыми, поэтому пропущенные значения можно заменить, например, на среднее значение каждого признака. Признак _Embarked_ строковый, и поскольку пропущенных значений немного, заполним их значением _S_.

## Заполнение пропущенных данных

### Тренировочные данные

In [178]:
// Replaces missing Age values with the column mean, writing back into the same column.
val ageImputer = new Imputer()
  .setInputCols(Array("Age"))
  .setOutputCols(Array("Age"))
  .setStrategy("mean")

// One immutable chain instead of a repeatedly reassigned var:
// impute Age, round it to whole years, then fill missing Embarked with "S".
val processedTrainDf = ageImputer
  .fit(trainDf)
  .transform(trainDf)
  .withColumn("Age", round(col("Age")))
  .na.fill("S", Seq("Embarked"))

viewDf(processedTrainDf)

Size = 891
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = false)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|   

ageImputer = imputer_82b75944743b
processedTrainDf = [PassengerId: int, Survived: int ... 10 more fields]
processedTrainDf = [PassengerId: int, Survived: int ... 10 more fields]
processedTrainDf = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

### Тестовые данные

In [179]:
// Replaces missing Age and Fare values with column means, writing back into the same columns.
val ageFareImputer = new Imputer()
  .setInputCols(Array("Age", "Fare"))
  .setOutputCols(Array("Age", "Fare"))
  .setStrategy("mean")

// The means are learned from the training data (fit on trainDf) and only applied to
// the test data, so the scoring set is preprocessed with the same statistics the model
// was trained with and no test-set statistics leak into preprocessing.
val processedTestDf = ageFareImputer
  .fit(trainDf)
  .transform(testDf)
  .withColumn("Age", round(col("Age")))      // whole years, as in the training set
  .withColumn("Fare", round(col("Fare"), 4)) // keep four decimal places

viewDf(processedTestDf)

Size = 418
root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|        892|     3|    Kelly, Mr. James|  male|35.0|    0|    0|          330911| 7.8292| null|       Q|
|        893|     3|Wilkes, Mrs. Jame...|female|47.0|    1|    0|          363272|    7.0| null|       S|
|        894|     2|Myles, Mr. Thomas

ageFareImputer = imputer_c35bd8c90ef5
processedTestDf = [PassengerId: int, Pclass: int ... 9 more fields]
processedTestDf = [PassengerId: int, Pclass: int ... 9 more fields]
processedTestDf = [PassengerId: int, Pclass: int ... 9 more fields]


[PassengerId: int, Pclass: int ... 9 more fields]

## Создание модели

In [232]:
// Hold out roughly 30% of the labelled data for evaluation. The fixed seed makes the
// split repeatable across notebook re-runs, so the reported metrics stay comparable.
val Array(train, test) = processedTrainDf.randomSplit(Array(0.7, 0.3), seed = 42L)

train = [PassengerId: int, Survived: int ... 10 more fields]
test = [PassengerId: int, Survived: int ... 10 more fields]


[PassengerId: int, Survived: int ... 10 more fields]

In [233]:
// --- Feature preparation stages ---

// String categories -> numeric indices.
val sexIndexer = new StringIndexer().setInputCol("Sex").setOutputCol("SexIndex")
val embarkedIndexer = new StringIndexer().setInputCol("Embarked").setOutputCol("EmbarkedIndex")

// One-hot encode every categorical column; handleInvalid is set to "keep".
val encoder = new OneHotEncoderEstimator()
    .setInputCols(Array("Pclass", "SexIndex", "SibSp", "Parch", "EmbarkedIndex"))
    .setOutputCols(Array("PclassVec", "SexVec", "SibSpVec", "ParchVec", "EmbarkedVec"))
    .setHandleInvalid("keep")

// Numeric columns plus the encoded vectors form the single "features" vector.
val assembler = new VectorAssembler()
    .setInputCols(Array("Age", "Fare", "PclassVec", "SexVec", "SibSpVec", "ParchVec", "EmbarkedVec"))
    .setOutputCol("features")

// --- Model, search grid and evaluator ---

val lr = new LogisticRegression().setFeaturesCol("features").setLabelCol("Survived")

// Candidate regularisation strengths tried by the cross-validator.
val lrParamGrid = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))
    .build()

val eval = new BinaryClassificationEvaluator().setLabelCol("Survived")

// --- 5-fold cross-validation over the whole pipeline ---

// Stage order matters: indexers feed the encoder, the encoder feeds the assembler,
// and the assembler feeds the classifier.
val lrStages: Array[PipelineStage] = Array(sexIndexer, embarkedIndexer, encoder, assembler, lr)
val lrPipeline = new Pipeline().setStages(lrStages)

val lrCV = new CrossValidator()
    .setEstimator(lrPipeline)
    .setEvaluator(eval)
    .setEstimatorParamMaps(lrParamGrid)
    .setNumFolds(5)

val model = lrCV.fit(train)

sexIndexer = strIdx_04da57af7458
embarkedIndexer = strIdx_cbae4957f320
encoder = oneHotEncoder_2fb83498d97f
assembler = vecAssembler_5b1bcf510eaa
lr = logreg_2d55b1500305
lrParamGrid = 
eval = binEval_39a232e39153


Array({
	logreg_2d55b1500305-regParam: 0.01
}, {
	logreg_2d55b1500305-regParam: 0.005
}, {
	logreg_2d55b1500305-regParam: 0.001
}, {
	logreg_2d55b1500305-regParam: 5.0E-4
}, {
	logreg_2d55b1500305-regParam: 1.0E-4
})
lrCV: org.apache.spark.m...


binEval_39a232e39153

## Результаты модели

In [234]:
// Highest of the cross-validation average metrics.
val bestCvMetric = model.avgMetrics.max
println(s"cross-validated areaUnderROC: $bestCvMetric")

// Same evaluator applied to the held-out split.
val heldOutMetric = eval.evaluate(model.transform(test))
println(s"test areaUnderROC: $heldOutMetric")

// The best model is the fitted pipeline; its stage at index 4 is read as the
// fitted logistic regression to report the selected regParam.
val lrPipelineModel = model.bestModel.asInstanceOf[PipelineModel]
val bestLrModel = lrPipelineModel.stages(4).asInstanceOf[LogisticRegressionModel]
println(s"regParam = ${bestLrModel.getRegParam}")

cross-validated areaUnderROC: 0.8557182998163542
test areaUnderROC: 0.8514349400947341
regParam = 0.01


lrPipelineModel = pipeline_263c2c78399f


pipeline_263c2c78399f

## Сохранение результатов

In [246]:
// Score the preprocessed test set with the cross-validated model.
val scoredDf = model.transform(processedTestDf)
val outputDf = scoredDf.select("PassengerId", "prediction")
// prediction is a double column; cast it to int and expose it as "Survived".
val castedDf = outputDf.select(col("PassengerId"), col("prediction").cast(IntegerType).as("Survived"))

castedDf
    .coalesce(1)              // small result: one partition -> a single part file in the output directory
    .write
    .mode(SaveMode.Overwrite) // re-running the cell replaces the previous answer instead of failing
    .format("csv")
    .option("header", "true")
    .save("../data/answer.csv")

scoredDf = [PassengerId: int, Pclass: int ... 20 more fields]
outputDf = [PassengerId: int, prediction: double]
castedDf = [PassengerId: int, Survived: int]


lastException: Throwable = null


[PassengerId: int, Survived: int]