## Spark ML GBTClassifier + Pipeline

In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark import StorageLevel

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Spark ML") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

---
## Pipelines

- http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml
- 복잡한 ML 과정들을 파이프라인으로 모듈화 시킬 수 있도록 도와주는 패키지

#### Transformer

- DataFrame을 **lazily** 하게 또 다른 DataFrame으로 변형, `transform()` 메서드 구현
- Feature Engineering에 필요한 알고리즘들뿐만 아니라, 이미 학습이 끝난 Model도 이에 해당

---
#### Estimator

- DataFrame을 model에 fitting 시키는 단계, 학습시키는 알고리즘이 모두 이에 해당
- 예를 들면 `LogisticRegression`은 `Estimator`에 해당
- `fit()` 함수를 호출하여 생성된 `LogisticRegressionModel`은 `Model`이자 `Transformer`

---
#### Pipeline

- ML을 돌리기 위해 필요한 stage를 연결시킨 구현체
- `Transformer`, `Estimator`가 Pipeline의 각 stage에 해당
- `Pipeline.fit()`을 호출하면 각 단계에서 지정한 함수를 순서대로 호출
- 마찬가지로 `PipelineModel`은 `fit()` 함수를 호출하여 생성된 `Model`

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

from pyspark.ml.pipeline import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler, Imputer, StringIndexer
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
# K-Fold value
fold = 3

# Read train, test dataset
inputCols = [
    'Pclass', 'age_im', 'SibSp', 'Parch', 'Fare', 'embarked_ix', 'sex_ix',
    'len_name'
]
str_length = udf(lambda x: len(x), IntegerType())

train = spark.read \
    .csv("../dataset/train.csv", header=True, inferSchema=True) \
    .withColumnRenamed("Survived", "label") \
    .withColumn('len_name', str_length(col('name'))) \
    .na.drop(subset=["Embarked", "Fare"]) \
    .cache()

test = spark.read \
    .csv("../dataset/test.csv", header=True, inferSchema=True) \
    .withColumnRenamed("Survived", "label") \
    .withColumn('len_name', str_length(col('name'))) \
    .na.drop(subset=["Embarked", "Fare"]) \
    .cache()

In [5]:
# Define operators
imputer = Imputer(inputCols=['Age'], outputCols=['age_im'], strategy='mean')
sex_ix = StringIndexer(inputCol='Sex', outputCol='sex_ix')
embarked_ix = StringIndexer(inputCol='Embarked', outputCol='embarked_ix')
assembler = VectorAssembler(inputCols=inputCols, outputCol='features')
scaler = StandardScaler(inputCol='features',
                        outputCol='scaled_features',
                        withStd=True,
                        withMean=False)
model = GBTClassifier(labelCol='label',
                      featuresCol='scaled_features',
                      cacheNodeIds=True)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='label',
                                              metricName='accuracy')

# Pipeline
print("Make pipeline, model fitting...")
pipeline = Pipeline(
    stages=[imputer, sex_ix, embarked_ix, assembler, scaler, model])

# K-Fold Cross-validation with Parameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(model.maxDepth, [5, 7]) \
    .addGrid(model.maxIter, [20, 40]) \
    .addGrid(model.maxBins, [25]) \
    .addGrid(model.stepSize, [0.025]) \
    .addGrid(model.subsamplingRate, [0.7]) \
    .build()

# Fold 3 * Param 4 = 12
cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid,
                    numFolds=fold)

# Model training
cvModel = cv.fit(train)
bestModel = cvModel.bestModel

print("Model training finished!")
print("Cross-validation average score : {}".format(cvModel.avgMetrics[0]))
print("Best maxDepth parameters : {}".format(
    bestModel.stages[5]._java_obj.getMaxDepth()))
print("Best maxIter parameters : {}".format(
    bestModel.stages[5]._java_obj.getMaxIter()))
print("Best maxBins parameters : {}".format(
    bestModel.stages[5]._java_obj.getMaxBins()))

Make pipeline, model fitting...
Model training finished!
Cross-validation average score : 0.8117291437383991
Best maxDepth parameters : 5
Best maxIter parameters : 40
Best maxBins parameters : 25


In [6]:
# Save prediction result
predict = bestModel.transform(test)
predict.select("PassengerId", "prediction") \
    .coalesce(1).write.mode("overwrite") \
    .option("compression", "gzip") \
    .csv("../dataset/pred_titanic.csv.gzip", sep=",", header=True)

print("Save to csv finished!")

Save to csv finished!


In [7]:
# Feature Importance
featureImportance = bestModel.stages[-1].featureImportances.toArray()
print("Feature importance:\n{}\n".format("\n".join(
    map(lambda x: "{} = {}".format(str(x[0]), str(x[1])),
        zip(inputCols, featureImportance)))))

Feature importance:
Pclass = 0.0635219203849
age_im = 0.196579680469
SibSp = 0.0553845897311
Parch = 0.0267508377674
Fare = 0.240002914139
embarked_ix = 0.064995764243
sex_ix = 0.088147713686
len_name = 0.264616579579



In [8]:
spark.stop()