In [1]:
# Locate the local Spark installation so that pyspark becomes importable;
# this has to run before anything is imported from pyspark.
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build the session with getOrCreate() so that re-running this cell reuses the
# live session instead of failing on a second SparkContext(...) constructor call.
# Master and application name are the same values as before.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-shell")
    .getOrCreate()
)

# Expose the underlying context under the name `sc`.
sc = spark.sparkContext

# Ensembles & Pipelines

## Pipeline

Pipelines will seriously streamline your workflow. They will also help to ensure that training and testing data are treated consistently and that no leakage of information between these two sets takes place.

A pipeline is a mechanism to combine a series of steps rather than applying each of the steps individually.

### Flight duration model: Pipeline stages

In [6]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as F

# Factor used to convert the "mile" column into kilometres.
KM_PER_MILE = 1.60934

# "NA" entries in the CSV are read as nulls; column types are inferred from the data.
flights = spark.read.csv("flights.csv", sep=",", header=True, inferSchema=True, nullValue="NA")

# Replace "mile" with a "km" column rounded to 0 decimals.
# F.round is used rather than importing `round` by name, which would shadow
# Python's built-in round() for the rest of the notebook.
flights = flights.withColumn("km", F.round(flights.mile * KM_PER_MILE, 0)).drop("mile")

# Pipeline stages, in the order they will be applied:
# 1. index the "org" column into "org_idx",
# 2. one-hot encode "org_idx" and "dow",
# 3. assemble "km" and the encoded columns into a single "features" vector,
# 4. linear regression with "duration" as the label.
indexer = StringIndexer(inputCol="org", outputCol="org_idx")
onehot = OneHotEncoder(inputCols=["org_idx", "dow"], outputCols=["org_dummy", "dow_dummy"])
assembler = VectorAssembler(inputCols=["km", "org_dummy", "dow_dummy"], outputCol="features")
regression = LinearRegression(labelCol="duration")

### Flight duration model: Pipeline model



In [8]:
from pyspark.ml import Pipeline

# 80/20 train/test split with a fixed seed so the split is repeatable.
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=13)

# Fit every stage on the training split only; after this statement `pipeline`
# holds the fitted model, not the unfitted Pipeline.
pipeline = Pipeline(
    stages=[indexer, onehot, assembler, regression],
).fit(flights_train)

# Apply the fitted stages to the held-out split.
predictions = pipeline.transform(flights_test)

In [12]:
# First five rows, with column contents shown in full (no truncation).
predictions.select("features", "duration", "prediction").show(n=5, truncate=False)

+------------------------------+--------+------------------+
|features                      |duration|prediction        |
+------------------------------+--------+------------------+
|(14,[0,3,10],[4162.0,1.0,1.0])|385     |377.34661750108603|
|(14,[0,3,10],[3983.0,1.0,1.0])|379     |364.04381688061255|
|(14,[0,1,10],[1180.0,1.0,1.0])|130     |131.59537849620068|
|(14,[0,3,10],[2570.0,1.0,1.0])|230     |259.0334410329645 |
|(14,[0,1,10],[378.0,1.0,1.0]) |64      |71.99288633072742 |
+------------------------------+--------+------------------+
only showing top 5 rows



### SMS spam pipeline

In [56]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

# Column flow through the stages: text -> words -> terms -> hash -> features.
tokenizer = Tokenizer(outputCol="words", inputCol="text")
remover = StopWordsRemover(outputCol="terms", inputCol="words")
hasher = HashingTF(outputCol="hash", inputCol="terms")
idf = IDF(outputCol="features", inputCol="hash")

# Classifier left at its default settings.
logistic = LogisticRegression()

pipeline = Pipeline(
    stages=[tokenizer, remover, hasher, idf, logistic],
)

In [57]:
from pyspark.sql.functions import regexp_replace

# Load the messages and normalise the "text" column in four passes:
# punctuation -> space, digits -> space, runs of spaces -> one space, "I" -> "i".
# The column is addressed by name so each pass sees the previous pass's result.
sms = (
    spark.read.csv("sms.csv", sep=";", header=True, inferSchema=True)
    .withColumn("text", regexp_replace("text", '[_():;,.!?\\-]', " "))
    .withColumn("text", regexp_replace("text", "[0-9]", " "))
    .withColumn("text", regexp_replace("text", " +", " "))
    .withColumn("text", regexp_replace("text", "I", "i"))
)

# 80/20 split with a fixed seed.
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# NOTE: `pipeline` is rebound from the unfitted Pipeline to the fitted model here,
# so the cell that builds the pipeline must be re-run before re-running this one.
pipeline = pipeline.fit(sms_train)
predictions = pipeline.transform(sms_test)

In [58]:
# Peek at the first five test messages with their true label and the prediction.
predictions.select("text", "label", "prediction").show(n=5)

+--------------------+-----+----------+
|                text|label|prediction|
+--------------------+-----+----------+
|Dont worry i gues...|    0|       0.0|
|Ok lar Joking wif...|    0|       0.0|
|WiNNER As a value...|    1|       1.0|
|England v Macedon...|    1|       0.0|
|is that seriously...|    0|       0.0|
+--------------------+-----+----------+
only showing top 5 rows



In [60]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator defaults written out explicitly: RMSE between "label" and "prediction".
# NOTE(review): with 0/1 labels and predictions this equals
# sqrt(misclassification rate); a classification evaluator may have been
# intended here -- confirm.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="rmse",
)
evaluator.evaluate(predictions)

0.1837365616517062

## Cross-Validation

A simple train/test split has one major drawback: it gives only a single estimate of the model performance. You would have a more robust idea of how well a model works if you were able to test it multiple times. This is the idea behind cross-validation: split the **training data** into folds, hold one fold out for evaluation and train on the remaining folds, then repeat the process once per fold so that every fold is held out exactly once. Then you can calculate the average of the evaluation metric over all folds, which is a much more robust measure of model performance than a single value.

You need CrossValidator and ParamGridBuilder from pyspark.ml.tuning for this process.

### Cross validating simple flight duration model

Cross-validation provides a much better way to evaluate model performance.

In [31]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Single-predictor setup: "km" is the only input to the "features" vector.
assembler = VectorAssembler(inputCols=["km"], outputCol="features")
flights_km = assembler.transform(flights.select("km", "duration"))

flights_train, flights_test = flights_km.randomSplit([0.8, 0.2], seed=13)

# No grids are added, so cross-validation here only estimates performance of
# the default model; nothing is tuned.
params = ParamGridBuilder().build()
regression = LinearRegression(labelCol="duration")
evaluator = RegressionEvaluator(labelCol="duration")

# Five-fold cross-validation on the training split; `cv` ends up holding the
# fitted cross-validator model.
cv = CrossValidator(
    estimator=regression,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
).fit(flights_train)

prediction = cv.transform(flights_test)
prediction.show()

+-----+--------+--------+------------------+
|   km|duration|features|        prediction|
+-----+--------+--------+------------------+
|108.0|      43| [108.0]|52.547663297131706|
|108.0|      44| [108.0]|52.547663297131706|
|108.0|      46| [108.0]|52.547663297131706|
|108.0|      46| [108.0]|52.547663297131706|
|108.0|      46| [108.0]|52.547663297131706|
|108.0|      47| [108.0]|52.547663297131706|
|108.0|      47| [108.0]|52.547663297131706|
|108.0|      47| [108.0]|52.547663297131706|
|108.0|      47| [108.0]|52.547663297131706|
|108.0|      48| [108.0]|52.547663297131706|
|108.0|      49| [108.0]|52.547663297131706|
|108.0|      49| [108.0]|52.547663297131706|
|108.0|      49| [108.0]|52.547663297131706|
|108.0|      49| [108.0]|52.547663297131706|
|108.0|      51| [108.0]|52.547663297131706|
|108.0|      53| [108.0]|52.547663297131706|
|108.0|      53| [108.0]|52.547663297131706|
|124.0|      39| [124.0]| 53.75725524710285|
|124.0|      39| [124.0]| 53.75725524710285|
|124.0|   

In [35]:
# Cross-validated metric for each parameter map in the grid (one entry here).
cv.avgMetrics

[17.027105424959803]

In [36]:
# Score the cross-validated model on the held-out test split with the same evaluator.
evaluator.evaluate(cv.transform(flights_test))

17.518010870137793

### Cross validating flight duration model pipeline


In [41]:
# Stages for a model on "km" plus the one-hot encoded "org" column.
indexer = StringIndexer(inputCol="org", outputCol="org_idx")
onehot = OneHotEncoder(inputCols=["org_idx"], outputCols=["org_dummy"])
assembler = VectorAssembler(inputCols=["km", "org_dummy"], outputCol="features")

# `regression`, `params` and `evaluator` are the objects created in the
# earlier simple cross-validation cell. numFolds is not set here, so
# CrossValidator's default applies.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
)

# Split the raw flights data: the pipeline does the feature preparation itself.
flights_train, flights_test = flights.randomSplit([0.8, 0.2], 13)

cv = cv.fit(flights_train)
print(cv.avgMetrics)

# Transform the test split once and score that result, instead of running the
# fitted pipeline over the test data a second time.
prediction = cv.transform(flights_test)
print(evaluator.evaluate(prediction))

[11.271797559154411]
11.012777355873434


## Grid Search

Models can be improved by choosing better model parameters. The optimal choice of parameters will depend on the data and the modeling goal. For example, fitIntercept can be set to True or False. It is better to make this comparison in an automated way, using cross-validation.

You can systematically evaluate a model across a grid of parameter values using a technique known as grid search. First you create a grid builder and then you add one or more grids, such as one for fitIntercept. Call the build() method to construct the grid. After creating a cross-validator object and fitting it to the training data, you can retrieve the best model using the bestModel attribute of cv. But this is not necessary, since the cv object will behave like the best model, so you can use it directly to make predictions on the testing data. You can retrieve the best parameter value by using the explainParam method.

The more parameters and values you add to the grid, the more models you have to evaluate.

### Optimizing flights linear regression


In [47]:
# Grid over the regression's regularisation strength and elastic-net mixing:
# 4 regParam values x 3 elasticNetParam values.
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1, 10])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
print("Number of models to be tested: ", len(params))

# Five-fold cross-validation of the whole pipeline over that grid (not fitted yet).
cv = CrossValidator(
    numFolds=5,
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
)


Number of models to be tested:  12


### Dissecting the best flight duration model

In [51]:
# NOTE: this rebinds `cv` from the CrossValidator to its fitted model, so the
# cell that builds `cv` has to be re-run before this cell can be run again.
cv = cv.fit(flights_train)

# The best model is a fitted pipeline; print its stages to see what it contains.
best_model = cv.bestModel
print(best_model.stages)

# Score the best model on the held-out test split. As the last expression in
# the cell, this value is what gets displayed.
predictions = best_model.transform(flights_test)
evaluator.evaluate(predictions)

[StringIndexerModel: uid=StringIndexer_8da5c08a38e3, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_cb76bce9219f, dropLast=true, handleInvalid=error, numInputCols=1, numOutputCols=1, VectorAssembler_29d0c071636a, LinearRegressionModel: uid=LinearRegression_665672928f84, numFeatures=8]


11.012865738560498

In [52]:
# Parameter values of the fourth stage of the best pipeline (the fitted regression).
best_model.stages[3].extractParamMap()

{Param(parent='LinearRegression_665672928f84', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LinearRegression_665672928f84', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LinearRegression_665672928f84', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35,
 Param(parent='LinearRegression_665672928f84', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LinearRegression_665672928f84', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_665672928f84', name='labelCol', doc='label column name.'): 'duration',
 Param(parent='LinearRegression_665672928f84', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'): 'squaredEr

### SMS spam optimised

In [61]:
# Grid over the hashing stage (feature count, binary counts) and the logistic
# regression (regularisation strength, elastic-net mixing).
params = (
    ParamGridBuilder()
    .addGrid(hasher.numFeatures, [1024, 4096, 16384])
    .addGrid(hasher.binary, [True, False])
    .addGrid(logistic.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(logistic.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [62]:
# Rebuild the unfitted pipeline from the same stage objects the grid refers to.
pipeline = Pipeline(
    stages=[tokenizer, remover, hasher, idf, logistic],
)

# NOTE(review): RegressionEvaluator defaults to RMSE; on 0/1 labels that is
# sqrt(misclassification rate). A classification evaluator may have been
# intended for model selection -- confirm.
evaluator = RegressionEvaluator()

# Five-fold cross-validation over the grid; `cv` ends up holding the fitted model.
cv = CrossValidator(
    numFolds=5,
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
).fit(sms_train)

predictions = cv.transform(sms_test)

In [63]:
# Score the cross-validated SMS model on the held-out test split.
print(evaluator.evaluate(cv.transform(sms_test)))

0.1654456490020896


In [71]:
# Parameter values of the third stage of the best pipeline (the HashingTF stage).
cv.bestModel.stages[2].extractParamMap()

{Param(parent='HashingTF_15beaa28d9d2', name='binary', doc='If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.'): True,
 Param(parent='HashingTF_15beaa28d9d2', name='numFeatures', doc='Number of features. Should be greater than 0.'): 4096,
 Param(parent='HashingTF_15beaa28d9d2', name='outputCol', doc='output column name.'): 'hash',
 Param(parent='HashingTF_15beaa28d9d2', name='inputCol', doc='input column name.'): 'terms'}

## Ensemble

Models can be combined to form a collection or "ensemble" which is more powerful than each of the individual models alone. An ensemble combines the results from multiple models to produce better predictions than any of those models acting alone. A successful ensemble requires diverse models. Ideally each of the models in the ensemble should be different.

A Random Forest is a collection of trees. Each tree is trained on a random subset of the data, and within each tree a random subset of features is used for splitting at each node.

You can make predictions using each tree individually. In some cases all of the trees agree, but there is often some dissent amongst the models. The transform() method will automatically generate a consensus prediction column. You can see the importance of features by using the **featureImportances** attribute.

Gradient-Boosted Trees is another ensemble model. Rather than building a set of trees that operate in parallel, we now build trees which work in series. The boosting algorithm works iteratively: each new tree focuses on improving the incorrect predictions of the trees before it. As trees are added to the ensemble, its predictions improve.

AUC scores of **Random Forest** and **Gradient-Boosted Trees** should be better than those of a plain **Decision Tree**.

### Delayed flights with Gradient-Boosted Trees


In [82]:
# Rows with a null "delay" cannot be labelled, so drop them first.
flights = flights.dropna(subset=["delay"])

# Binary target: 1 when "delay" is at least 15, otherwise 0.
flights = flights.withColumn(
    "label",
    (flights["delay"] >= 15).cast("integer"),
)

# Three numeric predictors packed into one "features" vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["mon", "depart", "duration"],
)

flights_labeled = assembler.transform(flights)
flights_labeled.select("mon", "depart", "duration", "features", "label").show()

+---+------+--------+-----------------+-----+
|mon|depart|duration|         features|label|
+---+------+--------+-----------------+-----+
|  0| 16.33|      82| [0.0,16.33,82.0]|    1|
|  2|  6.17|      82|  [2.0,6.17,82.0]|    0|
|  9| 10.33|     195|[9.0,10.33,195.0]|    0|
|  5|  7.98|     102| [5.0,7.98,102.0]|    0|
|  7| 10.83|     135|[7.0,10.83,135.0]|    1|
|  1|   8.0|     232|  [1.0,8.0,232.0]|    0|
|  1|  7.98|     250| [1.0,7.98,250.0]|    0|
| 11|  7.77|      60| [11.0,7.77,60.0]|    1|
|  4| 13.25|     210|[4.0,13.25,210.0]|    0|
|  4| 13.75|     160|[4.0,13.75,160.0]|    1|
|  8| 13.28|     151|[8.0,13.28,151.0]|    1|
|  3|   9.0|     264|  [3.0,9.0,264.0]|    0|
|  0| 17.08|     190|[0.0,17.08,190.0]|    1|
|  5|  12.7|     158| [5.0,12.7,158.0]|    1|
|  3| 17.58|     265|[3.0,17.58,265.0]|    1|
| 11|  6.75|     160|[11.0,6.75,160.0]|    1|
|  8|  6.33|     160| [8.0,6.33,160.0]|    1|
|  2|  6.17|     166| [2.0,6.17,166.0]|    0|
|  7|  19.0|     110| [7.0,19.0,11

In [87]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier

flights_train, flights_test = flights_labeled.randomSplit([0.8, 0.2], seed=13)

# A single decision tree and a gradient-boosted ensemble, both with default
# settings and fitted on the same training split.
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare both models on the test split with the evaluator's default metric.
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(tree.transform(flights_test)))
print(evaluator.evaluate(gbt.transform(flights_test)))

# Inspect the ensemble: its individual trees, a blank line, then the feature importances.
print(gbt.trees, end="\n\n")
print(gbt.featureImportances)

0.6272166509112471
0.6801100665589925
[DecisionTreeRegressionModel: uid=dtr_b4b18500a002, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_564ab70ab8ff, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_d9929412c476, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_42808eda1715, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_302f97a7c249, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_4281b68c194f, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_24f87a113105, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_f64588ff4af8, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_c262e3d031c6, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_eead2207cd78, depth=5, numNodes=63, numFeatures=3, DecisionTreeRegressionModel: uid=dtr_c6a365fdae3d, depth=5, numNodes=63, numFeatu

### Delayed flights with a Random Forest

In [91]:
from pyspark.ml.classification import RandomForestClassifier

forest = RandomForestClassifier()

# 4 feature-subset strategies x 3 maximum depths = 12 candidate models.
params = (
    ParamGridBuilder()
    .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2'])
    .addGrid(forest.maxDepth, [2, 5, 10])
    .build()
)
evaluator = BinaryClassificationEvaluator()

# The grid's params belong to this `forest` instance, so the same instance
# must be the estimator that is cross-validated.
cv = CrossValidator(estimator=forest, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

### Evaluating Random Forest

In [93]:
# NOTE: this rebinds `cv` from the CrossValidator to its fitted model, so the
# cell that builds `cv` has to be re-run before this cell can be run again.
cv = cv.fit(flights_train)

# Cross-validated metric for every parameter combination in the grid.
print(cv.avgMetrics,"\n")

# Description of the two tuned parameters as reported by the best model.
print(cv.bestModel.explainParam("maxDepth"),"\n")
print(cv.bestModel.explainParam("featureSubsetStrategy"),"\n")

# Score on the held-out test split with the same evaluator.
print(evaluator.evaluate(cv.transform(flights_test)),"\n")

[0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031, 0.6638777431038031] 

maxDepth: Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. (default: 5) 

featureSubsetStrategy: The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto' (default: auto) 

0.6666110163587886 

