# Chapter 10. Machine Learning with MLlib

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Start (or reuse) a Spark session in local mode (master "local[4]").
spark = SparkSession.builder.master("local[4]").appName("MachineLearning").getOrCreate()

# Only surface ERROR-level log messages in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: display the session object.
spark

## Designing Machine Learning Pipelines

### Data Ingestion and Exploration

In [None]:
# Load the cleaned SF Airbnb listings from Parquet.
filePath = "../data/sf-airbnb/sf-airbnb-clean.parquet/"
airbnbDF = spark.read.parquet(filePath)

# Preview a handful of columns for the first five rows.
(airbnbDF
 .select("neighbourhood_cleansed", "room_type", "bedrooms", "bathrooms",
         "number_of_reviews", "price")
 .show(5))

### Creating Training and Test Data Sets

In [None]:
# Seeded 80/20 random split into training and test sets.
trainDF, testDF = airbnbDF.randomSplit(weights=[0.8, 0.2], seed=42)
print(f"""There are {trainDF.count()} rows in the training set, and {testDF.count()} in the test set""")

### Preparing Features with Transformers

In [None]:
from pyspark.ml.feature import VectorAssembler
# Pack the single predictor column (`bedrooms`) into a `features` vector column.
vecAssembler = VectorAssembler(
    inputCols=["bedrooms"],
    outputCol="features",
)
vecTrainDF = vecAssembler.transform(trainDF)

# Show the raw column, the assembled vector and the label side by side.
(vecTrainDF
 .select("bedrooms", "features", "price")
 .show(10))

### Using Estimators to Build Models

In [None]:
from pyspark.ml.regression import LinearRegression
# Linear regression of `price` on the assembled `features` vector,
# fitted on the vectorized training set.
lr = LinearRegression(
    labelCol="price",
    featuresCol="features",
)
lrModel = lr.fit(vecTrainDF)

In [None]:
# Inspect the learned parameters: the coefficient of the only feature
# (`bedrooms`) and the intercept, both rounded to two decimals for display.
m, b = round(lrModel.coefficients[0], 2), round(lrModel.intercept, 2)
print(f"""The formula for the linear regression line is 
price = {m}*bedrooms + {b}""")

### Creating a Pipeline

In [None]:
# Create a pipeline that chains the feature assembler and the linear regression,
# then fit both stages on the raw training DataFrame in one call
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

In [None]:
# Score the held-out test set with the fitted pipeline, then preview the
# input column, assembled vector, true label and prediction
predDF = pipelineModel.transform(testDF)
(predDF
 .select("bedrooms", "features", "price", "prediction")
 .show(10))

#### One-hot encoding

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# String-typed columns are treated as the categorical features.
categoricalCols = [field for (field, dataType) in trainDF.dtypes
                   if dataType == "string"]
# Derived column names: "<col>Index" for the indexed form, "<col>OHE" for the encoded form.
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

# Stage 1: map each category string to a numeric index
# (handleInvalid="skip" is passed so invalid rows are skipped rather than raising).
stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid="skip")
# Stage 2: one-hot encode the indexed columns.
oheEncoder = OneHotEncoder(inputCols=indexOutputCols,
                           outputCols=oheOutputCols)

# Double-typed columns are the numeric features; `price` is the label, so it is
# excluded. Use logical `and` (not bitwise `&`) to combine the two conditions.
numericCols = [field for (field, dataType) in trainDF.dtypes
               if dataType == "double" and field != "price"]
# Stage 3: concatenate encoded categoricals and numeric columns into `features`.
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs,
                               outputCol="features")

In [None]:
# Doing the same as in the previous cell, but using RFormula:
# the formula "price ~ ." names `price` as the label and `.` as the predictors.
from pyspark.ml.feature import RFormula

rFormula = RFormula(
    formula="price ~ .",
    labelCol="price",
    featuresCol="features",
    handleInvalid="skip",
)

In [None]:
# Full feature pipeline: index -> one-hot encode -> assemble -> linear regression
lr = LinearRegression(featuresCol="features", labelCol="price")
pipeline = Pipeline(stages=[stringIndexer, oheEncoder, vecAssembler, lr])
# Or use RFormula
# pipeline = Pipeline(stages = [rFormula, lr])

# Fit on the training split, then score the test split and preview the results
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
(predDF
 .select("features", "price", "prediction")
 .show(n=5, truncate=False))

### Evaluating Models

#### RMSE

In [None]:
# Root-mean-square error of `prediction` against the `price` label
from pyspark.ml.evaluation import RegressionEvaluator

regressionEvaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)
rmse = regressionEvaluator.evaluate(predDF)
print(f"RMSE is {rmse:.1f}")

#### R^2

In [None]:
# Switch the existing evaluator to R^2 and re-score the same predictions.
# Note: setMetricName changes `regressionEvaluator` itself, so it now reports r2.
regressionEvaluator.setMetricName("r2")
r2 = regressionEvaluator.evaluate(predDF)
print(f"R2 is {r2}")

### Saving and Loading Models

In [None]:
# Persist the fitted pipeline; `overwrite()` is requested so an existing
# model at the same path does not block the save.
pipelinePath = "../data_output/lr-pipeline-model"
(pipelineModel
 .write()
 .overwrite()
 .save(pipelinePath))

In [None]:
from pyspark.ml import PipelineModel
# Reload the saved pipeline model and score the test set with it
# to check that the round trip produces usable predictions.
savedPipelineModel = PipelineModel.load(pipelinePath)
(savedPipelineModel
 .transform(testDF)
 .select("features", "price", "prediction")
 .show(5))

## Hyperparameter Tuning

### Tree-Based Models

#### Decision trees

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree regressor predicting `price`, with default hyperparameters.
dt = DecisionTreeRegressor(labelCol="price")

# Filter for just numeric columns (and exclude price, our label).
# Logical `and` is the idiomatic way to combine the two boolean conditions.
numericCols = [field for (field, dataType) in trainDF.dtypes
               if dataType == "double" and field != "price"]

# Combine output of StringIndexer defined above and numeric columns
# (the indexed columns are used directly; no one-hot encoding in this pipeline).
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Combine stages into pipeline
stages = [stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)

# Fitting is expected to fail with the regressor's default `maxBins`. The
# failure is the point of this cell, so catch and display it instead of
# letting it abort a "Restart & Run All".
try:
    pipelineModel = pipeline.fit(trainDF)
except Exception as err:  # broad on purpose: the wrapper exception type varies by PySpark version
    print(f"Fitting failed as expected ({type(err).__name__}): {err}")

In [None]:
# Increase `maxBins` to handle the discretization of the categorical columns.
# `dt` is the same object that was placed in `pipeline`'s stages, so the
# refit below uses the new setting.
dt.setMaxBins(40)
pipelineModel = pipeline.fit(trainDF)

In [None]:
# Extract the if-then-else rules learned by the decision tree.
# The regressor was the last stage of the pipeline, so its fitted model is
# the last stage of the fitted pipeline.
dtModel = pipelineModel.stages[-1]
print(dtModel.toDebugString)

In [None]:
# Extract the feature importance scores from our model to see the most important features
import pandas as pd

# Pair each assembler input column with its importance score, one row per feature.
featureImp = pd.DataFrame(
    [(name, score)
     for name, score in zip(vecAssembler.getInputCols(), dtModel.featureImportances)],
    columns=["feature", "importance"],
)
# Most important features first (displayed as the cell output).
featureImp.sort_values(by="importance", ascending=False)

#### Random forests

In [None]:
from pyspark.ml.regression import RandomForestRegressor
# Random forest regressor for `price`; maxBins=40 matches the value used for
# the decision tree above, and the seed is fixed.
rf = RandomForestRegressor(
    labelCol="price",
    maxBins=40,
    seed=42,
)

### k-Fold Cross-Validation

In [None]:
# Pipeline estimator to cross-validate: index categoricals, assemble features,
# then fit the random forest
pipeline = Pipeline(
    stages=[stringIndexer, vecAssembler, rf],
)

In [None]:
# Set up our hyperparameter grid: 3 tree depths x 2 forest sizes
from pyspark.ml.tuning import ParamGridBuilder

paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.numTrees, [10, 100])
    .build()
)

In [None]:
# Define how to evaluate each of the models to determine which one performed best:
# RMSE of `prediction` against the `price` label
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="price",
)

In [None]:
# Perform k-fold cross-validation (3 folds) of the whole pipeline over the grid
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

In [None]:
%%timeit
cvModel = cv.fit(trainDF)

In [None]:
# Inspect the results of the cross-validator: each parameter map paired with
# the corresponding entry of `avgMetrics`
[(params, metric)
 for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)]

### Optimizing Pipelines

In [None]:
%%timeit
cvModel = cv.setParallelism(10).fit(trainDF)

In [None]:
# Alternative layout: cross-validate only the random forest and place the
# cross-validator as the last pipeline stage, after the feature stages.
# NOTE(review): presumably this avoids re-running the feature stages for
# every candidate model — confirm against the timings below.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=10,
    seed=42,
)

pipeline = Pipeline(
    stages=[stringIndexer, vecAssembler, cv],
)

In [None]:
%%timeit
pipelineModel = pipeline.fit(trainDF)