In [1]:
from os.path import expanduser

SRC_PATH = expanduser("~") + '/SageMaker/mastering-ml-on-aws/chapter3/'


In [2]:
from pyspark.context import SparkContext

sc = SparkContext('local', 'test')


In [3]:
from pyspark.sql import SQLContext

sql = SQLContext(sc)


In [59]:
housing_df = sql.read.csv(SRC_PATH + 'train.csv', header=True, inferSchema=True)


In [60]:
housing_df.show(4)


+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
| ID|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|  2|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|  4|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|  5|0.06905| 0.0| 2.18|   0|0.458|7.147|54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
only showing top 4 rows



In [61]:
from pyspark.ml.feature import VectorAssembler

training_features = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'lstat']
vector_assembler = VectorAssembler(inputCols=training_features, outputCol="features")


In [63]:
df_with_features_vector = vector_assembler.transform(housing_df)
df_with_features_vector.show(3)


+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+
| ID|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|            features|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|  2|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|  4|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+
only showing top 3 rows



In [64]:
train_df, test_df = df_with_features_vector.randomSplit([0.8, 0.2], seed=17)


In [65]:
from pyspark.ml.regression import LinearRegression

linear = LinearRegression(featuresCol="features", labelCol="medv")
linear_model = linear.fit(train_df)


In [66]:
predictions_df = linear_model.transform(test_df)
predictions_df.show(3)


+---+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+------------------+
| ID|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio| black|lstat|medv|            features|        prediction|
+---+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+------------------+
|  7|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|[0.08829,12.5,7.8...|21.273530243958177|
| 24|0.98843| 0.0| 8.14|   0|0.538|5.813|100.0|4.0952|  4|307|   21.0|394.54|19.88|14.5|[0.98843,0.0,8.14...|13.894245541490553|
| 44|0.15936| 0.0| 6.91|   0|0.448|6.211|  6.5|5.7209|  3|233|   17.9|394.46| 7.44|24.7|[0.15936,0.0,6.91...|25.209683694484067|
+---+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+------------------+
only showing top 3 rows



In [67]:
linear_model.summary.r2


0.7086992014223543

In [68]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="r2")
evaluator.evaluate(predictions_df)


0.6880637937295275

In [130]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

linear = LinearRegression(featuresCol="features", labelCol="medv")
pipeline = Pipeline(stages=[vector_assembler, linear])
param_grid = ParamGridBuilder() \
    .addGrid(linear.elasticNetParam, [0.01, 0.02, 0.05]) \
    .addGrid(linear.solver, ['normal', 'l-bfgs']) \
    .addGrid(linear.regParam, [0.4, 0.5, 0.6]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=param_grid,
                          evaluator=evaluator,
                          numFolds=10)

optimized_model = crossval.fit(housing_df)


In [128]:
[(k.name, v) for (k, v) in optimized_model.bestModel.stages[1].extractParamMap().items()]


[('epsilon', 1.35),
 ('featuresCol', 'features'),
 ('predictionCol', 'prediction'),
 ('loss', 'squaredError'),
 ('elasticNetParam', 0.02),
 ('regParam', 0.6),
 ('maxIter', 100),
 ('labelCol', 'medv'),
 ('tol', 1e-06),
 ('standardization', True),
 ('aggregationDepth', 2),
 ('fitIntercept', True),
 ('solver', 'l-bfgs')]

In [115]:
optimized_model.avgMetrics


[0.60228046689935,
 0.6022857524897973,
 0.6023834177393386,
 0.6023887410814059,
 0.6026432049338516,
 0.6026481973755449,
 0.6030001230672197,
 0.6030052621812348,
 0.6031040135770328,
 0.6031091439263058,
 0.6032064249778748,
 0.6032086198971643,
 0.60355796933488,
 0.6035626800336215,
 0.6036587578463659,
 0.603663431370495,
 0.6034106428627964,
 0.6034118340373834]

In [117]:
_, evaluation_df = housing_df.randomSplit([0.8, 0.2], seed=17)
evaluator.evaluate(optimized_model.transform(evaluation_df))


0.7202817971205354