In [1]:
from os.path import expanduser
import boto3

# Directory holding the chapter 3 material, rooted at the current user's home.
# The trailing slash is kept so later cells can append a bare file name.
SRC_PATH = "".join((expanduser("~"), '/SageMaker/mastering-ml-on-aws/chapter3/'))


In [2]:
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Local-mode Spark context with application name 'test'.
# getOrCreate() hands back the already-running context if there is one, so
# re-executing this cell in the same kernel does not fail with
# "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setMaster('local').setAppName('test')
sc = SparkContext.getOrCreate(spark_conf)


In [3]:
from pyspark.sql import SQLContext

# SQL entry point bound to the Spark context created in the previous cell.
sql = SQLContext(sc)

# Read the training CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
housing_df = (
    sql.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(SRC_PATH + 'train.csv')
)

# Keep only the three predictor columns and the 'medv' column used as the label.
reduced_housing_df = housing_df.select('crim', 'zn', 'indus', 'medv')


In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Columns used as predictors; 'medv' is the regression label.
training_features = ['crim', 'zn', 'indus']

# Stage 1: pack the predictor columns into a single vector column "features".
vector_assembler = VectorAssembler(
    inputCols=training_features,
    outputCol="features",
)

# Stage 2: linear regression that reads "features" and fits against "medv".
linear = LinearRegression(
    featuresCol="features",
    labelCol="medv",
)

# Chain both stages so the fitted model accepts the raw columns directly.
pipeline = Pipeline().setStages([vector_assembler, linear])

# Fit the whole pipeline on the reduced training frame.
model = pipeline.fit(reduced_housing_df)

In [5]:
# Shell out to delete /tmp/linear-model recursively; -f keeps the command quiet
# when the directory does not exist yet.
!rm -rf /tmp/linear-model

In [6]:
# Persist the fitted pipeline to the local filesystem.
# overwrite() replaces whatever already exists at the target, so this cell can
# be re-run on its own without first deleting the directory by hand.
model.write().overwrite().save("file:///tmp/linear-model")

In [7]:
# Recursively list the saved model directory to inspect what was written to disk.
!ls -R /tmp/linear-model 

/tmp/linear-model:
metadata  stages

/tmp/linear-model/metadata:
part-00000  _SUCCESS

/tmp/linear-model/stages:
0_VectorAssembler_45a8b08815826d4dcad9	1_LinearRegression_4e0aaf0c9b20a556cff4

/tmp/linear-model/stages/0_VectorAssembler_45a8b08815826d4dcad9:
metadata

/tmp/linear-model/stages/0_VectorAssembler_45a8b08815826d4dcad9/metadata:
part-00000  _SUCCESS

/tmp/linear-model/stages/1_LinearRegression_4e0aaf0c9b20a556cff4:
data  metadata

/tmp/linear-model/stages/1_LinearRegression_4e0aaf0c9b20a556cff4/data:
part-00000-ca3dd5f1-ad9a-41f2-865e-86e0e0a0d8fa-c000.snappy.parquet  _SUCCESS

/tmp/linear-model/stages/1_LinearRegression_4e0aaf0c9b20a556cff4/metadata:
part-00000  _SUCCESS


In [8]:
from pyspark.ml import PipelineModel

# Reload the persisted pipeline using the same explicit file:// URI it was
# saved with. A bare path is resolved against Spark's default filesystem, which
# is not guaranteed to be the local disk.
loaded_model = PipelineModel.load('file:///tmp/linear-model')

In [9]:
# Print three rows of the reduced training frame (predictors plus 'medv').
reduced_housing_df.limit(3).show()


+-------+----+-----+----+
|   crim|  zn|indus|medv|
+-------+----+-----+----+
|0.00632|18.0| 2.31|24.0|
|0.02731| 0.0| 7.07|21.6|
|0.03237| 0.0| 2.18|33.4|
+-------+----+-----+----+



In [10]:
# Score a three-row sample with the reloaded pipeline; the printed frame gains
# the assembled 'features' column and the model's 'prediction' column.
sample_rows = reduced_housing_df.limit(3)
loaded_model.transform(sample_rows).show()

+-------+----+-----+----+-------------------+------------------+
|   crim|  zn|indus|medv|           features|        prediction|
+-------+----+-----+----+-------------------+------------------+
|0.00632|18.0| 2.31|24.0|[0.00632,18.0,2.31]|27.714445239256854|
|0.02731| 0.0| 7.07|21.6| [0.02731,0.0,7.07]|24.859566163416336|
|0.03237| 0.0| 2.18|33.4| [0.03237,0.0,2.18]| 26.74953947801712|
+-------+----+-----+----+-------------------+------------------+



In [11]:
# Display the schema Spark inferred for the selected columns when reading the CSV.
reduced_housing_df.schema

StructType(List(StructField(crim,DoubleType,true),StructField(zn,DoubleType,true),StructField(indus,DoubleType,true),StructField(medv,DoubleType,true)))

In [12]:
# Import only the type classes used here instead of a wildcard import, so it is
# clear where each name comes from and the notebook namespace stays clean.
from pyspark.sql.types import DoubleType, StructField, StructType

# Schema for prediction inputs: the three predictor columns as nullable
# doubles, with no 'medv' label column.
schema = StructType([StructField('crim', DoubleType(), True),
                     StructField('zn', DoubleType(), True),
                     StructField('indus', DoubleType(), True)])

In [13]:
from pyspark.sql import Row

# One-row frame holding a single observation to score, typed by the explicit
# prediction schema (predictor columns only).
predict_df = sql.createDataFrame(
    data=[Row(crim=0.00632, zn=18.0, indus=2.31)],
    schema=schema,
)

In [16]:
# Print the single-row prediction input to confirm its columns and values.
predict_df.show()

+-------+----+-----+
|   crim|  zn|indus|
+-------+----+-----+
|0.00632|18.0| 2.31|
+-------+----+-----+



In [14]:
# Run the reloaded pipeline on the label-free input row and print the result,
# including the assembled 'features' vector and the 'prediction' column.
loaded_model.transform(predict_df).show()

+-------+----+-----+-------------------+------------------+
|   crim|  zn|indus|           features|        prediction|
+-------+----+-----+-------------------+------------------+
|0.00632|18.0| 2.31|[0.00632,18.0,2.31]|27.714445239256854|
+-------+----+-----+-------------------+------------------+



In [15]:
# Bring the scored rows back to the driver and display the first row's
# predicted value as a plain Python float.
scored_rows = loaded_model.transform(predict_df).collect()
scored_rows[0].prediction

27.714445239256854