In [1]:
from pyspark.ml.regression import LinearRegression as LR
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
# NOTE(review): the describe() output later in this notebook reports 333 rows;
# the memory sizes requested here look far larger than such a dataset needs --
# confirm they are intentional for the target cluster.
spark = (
    SparkSession.builder
    .appName("Boston Price Prediction")
    # JVM heap requested for executors and for the driver.
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    # Off-heap memory is switched on and given an explicit size.
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .getOrCreate()
)

In [2]:
# Load the training CSV: the first row is the header and column types are
# inferred from the data rather than declared up front.
house_df = (
    spark.read.format("csv")
    .options(header="true", inferschema="true")
    .load("boston/train.csv")
)

In [3]:
# Peek at the first three rows to sanity-check the load.
house_df.show(n=3)

+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
| ID|   crim|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|  1|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|  2|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|  4|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
+---+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
only showing top 3 rows



In [4]:
# Print the DataFrame schema (column names, inferred types, nullability)
# to verify that schema inference produced numeric columns.
house_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- black: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [5]:
# Summary statistics are computed in Spark, then pulled into pandas and
# transposed so that each source column becomes one row of the display.
summary_pdf = house_df.describe().toPandas()
summary_pdf.transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
ID,333,250.95195195195194,147.8594378018597,1,506
crim,333,3.3603414714714708,7.352271836781104,0.00632,73.5341
zn,333,10.68918918918919,22.674761796618217,0.0,100.0
indus,333,11.29348348348346,6.998123104477312,0.74,27.74
chas,333,0.06006006006006006,0.2379556428164483,0,1
nox,333,0.557144144144145,0.11495450830289312,0.385,0.871
rm,333,6.265618618618616,0.7039515757334471,3.561,8.725
age,333,68.22642642642641,28.13334360562338,6.0,100.0
dis,333,3.7099336336336335,1.9811230514407001,1.1296,10.7103


In [6]:
# Predictor columns packed into the single vector column consumed by the
# regressor. 'ID' is left out of the list and 'medv' is kept as the label.
feature_cols = [
    'crim', 'zn', 'indus', 'chas',
    'nox', 'rm', 'age', 'dis', 'rad', 'tax',
    'ptratio', 'black', 'lstat',
]
vectors = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Assemble the feature vector and keep only what the model needs.
vhouse_df = vectors.transform(house_df).select('features', 'medv')
vhouse_df.show(n=5)

+--------------------+----+
|            features|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.08829,12.5,7.8...|22.9|
+--------------------+----+
only showing top 5 rows



In [7]:
# 70/30 train/test split. A fixed seed is passed so that the split -- and
# therefore every coefficient and metric reported below -- is the same on a
# fresh "Restart & Run All"; without it each run samples different rows.
# NOTE(review): reproducibility also assumes the input file is read with the
# same partitioning/row order on every run -- confirm for the target cluster.
train_df, test_df = vhouse_df.randomSplit([0.7, 0.3], seed=42)

In [8]:
# Linear regression on the assembled 'features' column with 'medv' as label.
# regParam and elasticNetParam together enable elastic-net regularisation.
regressor = LR(
    featuresCol='features',
    labelCol='medv',
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit on the training partition only.
model = regressor.fit(train_df)

In [9]:
# Report the fitted parameters, one labelled line each.
for label, value in (
    ("Coefficients:", model.coefficients),
    ("Intercept:", model.intercept),
):
    print(label, value)

Coefficients: [-0.010279413081980417,0.034113414577108085,0.0,5.6415385374198,-7.783264348644399,3.085680504353533,0.0,-0.8290283633263736,0.016467345168122184,0.0,-0.5849152858717687,0.009195354138663316,-0.5627105522578837]
Intercept: 24.28872820161242


In [10]:
# Metrics taken from the fitted model's own summary object.
modelSummary = model.summary
summary_rmse = modelSummary.rootMeanSquaredError
summary_r2 = modelSummary.r2
print("RMSE is {} and r2 is {}".format(summary_rmse, summary_r2))
print("Number of Iterations is ", modelSummary.totalIterations)

RMSE is 4.735492439102997 and r2 is 0.7060968639352195
Number of Iterations is  21


In [11]:
# Score the held-out partition; transform() appends a 'prediction' column.
model_predictions = model.transform(test_df)

# Show features, actual value and predicted value side by side.
preview_cols = ["features", "medv", "prediction"]
model_predictions.select(*preview_cols).show(n=5)

+--------------------+----+------------------+
|            features|medv|        prediction|
+--------------------+----+------------------+
|[0.00906,90.0,2.9...|32.2| 30.33845691985376|
|[0.01311,90.0,1.2...|35.4| 29.92694704599407|
|[0.01501,80.0,2.0...|24.5| 27.53450962471756|
|[0.01951,17.5,1.3...|33.0|24.177663554797803|
|[0.01965,80.0,1.7...|20.1|20.945922220637236|
+--------------------+----+------------------+
only showing top 5 rows



In [13]:
# R2 on the held-out predictions via a RegressionEvaluator.
model_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="medv",
    metricName="r2",
)
test_r2 = model_evaluator.evaluate(model_predictions)
print("R2 value on test dataset is: ", test_r2)

# RMSE on the same held-out partition, taken from the summary returned by
# model.evaluate().
test_summary = model.evaluate(test_df)
print("RMSE value is", test_summary.rootMeanSquaredError)

R2 value on test dataset is:  0.6849428292603987
RMSE value is 5.557545888924286


In [14]:
# Shut down the Spark session now that the analysis is finished; cells that
# use `spark` cannot be re-run after this without recreating the session.
spark.stop()