In [17]:
from pyspark.sql import SparkSession
import pandas as pd

In [64]:
#SparkSession: it initiates a Spark Application which all the code for that Session will run on
spark = SparkSession.builder.master("local[*]").appName('boston_housing').getOrCreate() 
df = spark.read.csv('C:\\Users\\Sampayo\\Documents\\Data Science\\Aprendizaje\\Spark\\boston_data.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: double (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: double (nullable = true)
 |-- tax: double (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- black: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [62]:
df.toPandas().head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.15876,0.0,10.81,0.0,0.413,5.961,17.5,5.2873,4.0,305.0,19.2,376.94,9.88,21.7
1,0.10328,25.0,5.13,0.0,0.453,5.927,47.2,6.932,8.0,284.0,19.7,396.9,9.22,19.6
2,0.3494,0.0,9.9,0.0,0.544,5.972,76.7,3.1025,4.0,304.0,18.4,396.24,9.97,20.3
3,2.73397,0.0,19.58,0.0,0.871,5.597,94.9,1.5257,5.0,403.0,14.7,351.85,21.45,15.4
4,0.04337,21.0,5.64,0.0,0.439,6.115,63.0,6.8147,4.0,243.0,16.8,393.97,9.43,20.5


#### EDA

In [12]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
crim,404,3.7309119306930723,8.943922212251913,0.00632,88.9762
zn,404,10.509900990099009,22.053733184762923,0.0,95.0
indus,404,11.189900990099002,6.8149093223650885,0.46,27.74
chas,404,0.06930693069306931,0.25429026389960196,0.0,1.0
nox,404,0.5567103960396043,0.11732064984156548,0.392,0.871
rm,404,6.301450495049499,0.6758302935149543,3.561,8.78
age,404,68.60173267326732,28.066142579151702,2.9,100.0
dis,404,3.7996663366336647,2.1099159643057357,1.1691,12.1265
rad,404,9.836633663366337,8.834741064787444,1.0,24.0


**Scatter matrix**

In [20]:
from pandas.plotting import scatter_matrix

numeric_features = [t[0] for t in df.dtypes if t[1] == 'int' or t[1] == 'double']
sampled_data = df.select(numeric_features).sample(False, 0.8).toPandas()
axs = scatter_matrix(sampled_data, figsize=(10, 10))
n = len(sampled_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

**Correlation betweem independent variables**

In [34]:
correlations = []
import six
for i in df.columns:
    if not( isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to medv for ", i, df.stat.corr('medv',i))
        correlations.append(i)

Correlation to medv for  crim -0.4009558757372438
Correlation to medv for  zn 0.355607582415516
Correlation to medv for  indus -0.5016982293419979
Correlation to medv for  chas 0.14140044808241922
Correlation to medv for  nox -0.4392251926056786
Correlation to medv for  rm 0.6835409939262136
Correlation to medv for  age -0.39086335148339485
Correlation to medv for  dis 0.26487595153417776
Correlation to medv for  rad -0.4235083975722877
Correlation to medv for  tax -0.49579240671703434
Correlation to medv for  ptratio -0.5063125552383506
Correlation to medv for  black 0.36007109188975617
Correlation to medv for  lstat -0.7426954940642168
Correlation to medv for  medv 1.0


### Data for ML

In [45]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols = df.drop('medv').columns, outputCol = 'features')
vhouse_df = vectorAssembler.transform(df)
vhouse_df = vhouse_df.select(['features', 'medv'])
vhouse_df.show(3)

+--------------------+----+
|            features|medv|
+--------------------+----+
|[0.15876,0.0,10.8...|21.7|
|[0.10328,25.0,5.1...|19.6|
|[0.3494,0.0,9.9,0...|20.3|
+--------------------+----+
only showing top 3 rows



In [46]:
splits = vhouse_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

## Linear Regression

In [47]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='medv', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)

print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [-0.07298383843559936,0.004958887136624527,0.0,1.2545111061871645,-6.76297293683956,3.9602978726384244,0.0,-0.6972767455989888,0.0,-0.0003187825383137368,-0.9105212377598977,0.006237011410484243,-0.5796096511010128]
Intercept: 25.99962730510135


In [48]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 4.669193
r2: 0.734383


In [49]:
train_df.describe().show()

+-------+------------------+
|summary|              medv|
+-------+------------------+
|  count|               289|
|   mean|22.562283737024227|
| stddev| 9.075416827880105|
|    min|               5.0|
|    max|              50.0|
+-------+------------------+



In [51]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","medv","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator

lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="medv",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+----+--------------------+
|        prediction|medv|            features|
+------------------+----+--------------------+
|31.200722396050622|24.0|[0.00632,18.0,2.3...|
|25.756185891609718|33.0|[0.01951,17.5,1.3...|
| 25.79842127714608|24.7|[0.02055,85.0,0.7...|
|31.426263267889937|31.1|[0.02187,60.0,2.9...|
|29.316425926071894|31.2|[0.03049,55.0,3.7...|
+------------------+----+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.7261


In [52]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 4.27895


In [53]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

numIterations: 11
objectiveHistory: [0.5000000000000004, 0.43334024729089343, 0.22994298647889588, 0.20602868614269995, 0.17630536368576855, 0.1735838776652062, 0.1725997076998319, 0.17107341541890883, 0.17080250854705453, 0.17071536354524092, 0.17067042664345536]
+-------------------+
|          residuals|
+-------------------+
| 0.6715148415437469|
| 2.5521504993014084|
| 11.665215459364852|
|-1.8233660969671845|
|-2.8101640588427017|
|  9.549554504963872|
|  3.987616818563062|
|  2.025257771275985|
|-2.6233374353960386|
| 10.785927448910023|
|  5.765740479056099|
| -10.33191621365323|
|-3.7892749490785036|
| 3.0530669473396372|
|-4.0848433376023365|
| 1.7521587997670913|
| -2.254602668549264|
| 0.9819308183697046|
|  2.221460848405556|
|-1.3230441047817152|
+-------------------+
only showing top 20 rows



In [55]:
predictions = lr_model.transform(test_df)
predictions.select("prediction","medv","features").show()

+------------------+----+--------------------+
|        prediction|medv|            features|
+------------------+----+--------------------+
|31.200722396050622|24.0|[0.00632,18.0,2.3...|
|25.756185891609718|33.0|[0.01951,17.5,1.3...|
| 25.79842127714608|24.7|[0.02055,85.0,0.7...|
|31.426263267889937|31.1|[0.02187,60.0,2.9...|
|29.316425926071894|31.2|[0.03049,55.0,3.7...|
|22.228608330022457|20.6|[0.03306,0.0,5.19...|
|32.102598217379985|34.9|[0.03359,75.0,2.9...|
|24.591432063843904|22.9|[0.03551,25.0,4.8...|
|27.307749647738873|22.0|[0.03932,0.0,3.41...|
|35.956680093570796|33.3|[0.04011,80.0,1.5...|
|16.229825503145317|18.2|[0.04301,80.0,1.9...|
|26.580031305265933|23.9|[0.04462,25.0,4.8...|
| 32.35208239689402|30.3|[0.04666,80.0,1.5...|
|  24.3484578114204|23.4|[0.04981,21.0,5.6...|
| 21.22576746936481|19.0|[0.05497,0.0,5.19...|
|23.067508979817312|18.9|[0.06417,0.0,5.96...|
|23.568528025338978|19.3|[0.06617,0.0,3.24...|
|31.178005836271048|30.5|[0.06911,45.0,3.4...|
|25.291344518

## Decision Tree Regression

In [56]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'medv')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="medv", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 4.6038


**Feature importance**

In [57]:
dt_model.featureImportances

SparseVector(13, {0: 0.0347, 4: 0.0061, 5: 0.2305, 6: 0.0552, 7: 0.0791, 8: 0.0039, 9: 0.0315, 10: 0.0044, 11: 0.016, 12: 0.5386})

In [58]:
df.take(1)

[Row(crim=0.15876, zn=0.0, indus=10.81, chas=0.0, nox=0.413, rm=5.961, age=17.5, dis=5.2873, rad=4.0, tax=305.0, ptratio=19.2, black=376.94, lstat=9.88, medv=21.7)]

rm=5.961 :  number of rooms is the most important feature to predict the house median price in our data.

### Gradient-boosted tree regression

In [59]:
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(featuresCol = 'features', labelCol = 'medv', maxIter=10)
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'medv', 'features').show(5)

+------------------+----+--------------------+
|        prediction|medv|            features|
+------------------+----+--------------------+
|23.325449428247815|24.0|[0.00632,18.0,2.3...|
| 36.54787760193538|33.0|[0.01951,17.5,1.3...|
|23.168102752962593|24.7|[0.02055,85.0,0.7...|
| 23.82074631959407|31.1|[0.02187,60.0,2.9...|
|29.917032416902007|31.2|[0.03049,55.0,3.7...|
+------------------+----+--------------------+
only showing top 5 rows



In [60]:
gbt_evaluator = RegressionEvaluator(
    labelCol="medv", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 4.44979
