In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
spark = SparkSession.builder.appName("Spark MLIB- Linear Regression").getOrCreate()
sparkContext = spark.sparkContext
sqlContext = SQLContext(sparkContext)

In [2]:
house_df = sqlContext.read.format('csv').options(header='true', inferschema='true').load('hdfs://nameservice1/user/edureka_672184/m8_datasets/boston.csv')
house_df.take(1)

[Row(CRIM=0.00632, ZN=18.0, INDUS=2.309999943, CHAS=0, NOX=0.537999988, RM=6.574999809, AGE=65.19999695, DIS=4.090000153, RAD=1, TAX=296, PT=15.30000019, B=396.8999939, LSTAT=4.980000019, MV=24.0)]

In [3]:
house_df.cache()
house_df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PT: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MV: double (nullable = true)



In [4]:
house_df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
CRIM,506,3.6135235608162057,8.601545086715594,0.00632,88.97619629
ZN,506,11.363636363636363,23.32245299451514,0.0,100.0
INDUS,506,11.136778749531626,6.86035298095724,0.460000008,27.73999977
CHAS,506,0.0691699604743083,0.2539940413404101,0,1
NOX,506,0.5546950602312246,0.1158776754570543,0.38499999,0.870999992
RM,506,6.28463438896641,0.7026171549511354,3.561000109,8.779999733
AGE,506,68.57490120115612,28.148861532793276,2.900000095,100.0
DIS,506,3.7950426960059325,2.105710142043288,1.129600048,12.12650013
RAD,506,9.549407114624506,8.707259384239366,1,24


In [5]:
import pandas as pd
house_df.dtypes

[('CRIM', 'double'),
 ('ZN', 'double'),
 ('INDUS', 'double'),
 ('CHAS', 'int'),
 ('NOX', 'double'),
 ('RM', 'double'),
 ('AGE', 'double'),
 ('DIS', 'double'),
 ('RAD', 'int'),
 ('TAX', 'int'),
 ('PT', 'double'),
 ('B', 'double'),
 ('LSTAT', 'double'),
 ('MV', 'double')]

In [6]:
### [('CRIM', 'double'),
## ('ZN', 'double'),
## ('INDUS', 'double'),...]
numeric_features = [t[0] for t in house_df.dtypes if t[1] == 'int' or t[1] == 'double']
sampled_data = house_df.select(numeric_features).sample(False, 0.8).toPandas()
axs = pd.plotting.scatter_matrix(sampled_data, figsize=(10, 10))
n = len(sampled_data.columns)
for i in range(n):
    v = axs[i, 0]
    v.yaxis.label.set_rotation(0)
    v.yaxis.label.set_ha('right')
    v.set_yticks(())
    h = axs[n-1, i]
    h.xaxis.label.set_rotation(90)
    h.set_xticks(())

In [7]:
import six
for i in house_df.columns:
    if not( isinstance(house_df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to MV for ", i, house_df.stat.corr('MV',i))

('Correlation to MV for ', 'CRIM', -0.3883046116575088)
('Correlation to MV for ', 'ZN', 0.36044534463752903)
('Correlation to MV for ', 'INDUS', -0.48372517128143383)
('Correlation to MV for ', 'CHAS', 0.17526017775291847)
('Correlation to MV for ', 'NOX', -0.4273207763683772)
('Correlation to MV for ', 'RM', 0.695359937127267)
('Correlation to MV for ', 'AGE', -0.37695456714288667)
('Correlation to MV for ', 'DIS', 0.24992873873512172)
('Correlation to MV for ', 'RAD', -0.3816262315669168)
('Correlation to MV for ', 'TAX', -0.46853593528654536)
('Correlation to MV for ', 'PT', -0.5077867038116085)
('Correlation to MV for ', 'B', 0.3334608226834164)
('Correlation to MV for ', 'LSTAT', -0.7376627294671615)
('Correlation to MV for ', 'MV', 1.0)


In [8]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PT', 'B', 'LSTAT'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(house_df)
vhouse_df = vhouse_df.select(['features', 'MV'])
vhouse_df.show(3)

+--------------------+-----------+
|            features|         MV|
+--------------------+-----------+
|[0.00632,18.0,2.3...|       24.0|
|[0.027310001,0.0,...|21.60000038|
|[0.02729,0.0,7.07...|34.70000076|
+--------------------+-----------+
only showing top 3 rows



In [9]:
splits = vhouse_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [10]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='MV', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0,0.01084537748053102,-0.01763148650436992,2.7317158690087413,-4.200277462581627,3.7444371877737375,0.0,-0.6352353243088716,0.0,-6.043002652792997e-05,-0.8879217584739071,0.009336940898906165,-0.5700329987657504]
Intercept: 24.1517315389


In [11]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 5.157810
r2: 0.706745


In [15]:
train_df.describe().show()

+-------+-----------------+
|summary|               MV|
+-------+-----------------+
|  count|              363|
|   mean|22.57493118402479|
| stddev|9.537648860974771|
|    min|      5.599999905|
|    max|             50.0|
+-------+-----------------+



In [16]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","MV","features").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="MV",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+-----------+--------------------+
|        prediction|         MV|            features|
+------------------+-----------+--------------------+
|30.902346665788812|32.20000076|[0.00906,90.0,2.9...|
|27.986420737632198|       22.0|[0.01096,55.0,2.2...|
| 31.71432338607955|31.60000038|[0.01432,100.0,1....|
|  27.8248357482449|       24.5|[0.01501,80.0,2.0...|
|41.886642369061576|       50.0|[0.01501,90.0,1.2...|
+------------------+-----------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.734553


In [17]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 4.26167


In [18]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()

numIterations: 11
objectiveHistory: [0.5000000000000004, 0.43261186467009544, 0.23999247735467868, 0.2173075865456097, 0.18815020383169992, 0.18399187495468977, 0.18229286932403363, 0.18174725745143952, 0.18152726901060562, 0.18127258311163563, 0.18089082293741754]
+-------------------+
|          residuals|
+-------------------+
| -7.332008677003742|
| 0.6330223115127751|
|   5.31991036381627|
| 1.8241379374316438|
| 11.675464393076226|
|-1.8967235241105058|
|    9.0053610798915|
| 3.3219600273286005|
| 1.6491622510043626|
|-3.1990019249339383|
| -1.739661902261311|
|  5.674831218091775|
|  -10.5171181243686|
| 3.3433839325287877|
| -4.020999382533585|
| 1.4537677378356086|
| -2.153397870794734|
|   0.67060561730937|
| -1.620904624341776|
| -3.030380618582072|
+-------------------+
only showing top 20 rows



In [20]:
predictions = lr_model.transform(test_df)
predictions.select("prediction","MV","features").show()

+------------------+-----------+--------------------+
|        prediction|         MV|            features|
+------------------+-----------+--------------------+
|30.902346665788812|32.20000076|[0.00906,90.0,2.9...|
|27.986420737632198|       22.0|[0.01096,55.0,2.2...|
| 31.71432338607955|31.60000038|[0.01432,100.0,1....|
|  27.8248357482449|       24.5|[0.01501,80.0,2.0...|
|41.886642369061576|       50.0|[0.01501,90.0,1.2...|
|25.865723635719483|       33.0|[0.019509999,17.5...|
|20.569518270574232|20.10000038|[0.019649999,80.0...|
| 39.15759107289478|       50.0|[0.020090001,95.0...|
|31.515759104815615|31.10000038|[0.02187,60.0,2.9...|
|28.043377930137126|23.89999962|[0.025429999,55.0...|
|26.507824780885468|28.70000076|[0.029850001,0.0,...|
|29.625675270711486|31.20000076|[0.03049,55.0,3.7...|
|22.425945225972203|20.60000038|[0.033059999,0.0,...|
|20.720155196313154|       19.5|[0.03427,0.0,5.19...|
| 24.84416586933454|19.39999962|[0.03466,35.0,6.0...|
|23.289942001320455|20.89999