# Spark - Linear Regression

**Imports**

In [11]:
import findspark
findspark.init('/home/sedat/spark-3.3.2-bin-hadoop3')
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

**Create SparkSession and read data file**

In [3]:
spark = SparkSession.builder.appName('linear_regression').getOrCreate()

In [9]:
train = spark.read.format('libsvm').load('sample_linear_regression_data.txt')

23/03/19 13:25:56 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [6]:
train.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

**Linear Regression Model**

In [12]:
lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')

In [13]:
lrModel = lr.fit(train)

23/03/19 13:31:07 WARN Instrumentation: [293ebc17] regParam is zero, which might cause numerical instability and overfitting.
23/03/19 13:31:08 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


**Coefficients and intercept**

In [15]:
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [17]:
lrModel.intercept

0.14228558260358093

**Model summary and error metrics**

In [18]:
lr_summary = lrModel.summary

In [22]:
# root mean squared error
lr_summary.rootMeanSquaredError

10.16309157133015

In [23]:
# mean absolute error
lr_summary.meanAbsoluteError

8.145215527783876

In [25]:
# mean squared error
lr_summary.meanSquaredError

103.28843028724194

**Split data as train and test**

In [26]:
all_data = spark.read.format('libsvm').load('sample_linear_regression_data.txt')

23/03/19 13:40:44 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [27]:
train_data, test_data = all_data.randomSplit([0.7, 0.3])

**Train data**

In [31]:
train_data.count()

351

In [32]:
train_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
|-26.805483428483072|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
|-23.487440120936512|(10,[0,1,2,3,4,5,...|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|
|-22.837460416919342|(10,[0,1,2,3,4,5,...|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-19.884560774273424|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
|-19.402336030214553|(10,[0,1,2,3,4,5,...|
|-18.845922472898582|(10,[0,1,2,3,4,5,...|
|-17.494200356883344|(10,[0,1,2,3,4,5,...|
| -17.32672073267595|(10,[0,1,2,3,4,5,...|
|-17.065399625876015|(10,[0,1,2,3,4,5,...|
| -16.71909683360509|(10,[0,1,2,3,4,5,...|
|-16.692207021311106|(10,[0,1,2,3,4,5,...|
| -16.08565904102149|(10,[0,1,2,3,4,5,...|
| -15.86200932757056|(10,[0,1,2,3,4,5,...|
+----------

In [34]:
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                351|
|   mean| 0.1411043221640589|
| stddev|  9.980844476793042|
|    min|-28.571478869743427|
|    max| 24.290551295953957|
+-------+-------------------+



**Test data**

In [35]:
test_data.count()

150

In [36]:
test_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|
|-19.872991038068406|(10,[0,1,2,3,4,5,...|
| -19.66731861537172|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
| -18.27521356600463|(10,[0,1,2,3,4,5,...|
|-17.803626188664516|(10,[0,1,2,3,4,5,...|
|-17.428674570939506|(10,[0,1,2,3,4,5,...|
|-17.026492264209548|(10,[0,1,2,3,4,5,...|
| -16.26143027545273|(10,[0,1,2,3,4,5,...|
|-16.151349351277112|(10,[0,1,2,3,4,5,...|
|-15.951512565794573|(10,[0,1,2,3,4,5,...|
|-15.732088272239245|(10,[0,1,2,3,4,5,...|
| -15.72351561304857|(10,[0,1,2,3,4,5,...|
|-15.437384793431217|(10,[0,1,2,3,4,5,...|
|-15.056482974542433|(10,[0,1,2,3,4,5,...|
|-13.867087895158768|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -13.15333560636553|(10,[0,1,2,3,4,5,...|
|-12.977848725392104|(10,[0,1,2,3,4,5,...|
+----------

In [39]:
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               150|
|   mean|0.5278245522673731|
| stddev|11.096770562387478|
|    min|-23.51088409032297|
|    max| 27.78383192005107|
+-------+------------------+



**Correct Model (Train with train data and test with test data)**

In [40]:
correct_model = lr.fit(train_data)

23/03/19 13:48:32 WARN Instrumentation: [f70fd4ce] regParam is zero, which might cause numerical instability and overfitting.


In [41]:
test_results = correct_model.evaluate(test_data)

**Residuals and errors**

In [45]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-24.379209333686195|
|-22.008705040808263|
|-19.163066707860438|
| -19.81812380988793|
|-15.821506413690344|
|-20.284699092752643|
|  -17.6632884792067|
|-20.184449184477593|
|-16.545721071537848|
| -16.65241763275372|
|  -16.8386824122115|
|-16.455017916405748|
| -17.28502219930265|
|-14.911870966684404|
|-14.840454741550595|
|-14.091652992503452|
|-15.229721715285837|
| -16.00643167652115|
|-10.920333067582344|
|-13.600823182736644|
+-------------------+
only showing top 20 rows



In [49]:
# root mean squared error
test_results.rootMeanSquaredError

11.105861278146836

In [51]:
# mean squared error
test_results.meanSquaredError

123.34015472944128

In [53]:
# mean absolute error
test_results.meanAbsoluteError

9.049615640199885