# Linear Regression Example

___

### Start simple Spark Session

In [1]:
import org.apache.spark.sql.SparkSession

Intitializing Scala interpreter ...

Spark Web UI available at http://DESKTOP-0D68992:4040
SparkContext available as 'sc' (version = 2.4.0, master = local[*], app id = local-1577164891872)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession


In [2]:
import org.apache.spark.ml.regression.LinearRegression

import org.apache.spark.ml.regression.LinearRegression


In [3]:
val spark = SparkSession.builder().getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2f217afe


### Read the file which is in "libsvm" format

In [4]:
val df = spark.read.format("libsvm").load("sample_linear_regression_data.txt")

2019-12-24 10:51:55 WARN  LibSVMFileFormat:66 - 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


df: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [5]:
df.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [6]:
df.show(5,false)

+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label              |features                                                                                                                                                                                                                             |
+-------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|-9.490009878824548 |(10,[0,1,2,3,4,5,6,7,8,9],[0.4551273600657362,0.36644694351969087,-0.38256108933468047,-0.4458430198517267,0.33109790358914726,0.8067445293443565,-0.2624341731773887,-0.44850386111659524,-0.07269284838169332,0.5658035575800

### Create new LinearRegression Object

In [7]:
val lr = new LinearRegression().setMaxIter(100).setRegParam(0.3).setElasticNetParam(0.8)

lr: org.apache.spark.ml.regression.LinearRegression = linReg_22a5a00b84f4


### Fit the model

In [8]:
val lrModel = lr.fit(df)

2019-12-24 10:52:17 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-24 10:52:17 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


lrModel: org.apache.spark.ml.regression.LinearRegressionModel = linReg_22a5a00b84f4


### Print the coefficients and intercept for linear regression

In [9]:
println(s"Coeffeicents: ${lrModel.coefficients}   Intercept: {lrModel.intercept}")

Coeffeicents: [0.0,0.32292516677405936,-0.3438548034562218,1.9156017023458414,0.05288058680386263,0.765962720459771,0.0,-0.15105392669186682,-0.21587930360904642,0.22025369188813426]   Intercept: {lrModel.intercept}


### Summarize the model over the training set and print out some metrics

In [10]:
val trainingSummary = lrModel.summary

trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@2ae33eff


In [11]:
println(s"numIterations: ${trainingSummary.totalIterations}")

numIterations: 7


In [12]:
println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")

objectiveHistory: List(0.49999999999999994, 0.4967620357443381, 0.4936361664340463, 0.4936351537897608, 0.4936351214177871, 0.49363512062528014, 0.4936351206216114)


In [14]:
trainingSummary.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
| -9.889232683103197|
| 0.5533794340053554|
| -5.204019455758823|
|-20.566686715507508|
|   -9.4497405180564|
+-------------------+
only showing top 5 rows



In [17]:
println(s"MSE: ${trainingSummary.meanSquaredError}")

MSE: 103.81729352727655


In [18]:
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")

RMSE: 10.189077167598475


In [21]:
println(s"R2: ${trainingSummary.r2}")

R2: 0.022861466913958184


### Closing Spark Session

In [22]:
spark.stop()

## Thank You!