### Initialize and create a spark session

In [1]:
import org.apache.spark.sql.SparkSession
// Obtain the SparkSession for this notebook: getOrCreate() returns the session
// that is already running if there is one, and builds a new one otherwise.
val spark = SparkSession.builder().getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://Varun-CK:4040
SparkContext available as 'sc' (version = 2.3.0, master = local[*], app id = local-1577630440276)
SparkSession available as 'spark'


2019-12-29 20:10:50 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6951b939


### Initializing Logger

In [2]:
import org.apache.log4j._
// Limit logging from the "org" logger namespace (which includes
// org.apache.spark) to ERROR and above, to keep the cell output readable.
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML

In [3]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder,TrainValidationSplit}

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}


### Prepare training and test data

In [4]:
// Load the sample regression data set with the "libsvm" reader, which yields a
// `label` column and a `features` vector column.
// NOTE(review): the path is relative — presumably resolved against the
// kernel's working directory; confirm the file is present there.
val data = spark.read.format("libsvm").load("sample_linear_regression_data.txt")

data: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [5]:
// Randomly split the rows with weights 0.9 / 0.1 into a training set and a
// held-out test set; the fixed seed keeps the split repeatable between runs.
val Array(train_data,test_data) = data.randomSplit(Array(0.9,0.1), seed=12345)

train_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]


In [6]:
// Preview the first 3 rows of the full data set.
data.show(3)

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
|-9.490009878824548|(10,[0,1,2,3,4,5,...|
|0.2577820163584905|(10,[0,1,2,3,4,5,...|
|-4.438869807456516|(10,[0,1,2,3,4,5,...|
+------------------+--------------------+
only showing top 3 rows



In [7]:
// Preview the first 3 rows of the training split.
train_data.show(3)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
|-26.805483428483072|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 3 rows



In [8]:
// Preview the first 3 rows of the test split.
test_data.show(3)

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|
|-12.977848725392104|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 3 rows



In [9]:
// Number of rows in the full data set. count() is declared with an empty
// parameter list, so it is called with explicit parentheses.
data.count()

res4: Long = 501


In [10]:
// Number of rows in the training split. count() is declared with an empty
// parameter list, so it is called with explicit parentheses.
train_data.count()

res5: Long = 447


In [11]:
// Number of rows in the test split. count() is declared with an empty
// parameter list, so it is called with explicit parentheses.
test_data.count()

res6: Long = 54


In [12]:
// Print the column names and types of the loaded data. printSchema() is a
// side-effecting method, so it keeps its parentheses at the call site.
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [13]:
// Linear regression estimator with no parameters set explicitly; regParam,
// fitIntercept and elasticNetParam are supplied by the parameter grid search.
val lr = new LinearRegression()

lr: org.apache.spark.ml.regression.LinearRegression = linReg_d9a56d0faf61


**We use a `ParamGridBuilder` to construct a grid of parameters to search over.**
<br>**`TrainValidationSplit` will try all combinations of values and determine the best model using the evaluator.**

In [14]:
// Hyperparameter grid for the search: two regParam values, both settings of
// the boolean fitIntercept param, and three elasticNetParam values
// (2 x 2 x 3 = 12 combinations).
val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.1, 0.01))
  .addGrid(lr.fitIntercept)
  .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
  .build()

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linReg_d9a56d0faf61-elasticNetParam: 0.0,
	linReg_d9a56d0faf61-fitIntercept: true,
	linReg_d9a56d0faf61-regParam: 0.1
}, {
	linReg_d9a56d0faf61-elasticNetParam: 0.5,
	linReg_d9a56d0faf61-fitIntercept: true,
	linReg_d9a56d0faf61-regParam: 0.1
}, {
	linReg_d9a56d0faf61-elasticNetParam: 1.0,
	linReg_d9a56d0faf61-fitIntercept: true,
	linReg_d9a56d0faf61-regParam: 0.1
}, {
	linReg_d9a56d0faf61-elasticNetParam: 0.0,
	linReg_d9a56d0faf61-fitIntercept: true,
	linReg_d9a56d0faf61-regParam: 0.01
}, {
	linReg_d9a56d0faf61-elasticNetParam: 0.5,
	linReg_d9a56d0faf61-fitIntercept: true,
	linReg_d9a56d0faf61-regParam: 0.01
}, {
	linReg_d9a56d0faf61-elasticNetParam: 1.0,
	linReg_d9a56d0faf61-fitIntercept: true,
	lin...

**In this case the estimator is simply the linear regression.**

**A `TrainValidationSplit` requires an `Estimator`, a set of `Estimator ParamMaps`, and an `Evaluator`. Here, 80% of the data will be used for training and the remaining 20% for validation.**

In [15]:
// Tuning wrapper around the linear regression estimator: every ParamMap in
// paramGrid is tried, with 80% of the input rows used for fitting and the
// remaining 20% for validation, scored by a RegressionEvaluator.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lr)
  .setEvaluator(new RegressionEvaluator())
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.8)

trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_0c0e15ee7323


### Run train validation split, and choose the best set of parameters.

In [16]:
// Run the parameter search on the training split only; the returned model is
// the one fitted with the best-scoring parameter combination. test_data is
// kept out of this step and used only for the predictions afterwards.
val model = trainValidationSplit.fit(train_data)

2019-12-29 20:25:48 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-29 20:25:48 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
2019-12-29 20:25:48 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
2019-12-29 20:25:48 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


model: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_0c0e15ee7323


### Make predictions on test data. `model` is the model with the combination of parameters that performed best.

In [19]:
// Score the held-out test rows with the selected model and show the first 5
// rows, with the actual label next to the predicted value.
model
  .transform(test_data)
  .select("features", "label", "prediction")
  .show(5)

+--------------------+-------------------+--------------------+
|            features|              label|          prediction|
+--------------------+-------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -23.51088409032297| -1.6659388625179559|
|(10,[0,1,2,3,4,5,...|-21.432387764165806|  0.3400877302576284|
|(10,[0,1,2,3,4,5,...|-12.977848725392104|-0.02335359093652395|
|(10,[0,1,2,3,4,5,...|-11.827072996392571|  2.5642684021108417|
|(10,[0,1,2,3,4,5,...|-10.945919657782932| -0.1631314487734783|
+--------------------+-------------------+--------------------+
only showing top 5 rows



### Closing Spark Session

In [20]:
// Shut down the Spark session now that the notebook has finished.
spark.stop()

## Thank You!