### Initialize and create a Spark session

In [1]:
import org.apache.spark.sql.SparkSession
// Obtain the session through the builder; getOrCreate() hands back the session
// that is already running (the kernel log below reports one) rather than failing.
val spark: SparkSession = (SparkSession
  .builder()
  .getOrCreate())

Intitializing Scala interpreter ...

Spark Web UI available at http://Varun-CK:4040
SparkContext available as 'sc' (version = 2.3.0, master = local[*], app id = local-1577635298671)
SparkSession available as 'spark'


2019-12-29 21:31:52 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6951b939


### Initializing Logger

In [2]:
import org.apache.log4j._
// Raise the threshold of the "org" logger to ERROR so that lower-severity
// messages from loggers under that namespace no longer clutter the cell output.
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML

In [3]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder,TrainValidationSplit}

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}


### Preparing training and test data

In [4]:
// Load the housing CSV: the first line supplies the column names, and Spark
// infers each column's type from the data.
val data = (spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("USA_Housing.csv"))

data: org.apache.spark.sql.DataFrame = [Avg Area Income: double, Avg Area House Age: double ... 4 more fields]


In [5]:
// Print the column names, inferred types and nullability as a tree.
data.printSchema()

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)



### See an example of what the data looks like

In [6]:
// Column headers of the loaded DataFrame, in schema order.
val colnames: Array[String] = data.columns
// head(1) returns an array of rows; its element 0 serves as a sample record.
val firstRow: org.apache.spark.sql.Row = data.head(1)(0)

colnames: Array[String] = Array(Avg Area Income, Avg Area House Age, Avg Area Number of Rooms, Avg Area Number of Bedrooms, Area Population, Price)
firstRow: org.apache.spark.sql.Row = [79545.45857431678,5.682861321615587,7.009188142792237,4.09,23086.800502686456,1059033.5578701235]


In [7]:
// Walk the header names together with their positions and print each name next
// to the value stored at the same position in the sample row.
colnames.zipWithIndex.foreach { case (name, idx) =>
    println(s"Column Name: $name")
    println(s"Column Data: ${firstRow(idx)}")
    println()
}

Column Name: Avg Area Income
Column Data: 79545.45857431678

Column Name: Avg Area House Age
Column Data: 5.682861321615587

Column Name: Avg Area Number of Rooms
Column Data: 7.009188142792237

Column Name: Avg Area Number of Bedrooms
Column Data: 4.09

Column Name: Area Population
Column Data: 23086.800502686456

Column Name: Price
Column Data: 1059033.5578701235



## Setting Up DataFrame for Machine Learning

### imports

In [8]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.linalg.Vectors


### Rename label column

In [9]:
// Build the modelling frame: "Price" is exposed under the name "label",
// followed by the five predictor columns in their original order.
val df = {
  val predictorNames = Seq(
    "Avg Area Income",
    "Avg Area House Age",
    "Avg Area Number of Rooms",
    "Avg Area Number of Bedrooms",
    "Area Population"
  )
  val selected = data("Price").as("label") +: predictorNames.map(name => data(name))
  data.select(selected: _*)
}

df: org.apache.spark.sql.DataFrame = [label: double, Avg Area Income: double ... 4 more fields]


<code>
A VectorAssembler combines the values of the input columns into a single vector column.
That vector column is what the ML algorithm reads to train a model.
</code>

### Set the input columns to read the values from, and set the name of the output column where the assembled vector will be stored

In [10]:
import org.apache.spark.ml.feature.VectorAssembler

import org.apache.spark.ml.feature.VectorAssembler


In [11]:
// List the column names of df to confirm the rename to "label" and the column order.
df.columns

res3: Array[String] = Array(label, Avg Area Income, Avg Area House Age, Avg Area Number of Rooms, Avg Area Number of Bedrooms, Area Population)


In [12]:
// Configure the assembler: the five predictor columns go in, and the combined
// vector is written to a new column named "features".
val assembler = {
  val inputColumns = Array(
    "Avg Area Income",
    "Avg Area House Age",
    "Avg Area Number of Rooms",
    "Avg Area Number of Bedrooms",
    "Area Population"
  )
  new VectorAssembler()
    .setInputCols(inputColumns)
    .setOutputCol("features")
}

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_ba4d25ed0c99


### Transform the DataFrame

In [13]:
// Run the assembler over df, then keep only the two columns used from here on:
// the target ("label") and the assembled vector ("features").
val output = {
  val assembled = assembler.transform(df)
  assembled.select("label", "features")
}

output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


### Create an array of the training and test data

In [14]:
// Randomly divide the rows into a training set and a test set using weights of
// 0.7 and 0.3; the seed is fixed so the same seed value is used on every run.
val Array(training, testing) = output.randomSplit(Array(0.7, 0.3), seed = 12345L)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
testing: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]


In [15]:
// Summary statistics (count, mean, stddev, min, max) for the full data set.
output.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              5000|
|   mean|  1232072.65414236|
| stddev|353117.62658106035|
|    min|15938.657923287848|
|    max|2469065.5941747027|
+-------+------------------+



In [16]:
// The same summary statistics for the training portion, for comparison with the full set.
training.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              3481|
|   mean|1237811.1892187502|
| stddev| 354963.1535248248|
|    min|15938.657923287848|
|    max|2370231.3201015536|
+-------+------------------+



In [17]:
// The same summary statistics for the test portion, for comparison with the full set.
testing.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              1519|
|   mean|1218922.0020021829|
| stddev|348611.31160445284|
|    min| 151527.0826265551|
|    max|2469065.5941747027|
+-------+------------------+



## LINEAR REGRESSION

In [18]:
// The estimator to be tuned; no parameters are set here, the grid below varies regParam.
val lr: LinearRegression = new LinearRegression()

lr: org.apache.spark.ml.regression.LinearRegression = linReg_d41f9260b8b4


## PARAMETER GRID BUILDER

In [19]:
// Candidate values for the regression's regParam: one large and one small.
// Each value becomes one ParamMap in the resulting grid.
val paramGrid = {
  val regParamCandidates = Array(1000.0, 0.001)
  new ParamGridBuilder()
    .addGrid(lr.regParam, regParamCandidates)
    .build()
}

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linReg_d41f9260b8b4-regParam: 1000.0
}, {
	linReg_d41f9260b8b4-regParam: 0.001
})


## TRAIN VALIDATION SPLIT

In this case the estimator is simply the `linear regression`.
<br>A TrainValidationSplit requires an `Estimator`, a set of `Estimator ParamMaps`, and an `Evaluator`.
<br>`80%` of the training data will be used for fitting each candidate model and the remaining `20%` for validating it.

In [20]:
// Wire the tuning procedure together: the estimator to fit, the metric used to
// score each candidate (r2), the parameter grid to search, and the share of the
// input (0.8) used for fitting, with the remainder used for validation.
val trainValidationSplit = {
  val r2Evaluator = new RegressionEvaluator().setMetricName("r2")
  new TrainValidationSplit()
    .setEstimator(lr)
    .setEvaluator(r2Evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.8)
}

trainValidationSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_c73b0dfec311


**We can then treat this object as the new model and use fit on it.**

In [21]:
// Run the parameter search on the training set only; the test set stays untouched.
val model: org.apache.spark.ml.tuning.TrainValidationSplitModel =
  trainValidationSplit.fit(training)

2019-12-29 21:54:03 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-29 21:54:03 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
2019-12-29 21:54:03 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
2019-12-29 21:54:03 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


model: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_c73b0dfec311


## EVALUATION USING THE TEST DATA

### Make predictions on the test data. `model` uses the combination of parameters that performed best on the validation split.

In [22]:
// Score the held-out test rows and preview the first five, with the true label
// and the model's prediction side by side.
(model
  .transform(testing)
  .select("features", "label", "prediction")
  .show(5))

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[48735.9245124086...| 151527.0826265551|  375881.466199561|
|[40366.6162912572...|152071.87474956046|221510.06829972798|
|[50926.7766338627...|211017.97049475575| 473496.9527376839|
|[62173.5800990082...| 231189.8209898588| 301314.0310567827|
|[55048.3320238013...| 288708.9121479006|401393.64051124407|
+--------------------+------------------+------------------+
only showing top 5 rows



### Check out the validation metrics (one r2 score per parameter combination)

In [23]:
// One r2 score per entry of paramGrid, measured on the validation portion that
// TrainValidationSplit held out of `training`. These are not scores on `testing`.
model.validationMetrics

res8: Array[Double] = Array(0.9278218053023559, 0.9277540425917379)


### Closing Spark Session

In [24]:
// Shut down the Spark session now that the analysis is finished.
spark.stop()

## Thank You!