# Predicting the price of an house using Linear Regression Model and using One Hot Encoder for categorical columns

### Initialize and create a spark session

In [2]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("homeprice").getOrCreate()

2019-12-27 12:41:10 WARN  SparkSession$Builder:66 - Using an existing SparkSession; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7ef61506


### Initialize Logger

In [3]:
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML for Linear Regression

In [4]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.{StringIndexer,VectorAssembler,OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors


### Using Spark to read in the homeprices csv file

In [5]:
val data = spark.read.options(Map(("header","true"),("inferSchema","true"))).csv("homeprices.csv")

data: org.apache.spark.sql.DataFrame = [town: string, area: int ... 1 more field]


### Print the count of the Dataframe

In [6]:
data.count()

res1: Long = 13


### Print the count of the Dataframe by dropping duplicates

In [7]:
data.na.drop().count()

res2: Long = 13


### Printing the first 5 rows of the dataframe

In [8]:
for (row <- data.head(5)){
    println(row)
}

[monroe township,2600,550000]
[monroe township,3000,565000]
[monroe township,3200,610000]
[monroe township,3600,680000]
[monroe township,4000,725000]


### Show

In [9]:
data.show()

+---------------+----+------+
|           town|area| price|
+---------------+----+------+
|monroe township|2600|550000|
|monroe township|3000|565000|
|monroe township|3200|610000|
|monroe township|3600|680000|
|monroe township|4000|725000|
|   west windsor|2600|585000|
|   west windsor|2800|615000|
|   west windsor|3300|650000|
|   west windsor|3600|710000|
|    robinsville|2600|575000|
|    robinsville|2900|600000|
|    robinsville|3100|620000|
|    robinsville|3600|695000|
+---------------+----+------+



### Schema

In [10]:
data.printSchema()

root
 |-- town: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)



In [11]:
data.groupBy("town").count().show()

+---------------+-----+
|           town|count|
+---------------+-----+
|monroe township|    5|
|   west windsor|    4|
|    robinsville|    4|
+---------------+-----+



### Converting the categotical column `town` from String type to Vector form

#### Using String Indexer to convert categorical string columns to numerical type

In [12]:
val indexer = new StringIndexer().setInputCol("town").setOutputCol("townInd")

indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_64cc5c9186bb


In [13]:
val indexer_model = indexer.fit(data)

indexer_model: org.apache.spark.ml.feature.StringIndexerModel = strIdx_64cc5c9186bb


In [14]:
val indexer_df = indexer_model.transform(data)

indexer_df: org.apache.spark.sql.DataFrame = [town: string, area: int ... 2 more fields]


In [15]:
indexer_df.show()

+---------------+----+------+-------+
|           town|area| price|townInd|
+---------------+----+------+-------+
|monroe township|2600|550000|    0.0|
|monroe township|3000|565000|    0.0|
|monroe township|3200|610000|    0.0|
|monroe township|3600|680000|    0.0|
|monroe township|4000|725000|    0.0|
|   west windsor|2600|585000|    1.0|
|   west windsor|2800|615000|    1.0|
|   west windsor|3300|650000|    1.0|
|   west windsor|3600|710000|    1.0|
|    robinsville|2600|575000|    2.0|
|    robinsville|2900|600000|    2.0|
|    robinsville|3100|620000|    2.0|
|    robinsville|3600|695000|    2.0|
+---------------+----+------+-------+



#### Using One Hot Encoder to convert categorical numeric type columns to Vector type

In [16]:
val encoder = new OneHotEncoder().setInputCol("townInd").setOutputCol("townVec")

encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_112ccab3b4db


In [17]:
val encoder_df = encoder.transform(indexer_df)

encoder_df: org.apache.spark.sql.DataFrame = [town: string, area: int ... 3 more fields]


In [18]:
encoder_df.show()

+---------------+----+------+-------+-------------+
|           town|area| price|townInd|      townVec|
+---------------+----+------+-------+-------------+
|monroe township|2600|550000|    0.0|(2,[0],[1.0])|
|monroe township|3000|565000|    0.0|(2,[0],[1.0])|
|monroe township|3200|610000|    0.0|(2,[0],[1.0])|
|monroe township|3600|680000|    0.0|(2,[0],[1.0])|
|monroe township|4000|725000|    0.0|(2,[0],[1.0])|
|   west windsor|2600|585000|    1.0|(2,[1],[1.0])|
|   west windsor|2800|615000|    1.0|(2,[1],[1.0])|
|   west windsor|3300|650000|    1.0|(2,[1],[1.0])|
|   west windsor|3600|710000|    1.0|(2,[1],[1.0])|
|    robinsville|2600|575000|    2.0|    (2,[],[])|
|    robinsville|2900|600000|    2.0|    (2,[],[])|
|    robinsville|3100|620000|    2.0|    (2,[],[])|
|    robinsville|3600|695000|    2.0|    (2,[],[])|
+---------------+----+------+-------+-------------+



### Assembling all the features to a single vector column "features"

In [19]:
val assembler = new VectorAssembler().setInputCols(Array("townVec","area")).setOutputCol("features")

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_2ba78357b57a


In [20]:
val output = assembler.transform(encoder_df)

output: org.apache.spark.sql.DataFrame = [town: string, area: int ... 4 more fields]


In [21]:
output.show()

+---------------+----+------+-------+-------------+----------------+
|           town|area| price|townInd|      townVec|        features|
+---------------+----+------+-------+-------------+----------------+
|monroe township|2600|550000|    0.0|(2,[0],[1.0])|[1.0,0.0,2600.0]|
|monroe township|3000|565000|    0.0|(2,[0],[1.0])|[1.0,0.0,3000.0]|
|monroe township|3200|610000|    0.0|(2,[0],[1.0])|[1.0,0.0,3200.0]|
|monroe township|3600|680000|    0.0|(2,[0],[1.0])|[1.0,0.0,3600.0]|
|monroe township|4000|725000|    0.0|(2,[0],[1.0])|[1.0,0.0,4000.0]|
|   west windsor|2600|585000|    1.0|(2,[1],[1.0])|[0.0,1.0,2600.0]|
|   west windsor|2800|615000|    1.0|(2,[1],[1.0])|[0.0,1.0,2800.0]|
|   west windsor|3300|650000|    1.0|(2,[1],[1.0])|[0.0,1.0,3300.0]|
|   west windsor|3600|710000|    1.0|(2,[1],[1.0])|[0.0,1.0,3600.0]|
|    robinsville|2600|575000|    2.0|    (2,[],[])|[0.0,0.0,2600.0]|
|    robinsville|2900|600000|    2.0|    (2,[],[])|[0.0,0.0,2900.0]|
|    robinsville|3100|620000|    2

In [22]:
val final_data = output.select("price","features")

final_data: org.apache.spark.sql.DataFrame = [price: int, features: vector]


In [23]:
final_data.show()

+------+----------------+
| price|        features|
+------+----------------+
|550000|[1.0,0.0,2600.0]|
|565000|[1.0,0.0,3000.0]|
|610000|[1.0,0.0,3200.0]|
|680000|[1.0,0.0,3600.0]|
|725000|[1.0,0.0,4000.0]|
|585000|[0.0,1.0,2600.0]|
|615000|[0.0,1.0,2800.0]|
|650000|[0.0,1.0,3300.0]|
|710000|[0.0,1.0,3600.0]|
|575000|[0.0,0.0,2600.0]|
|600000|[0.0,0.0,2900.0]|
|620000|[0.0,0.0,3100.0]|
|695000|[0.0,0.0,3600.0]|
+------+----------------+



### Splitting the resultant data into training data and testing data,

<code>
<b>Training data is to train the model</b>
<b>Testing data is to test the builted model</b>
</code>

In [24]:
val Array(train_data,test_data) = final_data.randomSplit(Array(0.7,0.3))

train_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [price: int, features: vector]
test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [price: int, features: vector]


In [25]:
final_data.describe().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|                13|
|   mean| 629230.7692307692|
| stddev|57621.109913748696|
|    min|            550000|
|    max|            725000|
+-------+------------------+



In [26]:
train_data.describe().show()

+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|                8|
|   mean|         638750.0|
| stddev|52218.63378417436|
|    min|           575000|
|    max|           710000|
+-------+-----------------+



In [27]:
test_data.describe().show()

+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|                5|
|   mean|         614000.0|
| stddev|68684.05928597989|
|    min|           550000|
|    max|           725000|
+-------+-----------------+



### Creating a linear regression model object

In [28]:
val lr = new LinearRegression().setLabelCol("price").setFeaturesCol("features")

lr: org.apache.spark.ml.regression.LinearRegression = linReg_18a6c5262406


### Creating a linear regression model and fitting the training data to it

In [29]:
val lrModel = lr.fit(train_data)

2019-12-27 12:44:24 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-27 12:44:24 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
2019-12-27 12:44:24 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
2019-12-27 12:44:24 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


lrModel: org.apache.spark.ml.regression.LinearRegressionModel = linReg_18a6c5262406


### Getting the training summary of the created model

In [31]:
val trainingSummary = lrModel.summary

trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@37ce4964


In [32]:
trainingSummary.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 2978.3393501781393|
| 1245.4873646186898|
| -7545.126353791449|
|  7563.176895305514|
|-16642.599277977366|
|                0.0|
|  4566.787003612495|
| 7833.9350180530455|
+-------------------+



### Errors

In [33]:
println(s"Mean Absolute Error: ${trainingSummary.meanAbsoluteError}")
println(s"Mean Squared Error: ${trainingSummary.meanSquaredError}")
println(s"Root Mean Squared Error: ${trainingSummary.rootMeanSquaredError}")
println(s"R Squared Error: ${trainingSummary.r2}")

Mean Absolute Error: 6046.931407942087
Mean Squared Error: 6.046931407942254E7
Root Mean Squared Error: 7776.201777180331
R Squared Error: 0.9746559521867515


### Evaluating the model against test data

In [34]:
val test_results = lrModel.evaluate(test_data)

test_results: org.apache.spark.ml.regression.LinearRegressionSummary = org.apache.spark.ml.regression.LinearRegressionSummary@44153afe


### Getting the co-effecients and intercept

In [35]:
println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

Coefficients: [-10433.212996387569,11732.851985559524,118.41155234656571]
Intercept: 264151.62454875093


### Getting the residuals

In [38]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -11588.44765343424|
| -43953.06859206059|
| -22635.37906137365|
|-11227.436823104625|
|-2364.6209386262344|
+-------------------+



### Evaluating the model by checking the different types of error

In [39]:
println(s"Mean Absolute Error: ${test_results.meanAbsoluteError}")
println(s"Mean Squared Error: ${test_results.meanSquaredError}")
println(s"Root Mean Squared Error: ${test_results.rootMeanSquaredError}")
println(s"R Squared Error: ${test_results.r2}")

Mean Absolute Error: 18353.790613719866
Mean Squared Error: 5.420343025454074E8
Root Mean Squared Error: 23281.630152233916
R Squared Error: 0.8563767083875444


### Getting the predictions from the builted model without label column

In [40]:
val unlabelled_data = test_data.select("features")

unlabelled_data: org.apache.spark.sql.DataFrame = [features: vector]


In [41]:
val predictions = lrModel.transform(unlabelled_data)

predictions: org.apache.spark.sql.DataFrame = [features: vector, prediction: double]


In [42]:
predictions.show()

+----------------+-----------------+
|        features|       prediction|
+----------------+-----------------+
|[1.0,0.0,2600.0]|561588.4476534342|
|[1.0,0.0,3000.0]|608953.0685920606|
|[1.0,0.0,3200.0]|632635.3790613736|
|[0.0,0.0,3100.0]|631227.4368231046|
|[1.0,0.0,4000.0]|727364.6209386262|
+----------------+-----------------+



### Stopping the created spark session

In [43]:
spark.stop()

## Thank You!