### Initialize and create a spark session

In [2]:
import org.apache.spark.sql.SparkSession
// Obtain the SparkSession for this notebook, asking for the application name "carprice".
// getOrCreate() hands back an already-running session when there is one; the WARN logged
// for this cell shows that happened here, so the requested name may not have taken effect.
val spark = SparkSession
  .builder()
  .appName("carprice")
  .getOrCreate()

2019-12-27 13:32:55 WARN  SparkSession$Builder:66 - Using an existing SparkSession; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5d745659


### Initialize Logger

In [3]:
import org.apache.log4j._
// Set the log4j logger named "org" to ERROR level.
// Presumably this is meant to quiet Spark's own (org.apache.*) INFO/WARN chatter; loggers
// outside "org" still emit warnings, as the BLAS/LAPACK lines printed during fitting show.
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML for Linear Regression

In [4]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.{StringIndexer,VectorAssembler,OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors


### Using Spark to read in the carprices csv file

In [5]:
// Load carprices.csv into a DataFrame.
//   header      -> the first line supplies the column names
//   inferSchema -> column types are inferred from the values instead of defaulting to string
val data = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("carprices.csv")

data: org.apache.spark.sql.DataFrame = [Car Model: string, Mileage: int ... 2 more fields]


### Print the count of the Dataframe

In [6]:
// Total number of rows loaded from the CSV.
data.count()

res1: Long = 13


### Print the count of the Dataframe after dropping rows with null values

In [7]:
// Count the rows that remain once rows containing null values are dropped (na.drop()).
// This does not remove duplicate rows. The recorded result equals the plain row count,
// so no row in this dataset was dropped.
data.na.drop().count()

res2: Long = 13


### Printing the first 5 rows of the dataframe

In [8]:
// Fetch the first five rows to the driver and print each Row on its own line.
data.head(5).foreach(println)

[BMW X5,69000,18000,6]
[BMW X5,35000,34000,3]
[BMW X5,57000,26100,5]
[BMW X5,22500,40000,2]
[BMW X5,46000,31500,4]


### Show

In [11]:
// Display the DataFrame as a table; passing false turns off truncation of long cell values,
// so the full car-model names are printed.
data.show(false)

+---------------------+-------+-------------+--------+
|Car Model            |Mileage|Sell Price($)|Age(yrs)|
+---------------------+-------+-------------+--------+
|BMW X5               |69000  |18000        |6       |
|BMW X5               |35000  |34000        |3       |
|BMW X5               |57000  |26100        |5       |
|BMW X5               |22500  |40000        |2       |
|BMW X5               |46000  |31500        |4       |
|Audi A5              |59000  |29400        |5       |
|Audi A5              |52000  |32000        |5       |
|Audi A5              |72000  |19300        |6       |
|Audi A5              |91000  |12000        |8       |
|Mercedez Benz C class|67000  |22000        |6       |
|Mercedez Benz C class|83000  |20000        |7       |
|Mercedez Benz C class|79000  |21000        |7       |
|Mercedez Benz C class|59000  |33000        |5       |
+---------------------+-------+-------------+--------+



### Schema

In [12]:
// Print column names, inferred types and nullability.
data.printSchema

root
 |-- Car Model: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Sell Price($): integer (nullable = true)
 |-- Age(yrs): integer (nullable = true)



In [14]:
// Number of rows per distinct car model, shown without truncating the model names.
data
  .groupBy("Car Model")
  .count()
  .show(false)

+---------------------+-----+
|Car Model            |count|
+---------------------+-----+
|BMW X5               |5    |
|Audi A5              |4    |
|Mercedez Benz C class|4    |
+---------------------+-----+



### Converting the categorical column `Car Model` from String type to Vector form

#### Using String Indexer to convert categorical string columns to numerical type

In [16]:
// Stage 1: map each distinct "Car Model" string to a numeric index stored in "Car_Model_Ind".
// Nothing is computed here; the mapping is learned when the pipeline is fitted.
val indexer = new StringIndexer()
  .setInputCol("Car Model")
  .setOutputCol("Car_Model_Ind")

indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_ba5f8db788f3


#### Using One Hot Encoder to convert categorical numeric type columns to Vector type

In [17]:
// Stage 2: turn the numeric category index into a one-hot vector column "Car_Model_Vec".
// In the feature vectors printed later, the three car models occupy two slots and one
// model appears as all zeros.
val encoder = new OneHotEncoder()
  .setInputCol("Car_Model_Ind")
  .setOutputCol("Car_Model_Vec")

encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_deb047784794


### Assembling all the features to a single vector column "features"

In [18]:
// Stage 3: concatenate the numeric predictors and the encoded car model into the single
// "features" vector column consumed by the regressor. The input order listed here is the
// order of the entries in that vector.
val assembler = new VectorAssembler()
  .setInputCols(Array("Mileage", "Age(yrs)", "Car_Model_Vec"))
  .setOutputCol("features")

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_5056228c6111


### Splitting the data into training data and testing data

<code>
<b>Training data is to train the model</b>
<b>Testing data is to test the built model</b>
</code>

In [19]:
// Randomly partition the rows into roughly 70% training data and 30% test data.
// The weights are targets, not guarantees: on a dataset this small the actual sizes vary.
// A fixed seed makes the partition repeatable. Without one, every execution of this cell
// produced a different split, so the fitted model and all metrics below could not be
// reproduced.
// NOTE: outputs recorded further down came from an earlier unseeded run and will differ
// until the notebook is re-executed.
val Array(train_data, test_data) = data.randomSplit(Array(0.7, 0.3), seed = 42L)

train_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Car Model: string, Mileage: int ... 2 more fields]
test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Car Model: string, Mileage: int ... 2 more fields]


In [21]:
// Summary statistics (count, mean, stddev, min, max) for every column of the full dataset.
data.describe().show()

+-------+--------------------+------------------+------------------+------------------+
|summary|           Car Model|           Mileage|     Sell Price($)|          Age(yrs)|
+-------+--------------------+------------------+------------------+------------------+
|  count|                  13|                13|                13|                13|
|   mean|                null| 60884.61538461538|26023.076923076922|5.3076923076923075|
| stddev|                null|19185.665054663692| 8003.661021282057|1.6525039276108333|
|    min|             Audi A5|             22500|             12000|                 2|
|    max|Mercedez Benz C c...|             91000|             40000|                 8|
+-------+--------------------+------------------+------------------+------------------+



In [22]:
// Same summary statistics for the training split, to compare against the full dataset.
train_data.describe().show()

+-------+--------------------+------------------+-----------------+-----------------+
|summary|           Car Model|           Mileage|    Sell Price($)|         Age(yrs)|
+-------+--------------------+------------------+-----------------+-----------------+
|  count|                   8|                 8|                8|                8|
|   mean|                null|           58062.5|          26475.0|              5.0|
| stddev|                null|21374.947786068224|9438.939105035663|1.851640199545103|
|    min|             Audi A5|             22500|            12000|                2|
|    max|Mercedez Benz C c...|             91000|            40000|                8|
+-------+--------------------+------------------+-----------------+-----------------+



In [23]:
// Same summary statistics for the test split, to compare against the full dataset.
test_data.describe().show()

+-------+--------------------+----------------+-----------------+------------------+
|summary|           Car Model|         Mileage|    Sell Price($)|          Age(yrs)|
+-------+--------------------+----------------+-----------------+------------------+
|  count|                   5|               5|                5|                 5|
|   mean|                null|         65400.0|          25300.0|               5.8|
| stddev|                null|16226.5215003093|5932.958789676531|1.3038404810405297|
|    min|             Audi A5|           46000|            20000|                 4|
|    max|Mercedez Benz C c...|           83000|            32000|                 7|
+-------+--------------------+----------------+-----------------+------------------+



### Creating a Linear regression model object

In [24]:
// Final stage: a linear regression that predicts "Sell Price($)" from the assembled
// "features" vector. This only configures the estimator; nothing is trained yet.
val lr = new LinearRegression()
  .setLabelCol("Sell Price($)")
  .setFeaturesCol("features")

lr: org.apache.spark.ml.regression.LinearRegression = linReg_3f3b311f0053


### Setting Up the Pipeline

In [25]:
import org.apache.spark.ml.Pipeline

import org.apache.spark.ml.Pipeline


In [26]:
// Chain feature preparation and the regressor into one Pipeline.
// Stage order matters: index the car model, one-hot encode the index, assemble the
// feature vector, then fit the linear regression on it.
val pipeline = new Pipeline()
  .setStages(Array(indexer, encoder, assembler, lr))

pipeline: org.apache.spark.ml.Pipeline = pipeline_6ce4c9b6893a


#### Fitting the pipeline to training set.

In [27]:
// Fit all pipeline stages on the training split only, producing a reusable PipelineModel.
// NOTE(review): the category-to-index mapping is therefore learned from train_data alone;
// confirm how a car model that appears only in other data would be handled.
val pipelineModel = pipeline.fit(train_data)

2019-12-27 14:05:28 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-27 14:05:28 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
2019-12-27 14:05:28 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
2019-12-27 14:05:28 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


pipelineModel: org.apache.spark.ml.PipelineModel = pipeline_6ce4c9b6893a


### Getting Results on Test Set

In [28]:
// Run the fitted pipeline over the held-out test split; the result keeps the input columns
// and adds the intermediate feature columns plus a "prediction" column.
val results = pipelineModel.transform(test_data)

results: org.apache.spark.sql.DataFrame = [Car Model: string, Mileage: int ... 6 more fields]


In [31]:
// Show the feature vector, the actual price and the predicted price side by side,
// without truncating the vector column.
results
  .select("features", "Sell Price($)", "prediction")
  .show(false)

+---------------------+-------------+------------------+
|features             |Sell Price($)|prediction        |
+---------------------+-------------+------------------+
|[52000.0,5.0,0.0,1.0]|32000        |33799.768697000574|
|[46000.0,4.0,1.0,0.0]|31500        |29410.10023130288 |
|[67000.0,6.0,0.0,0.0]|22000        |30638.357748654154|
|[79000.0,7.0,0.0,0.0]|21000        |24599.92289900026 |
|[83000.0,7.0,0.0,0.0]|20000        |20923.130300692224|
+---------------------+-------------+------------------+



### Evaluating the model

In [37]:
// Run the fitted pipeline over the full dataset (training and test rows alike) to obtain
// the assembled "features" column for every row.
// Declared as var because a later cell reassigns it to a two-column projection.
var output = pipelineModel.transform(data)

output: org.apache.spark.sql.DataFrame = [Car Model: string, Mileage: int ... 6 more fields]


In [38]:
// Sanity check: the transformed DataFrame should still contain every input row.
output.count()

res21: Long = 13


In [39]:
// Inspect the columns added by the pipeline stages (index, one-hot vector, features, prediction).
output.printSchema

root
 |-- Car Model: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Sell Price($): integer (nullable = true)
 |-- Age(yrs): integer (nullable = true)
 |-- Car_Model_Ind: double (nullable = false)
 |-- Car_Model_Vec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [40]:
// Keep only the label and the assembled feature vector; the intermediate columns and the
// pipeline's own prediction are discarded so a standalone regression can be fitted.
output = output.select("Sell Price($)","features")

output: org.apache.spark.sql.DataFrame = [Sell Price($): int, features: vector]


In [41]:
// Randomly partition the label/features rows into roughly 70% training and 30% test data.
// This split is drawn independently of the earlier train_data/test_data split.
// A fixed seed makes the partition repeatable. Without one, every execution of this cell
// produced a different split, so the standalone model's coefficients and metrics could not
// be reproduced.
// NOTE: outputs recorded further down came from an earlier unseeded run and will differ
// until the notebook is re-executed.
val Array(train_data_2, test_data_2) = output.randomSplit(Array(0.7, 0.3), seed = 42L)

train_data_2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Sell Price($): int, features: vector]
test_data_2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Sell Price($): int, features: vector]


In [42]:
// Summary statistics of the label column for the full projected dataset
// (the recorded output contains no statistics for the vector column).
output.describe().show()

+-------+------------------+
|summary|     Sell Price($)|
+-------+------------------+
|  count|                13|
|   mean|26023.076923076922|
| stddev| 8003.661021282057|
|    min|             12000|
|    max|             40000|
+-------+------------------+



In [43]:
// Label statistics for the second training split.
train_data_2.describe().show()

+-------+-----------------+
|summary|    Sell Price($)|
+-------+-----------------+
|  count|               10|
|   mean|          25920.0|
| stddev|8715.477930415265|
|    min|            12000|
|    max|            40000|
+-------+-----------------+



In [44]:
// Label statistics for the second test split.
test_data_2.describe().show()

+-------+------------------+
|summary|     Sell Price($)|
+-------+------------------+
|  count|                 3|
|   mean|26366.666666666668|
| stddev| 6504.101270224299|
|    min|             20000|
|    max|             33000|
+-------+------------------+



### Creating a linear regression model object

In [45]:
// Standalone linear regression (outside the pipeline) with the same label and feature
// columns as the pipeline's regressor; fitted directly on the pre-assembled features.
val lr_2 = new LinearRegression()
  .setLabelCol("Sell Price($)")
  .setFeaturesCol("features")

lr_2: org.apache.spark.ml.regression.LinearRegression = linReg_9fe28b98cee8


### Creating a linear regression model and fitting the training data to it

In [46]:
// Train the standalone regression on the second training split.
val lrModel = lr_2.fit(train_data_2)

lrModel: org.apache.spark.ml.regression.LinearRegressionModel = linReg_9fe28b98cee8


### Getting the training summary of the created model

In [49]:
// Summary of the fit on the training data (residuals and error metrics are read from it below).
val trainingSummary = lrModel.summary

trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@7f6fae47


In [50]:
// Per-row residuals of the model on the training data.
trainingSummary.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -594.1238001475867|
| -599.0155057838274|
|-1557.2298793995215|
|  2393.551562884517|
|-2393.5515628846406|
|  2093.446960374051|
| 1988.9982771351388|
|  57.90671917310101|
| -635.8848141767448|
| -754.0979571745411|
+-------------------+



### Errors

In [51]:
// Report the training-set fit metrics, one per line.
// NOTE(review): r2 is the coefficient of determination (closer to 1 means a better fit),
// not an error measure, even though the printed label calls it "R Squared Error".
Seq(
  s"Mean Absolute Error: ${trainingSummary.meanAbsoluteError}",
  s"Mean Squared Error: ${trainingSummary.meanSquaredError}",
  s"Root Mean Squared Error: ${trainingSummary.rootMeanSquaredError}",
  s"R Squared Error: ${trainingSummary.r2}"
).foreach(println)

Mean Absolute Error: 1306.780703913367
Mean Squared Error: 2390994.646812703
Root Mean Squared Error: 1546.2841416805331
R Squared Error: 0.9650253256584981


### Evaluating the model against test data

In [52]:
// Score the trained model on the held-out second test split; the returned summary exposes
// the same residuals and metrics as the training summary.
val test_results_2 = lrModel.evaluate(test_data_2)

test_results_2: org.apache.spark.ml.regression.LinearRegressionSummary = org.apache.spark.ml.regression.LinearRegressionSummary@1502f5d7


### Getting the coefficients and intercept

In [53]:
// Print the learned weights and the intercept.
// Presumably the coefficients follow the order of the assembled feature vector:
// Mileage, Age(yrs), then the one-hot car-model slots — verify against the assembler.
println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")

Coefficients: [-0.6622200344572781,2159.537287718177,-4470.095988186261,-225.221511198732]
Intercept: 55805.070145213205


### Getting the residuals

In [54]:
// Per-row residuals of the model on the test data.
test_results_2.residuals.show()

+------------------+
|         residuals|
+------------------+
| 4042.431700713627|
|1713.8813684470188|
| 5468.225449175319|
+------------------+



### Evaluating the model by checking the different types of error

In [55]:
// Report the test-set metrics, one per line, for comparison with the training metrics.
// NOTE(review): r2 is the coefficient of determination (closer to 1 means a better fit),
// not an error measure, even though the printed label calls it "R Squared Error".
Seq(
  s"Mean Absolute Error: ${test_results_2.meanAbsoluteError}",
  s"Mean Squared Error: ${test_results_2.meanSquaredError}",
  s"Root Mean Squared Error: ${test_results_2.rootMeanSquaredError}",
  s"R Squared Error: ${test_results_2.r2}"
).foreach(println)

Mean Absolute Error: 3741.512839445322
Mean Squared Error: 1.6393377654350972E7
Root Mean Squared Error: 4048.873627856391
R Squared Error: 0.4187203573825594


### Getting the predictions from the built model without label column

In [56]:
// Drop the label so only the feature vectors remain, mimicking unseen data to be scored.
val unlabelled_data = test_data_2.select("features")

unlabelled_data: org.apache.spark.sql.DataFrame = [features: vector]


In [57]:
// Score the unlabelled feature vectors; the result adds a "prediction" column.
val predictions = lrModel.transform(unlabelled_data)

predictions: org.apache.spark.sql.DataFrame = [features: vector, prediction: double]


In [59]:
// Display each feature vector with its predicted price, without truncation.
predictions.show(false)

+---------------------+------------------+
|features             |prediction        |
+---------------------+------------------+
|[83000.0,7.0,0.0,0.0]|15957.568299286373|
|[57000.0,5.0,1.0,0.0]|24386.11863155298 |
|[59000.0,5.0,0.0,0.0]|27531.77455082468 |
+---------------------+------------------+



### Stopping the created spark session

In [60]:
// Shut down the Spark session now that the analysis is finished.
spark.stop()

## Thank You!