## Predicting the price of a house using a Linear Regression model, with One-Hot Encoding for categorical columns

In [47]:
# Initialize pyspark.
# findspark.init() is called before `import pyspark` on purpose: keep this
# order. Presumably it makes the local Spark installation importable —
# confirm against the findspark documentation.
import findspark
findspark.init()
import pyspark

In [48]:
# Create (or reuse) the Spark session named 'homeprice' used by this notebook
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('homeprice')
    .getOrCreate()
)

In [49]:
# Import statements to setup ML
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors

In [50]:
# Load the home prices CSV: the first row is treated as the header and the
# column types are inferred from the data
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('homeprices.csv')
)

In [51]:
# Preview the first four rows of the dataframe
data.show(n=4)

+---------------+----+------+
|           town|area| price|
+---------------+----+------+
|monroe township|2600|550000|
|monroe township|3000|565000|
|monroe township|3200|610000|
|monroe township|3600|680000|
+---------------+----+------+
only showing top 4 rows



In [52]:
# Print the schema of the dataframe (column names, types and nullability).
# The column types were inferred when the CSV was read (inferSchema=True).
data.printSchema()

root
 |-- town: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)



In [53]:
# Count how many rows belong to each town
(
    data
    .groupBy('town')
    .count()
    .show()
)

+---------------+-----+
|           town|count|
+---------------+-----+
|monroe township|    5|
|   west windsor|    4|
|    robinsville|    4|
+---------------+-----+



**Converting the categorical column 'town' from String type to Vector form**

In [54]:
#StringIndexer maps the categorical string column 'town' to the numeric column 'townInd'
townIndexer = StringIndexer(
    outputCol='townInd',
    inputCol='town',
)

In [55]:
#OneHotEncoder turns the numeric category column 'townInd' into the vector column 'townVec'
townEncoder = OneHotEncoder(
    outputCol='townVec',
    inputCol='townInd',
)

In [56]:
#VectorAssembler combines the encoded town vector and the 'area' column
#into the single vector column "features"
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['townVec', 'area'],
)

__Splitting the data into training and testing sets: the training data is used to fit the model, and the testing data is used to evaluate the fitted model__

In [57]:
# Split the rows roughly 70/30 into a training set and a test set.
# A fixed seed keeps the split -- and therefore every statistic and metric
# computed from it -- reproducible across kernel restarts.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [58]:
# Summary statistics (count, mean, stddev, min, max) of the training split
train_stats = train_data.describe()
train_stats.show()

+-------+---------------+-----------------+-----------------+
|summary|           town|             area|            price|
+-------+---------------+-----------------+-----------------+
|  count|              8|                8|                8|
|   mean|           null|           3025.0|         621250.0|
| stddev|           null|436.7084676466506|51112.62075065218|
|    min|monroe township|             2600|           550000|
|    max|   west windsor|             3600|           695000|
+-------+---------------+-----------------+-----------------+



In [59]:
# Summary statistics (count, mean, stddev, min, max) of the test split
test_stats = test_data.describe()
test_stats.show()

+-------+---------------+-----------------+-----------------+
|summary|           town|             area|            price|
+-------+---------------+-----------------+-----------------+
|  count|              5|                5|                5|
|   mean|           null|           3340.0|         642000.0|
| stddev|           null|456.0701700396552|71116.10225539643|
|    min|monroe township|             2900|           565000|
|    max|   west windsor|             4000|           725000|
+-------+---------------+-----------------+-----------------+



In [60]:
#Linear regression estimator: learns 'price' from the assembled 'features' vector
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

In [61]:
#Setting Up the Pipeline
from pyspark.ml import Pipeline

In [62]:
# Chain the stages in the order they run: index -> encode -> assemble -> regress
pipeline = Pipeline(
    stages=[
        townIndexer,
        townEncoder,
        assembler,
        lr,
    ]
)

In [63]:
#Fitting the whole pipeline (indexer, encoder, assembler and linear regression)
#on the training set only; `model` is the resulting fitted pipeline.
model = pipeline.fit(train_data)

In [64]:
#Running the held-out test set through the fitted pipeline. The output keeps
#the input columns and adds townInd, townVec, features and prediction.
results = model.transform(test_data)

In [65]:
# Inspect the columns that the fitted pipeline added to the test data
results.printSchema()

root
 |-- town: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- townInd: double (nullable = false)
 |-- townVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [66]:
# Compare the actual prices with the predicted prices on the test set
(
    results
    .select('town', 'area', 'price', 'prediction')
    .show()
)

+---------------+----+------+-----------------+
|           town|area| price|       prediction|
+---------------+----+------+-----------------+
|monroe township|3000|565000|603253.9682539684|
|monroe township|3200|610000|626746.0317460317|
|monroe township|4000|725000|720714.2857142852|
|    robinsville|2900|600000|606507.9365079366|
|   west windsor|3600|710000|698888.8888888884|
+---------------+----+------+-----------------+



## Evaluating the model

In [67]:
# Keep only the label column and the assembled feature vector
output = results.select(['price', 'features'])

In [68]:
# Split the (price, features) rows roughly 70/30, with a fixed seed so the
# metrics computed from this split are reproducible across runs.
# NOTE(review): `output` comes from the transformed test set, so the model
# trained on this split sees only a handful of rows -- treat the resulting
# metrics as illustrative rather than as a reliable estimate.
train_data_2, test_data_2 = output.randomSplit([0.7, 0.3], seed=42)

In [69]:
#Separate linear regression estimator with the same label/feature columns as `lr`
lr_2 = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

In [70]:
# Fit the estimator created for this section (`lr_2`) on the new training
# split, rather than reusing `lr`, which is a stage of the pipeline above.
lrModel = lr_2.fit(train_data_2)

__Getting the training summary of the created model__

In [71]:
# Training summary of the fitted model (the model was fit on train_data_2)
training_summary = lrModel.summary

In [72]:
# Show the first three residuals from the training summary
training_summary.residuals.show(n=3)

+--------------------+
|           residuals|
+--------------------+
|-0.00398086046334...|
|0.002828869968652...|
|0.001151990378275...|
+--------------------+



In [73]:
# Error metrics and goodness of fit reported by the training summary.
# R squared is a goodness-of-fit score (closer to 1 is better), not an error,
# so it is labelled as such.
print("Mean Absolute Error: ",training_summary.meanAbsoluteError)
print("Mean Squared Error: ",training_summary.meanSquaredError)
print("Root Mean Squared Error: ",training_summary.rootMeanSquaredError)
print("R Squared: ",training_summary.r2)

Mean Absolute Error:  0.0026539069367572665
Mean Squared Error:  8.392279053265513e-06
Root Mean Squared Error:  0.002896943053162335
R Squared Error:  0.9999999999999978


In [74]:
# Evaluating the fitted model against the held-out split test_data_2;
# the returned summary exposes the residuals and metrics used below.
test_results_2 = lrModel.evaluate(test_data_2)

In [75]:
# Show the fitted coefficients and the intercept
print(
    'Coeffecients: {}, Intercept: {}'.format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

Coeffecients: [82374.99666620359,45437.492890427115,104.3749970015758], Intercept: 251875.01297613306


In [76]:
# Show up to three residuals from the test evaluation
test_results_2.residuals.show(n=3)

+------------------+
|         residuals|
+------------------+
|24124.996618824312|
|55624.999017563765|
+------------------+



In [77]:
# Evaluating the model on the test split: error metrics and goodness of fit.
# R squared is a goodness-of-fit score (closer to 1 is better), not an error,
# so it is labelled as such.

print("Mean Absolute Error: ",test_results_2.meanAbsoluteError)
print("Mean Squared Error: ",test_results_2.meanSquaredError)
print("Root Mean Squared Error: ",test_results_2.rootMeanSquaredError)
print("R Squared: ",test_results_2.r2)

Mean Absolute Error:  39874.99781819404
Mean Squared Error:  1838077988.7811272
Root Mean Squared Error:  42872.81176667944
R Squared Error:  0.4440595875142148


__Getting predictions from the fitted model on data without the label column__

In [78]:
# Drop the label: keep only the feature vector, as for genuinely unlabelled data
unlabelled_data = test_data_2.select(['features'])

In [79]:
# Apply the fitted model to the feature-only rows to obtain predictions
predictions = lrModel.transform(unlabelled_data)

In [80]:
# Display up to five predicted rows
predictions.show(n=5)

+----------------+-----------------+
|        features|       prediction|
+----------------+-----------------+
|[0.0,0.0,3200.0]|585875.0033811757|
|[0.0,0.0,4000.0]|669375.0009824362|
+----------------+-----------------+



In [None]:
#Closing the spark session; `spark` cannot be used by later cells after this
#call unless the session is created again.
spark.stop()