## Predicting the price of a house using a Linear Regression model, with One-Hot Encoding for categorical columns

In [1]:
# Initialize pyspark
# NOTE: findspark.init() runs before `import pyspark` on purpose; it is
# presumably what makes the local Spark installation importable, so keep
# this statement order.
import findspark
findspark.init()
import pyspark

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('homeprice')
    .getOrCreate()
)

In [3]:
# Import statements to setup ML
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors

In [4]:
# Load homeprices.csv into a DataFrame: the first row supplies the column
# names and the column types are inferred from the values
data = spark.read.csv(
    'homeprices.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first 4 rows of the loaded DataFrame
data.show(n=4)

+---------------+----+------+
|           town|area| price|
+---------------+----+------+
|monroe township|2600|550000|
|monroe township|3000|565000|
|monroe township|3200|610000|
|monroe township|3600|680000|
+---------------+----+------+
only showing top 4 rows



In [6]:
# Print the inferred schema (column names, types, nullability) to confirm
# that 'area' and 'price' were read as numeric columns
data.printSchema()

root
 |-- town: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)



In [7]:
# Number of rows per town
(
    data
    .groupBy('town')
    .count()
    .show()
)

+---------------+-----+
|           town|count|
+---------------+-----+
|monroe township|    5|
|   west windsor|    4|
|    robinsville|    4|
+---------------+-----+



**Converting the categorical column 'town' from string type to vector form**

In [8]:
# StringIndexer maps each distinct 'town' string to a numeric index ('townInd')
townIndexer = StringIndexer(
    inputCol='town',
    outputCol='townInd',
)

In [9]:
# OneHotEncoder turns the numeric town index into a vector column ('townVec')
townEncoder = OneHotEncoder(
    inputCol='townInd',
    outputCol='townVec',
)

In [10]:
# Combine the encoded town vector and the numeric area into the single
# 'features' vector column that the regression reads
assembler = VectorAssembler(
    inputCols=['townVec', 'area'],
    outputCol='features',
)

__Splitting the data into training and testing sets: the training data is used to fit the model, and the testing data is used to evaluate the fitted model__

In [11]:
# 70/30 train/test split. The seed is fixed so the split -- and every metric
# derived from it further down -- is the same after Restart & Run All.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [12]:
# Summary statistics (count, mean, stddev, min, max) for the training split
train_data.describe().show()

+-------+---------------+------------------+------------------+
|summary|           town|              area|             price|
+-------+---------------+------------------+------------------+
|  count|             10|                10|                10|
|   mean|           null|            3150.0|          627500.0|
| stddev|           null|467.26152562920623|58653.881182559235|
|    min|monroe township|              2600|            550000|
|    max|   west windsor|              4000|            725000|
+-------+---------------+------------------+------------------+



In [13]:
# Summary statistics (count, mean, stddev, min, max) for the test split
test_data.describe().show()

+-------+---------------+------------------+-----------------+
|summary|           town|              area|            price|
+-------+---------------+------------------+-----------------+
|  count|              3|                 3|                3|
|   mean|           null|3133.3333333333335|         635000.0|
| stddev|           null| 503.3222956847166|66143.78277661477|
|    min|monroe township|              2600|           585000|
|    max|   west windsor|              3600|           710000|
+-------+---------------+------------------+-----------------+



In [14]:
# Linear regression estimator: predicts 'price' from the 'features' vector
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

In [15]:
#Setting Up the Pipeline
from pyspark.ml import Pipeline

In [16]:
# Chain the stages in execution order: index -> one-hot encode -> assemble -> regress
pipeline_stages = [
    townIndexer,
    townEncoder,
    assembler,
    lr,
]
pipeline = Pipeline(stages=pipeline_stages)

In [17]:
# Fit every pipeline stage (indexer, encoder, assembler, regression) on the
# training split only; the result is the fitted pipeline model
model = pipeline.fit(train_data)

In [18]:
# Apply the fitted pipeline to the held-out test split; the result carries the
# intermediate columns (townInd, townVec, features) plus 'prediction'
results = model.transform(test_data)

In [19]:
# Inspect the columns added by the fitted pipeline
results.printSchema()

root
 |-- town: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- townInd: double (nullable = false)
 |-- townVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [20]:
# Compare the actual price with the predicted price for each test row
(
    results
    .select('town', 'area', 'price', 'prediction')
    .show()
)

+---------------+----+------+-----------------+
|           town|area| price|       prediction|
+---------------+----+------+-----------------+
|monroe township|3200|610000| 617314.049586777|
|   west windsor|2600|585000|575413.2231404963|
|   west windsor|3600|710000|702272.7272727268|
+---------------+----+------+-----------------+



## Evaluating the model

In [21]:
# Run the fitted pipeline over the full dataset to obtain the assembled
# 'features' column for every row.
# NOTE(review): the pipeline was fit on train_data only, and `data` includes
# those same rows -- confirm that re-using them below is intended.
output = model.transform(data)

In [22]:
# Row count of the transformed full dataset
output.count()

13

In [23]:
# Confirm the transformed frame carries the 'features' and 'prediction' columns
output.printSchema()

root
 |-- town: string (nullable = true)
 |-- area: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- townInd: double (nullable = false)
 |-- townVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [24]:
# Keep only the label ('price') and the assembled feature vector
output = output.select('price','features')

In [25]:
# 70/30 split of the (price, features) frame. The seed is fixed so the
# training summary and test metrics below are reproducible between runs.
train_data_2, test_data_2 = output.randomSplit([0.7, 0.3], seed=42)

In [26]:
# Second linear regression estimator, fitted directly on the pre-assembled
# 'features' column (label: 'price')
lr_2 = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

In [27]:
# Fit the standalone estimator on the second training split.
# Uses lr_2 (created just above for this purpose) rather than `lr`, the
# estimator that belongs to the pipeline; previously lr_2 was never used.
lrModel = lr_2.fit(train_data_2)

__Getting the training summary of the created model__

In [28]:
# Training summary of the fitted model; read below for residuals and metrics
training_summary = lrModel.summary

In [29]:
# First 3 residuals on the training data
training_summary.residuals.show(n=3)

+-------------------+
|          residuals|
+-------------------+
| 10631.188118812395|
|-23873.762376237544|
| 1881.1881188125117|
+-------------------+
only showing top 3 rows



In [30]:
# Fit metrics on the training data.
# R squared is a goodness-of-fit score (higher is better), not an error, so it
# is labelled "R Squared" rather than "R Squared Error".
print("Mean Absolute Error: ",training_summary.meanAbsoluteError)
print("Mean Squared Error: ",training_summary.meanSquaredError)
print("Root Mean Squared Error: ",training_summary.rootMeanSquaredError)
print("R Squared: ",training_summary.r2)

Mean Absolute Error:  10214.521452145353
Mean Squared Error:  156483773.3773375
Root Mean Squared Error:  12509.347440108037
R Squared Error:  0.9492586643572285


In [31]:
# Evaluate the fitted model on the held-out split; the result is read below
# for residuals and error metrics
test_results_2 = lrModel.evaluate(test_data_2)

In [32]:
# Show the fitted coefficients and the intercept
print(f'Coeffecients: {lrModel.coefficients}, Intercept: {lrModel.intercept}')

Coeffecients: [-40581.68316831663,-6831.6831683167375,123.76237623762493], Intercept: 258168.3168316794


In [33]:
# First 3 residuals on the held-out split
test_results_2.residuals.show(n=3)

+-------------------+
|          residuals|
+-------------------+
|  5049.504950495786|
|-10247.524752474972|
|           -15000.0|
+-------------------+
only showing top 3 rows



In [34]:
# Fit metrics on the held-out split.
# R squared is a goodness-of-fit score (higher is better), not an error, so it
# is labelled "R Squared" rather than "R Squared Error".

print("Mean Absolute Error: ",test_results_2.meanAbsoluteError)
print("Mean Squared Error: ",test_results_2.meanSquaredError)
print("Root Mean Squared Error: ",test_results_2.rootMeanSquaredError)
print("R Squared: ",test_results_2.r2)

Mean Absolute Error:  10665.222772277048
Mean Squared Error:  127093583.04332224
Root Mean Squared Error:  11273.57898110987
R Squared Error:  0.9577235482600176


__Getting predictions from the fitted model on data without the label column__

In [35]:
# Drop the label so that only the 'features' column remains
unlabelled_data = test_data_2.select('features')

In [36]:
# Score the unlabelled rows; the result has a 'prediction' column next to 'features'
predictions = lrModel.transform(unlabelled_data)

In [37]:
# Display up to 5 predicted prices alongside their feature vectors
predictions.show(n=5)

+----------------+-----------------+
|        features|       prediction|
+----------------+-----------------+
|[0.0,0.0,2600.0]|579950.4950495042|
|[0.0,1.0,2900.0]| 610247.524752475|
|[0.0,1.0,3100.0]|         635000.0|
|[1.0,0.0,4000.0]|712636.1386138626|
+----------------+-----------------+



In [None]:
# Stop the Spark session now that the notebook is finished
spark.stop()