# To examine the USA Housing dataset and to build a linear regression model which will predict the price of the house.

In [33]:
# Initialize pyspark.
# findspark.init() is called before `import pyspark`; presumably it makes the
# local Spark installation importable - confirm against the findspark docs.
import findspark
findspark.init()
import pyspark

In [34]:
# Create (or reuse an already running) Spark session named 'USA_Housing'
from pyspark.sql import SparkSession
session_builder = SparkSession.builder.appName('USA_Housing')
spark = session_builder.getOrCreate()

In [35]:
# Import statements to setup ML
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [36]:
# Read the USA Housing csv file with Spark: the first line is treated as the
# header and column types are inferred from the values
csv_reader = spark.read.option('header', True).option('inferSchema', True)
data = csv_reader.csv('USA_Housing.csv')

In [37]:
# Display the first row of the dataframe as a Row object
data.first()

Row(Avg Area Income=79545.45857431678, Avg Area House Age=5.682861321615587, Avg Area Number of Rooms=7.009188142792237, Avg Area Number of Bedrooms=4.09, Area Population=23086.800502686456, Price=1059033.5578701235)

In [38]:
# Print the schema (column names, types, nullability) that was inferred while reading the csv
data.printSchema()

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)



__Assembling the numeric columns into a single feature vector (string columns cannot be used as features) and converting the dataframe to an ML-acceptable format
             --->    i.e., ("label","features")__

In [39]:
# Numeric input columns that are packed into the single 'features' vector column
feature_columns = [
    'Avg Area Income',
    'Avg Area House Age',
    'Avg Area Number of Rooms',
    'Avg Area Number of Bedrooms',
    'Area Population',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [40]:
# Apply the assembler: the result keeps the original columns and adds the 'features' vector column
output = assembler.transform(data)

In [41]:
# Display the first assembled row to check the new 'features' column
output.first()

Row(Avg Area Income=79545.45857431678, Avg Area House Age=5.682861321615587, Avg Area Number of Rooms=7.009188142792237, Avg Area Number of Bedrooms=4.09, Area Population=23086.800502686456, Price=1059033.5578701235, features=DenseVector([79545.4586, 5.6829, 7.0092, 4.09, 23086.8005]))

In [42]:
# Keep only the label column ('Price') and the assembled feature vector
model_columns = ['Price', 'features']
final_data = output.select(model_columns)

In [43]:
# Show the first 3 rows without truncating the feature vectors
final_data.show(n=3, truncate=False)

+------------------+-------------------------------------------------------------------------------+
|Price             |features                                                                       |
+------------------+-------------------------------------------------------------------------------+
|1059033.5578701235|[79545.45857431678,5.682861321615587,7.009188142792237,4.09,23086.800502686456]|
|1505890.91484695  |[79248.64245482568,6.0028998082752425,6.730821019094919,3.09,40173.07217364482]|
|1058987.9878760849|[61287.067178656784,5.865889840310001,8.512727430375099,5.13,36882.15939970458]|
+------------------+-------------------------------------------------------------------------------+
only showing top 3 rows



#Splitting the resulting data into training data and testing data
#Training data is used to train the model
#Testing data is used to test the built model

In [44]:
# Split the total data into 70% training data and 30% testing data.
# A fixed seed keeps the split - and therefore every count and metric computed
# from it below - reproducible across "Restart & Run All".
# NOTE: outputs recorded before the seed was added came from an unseeded split;
# re-run the notebook to refresh them.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [45]:
# Summary statistics (count, mean, stddev, min, max) of the training split
train_stats = train_data.describe()
train_stats.show()

+-------+------------------+
|summary|             Price|
+-------+------------------+
|  count|              3526|
|   mean|1236498.3398118548|
| stddev| 351554.3120257839|
|    min|15938.657923287848|
|    max|2469065.5941747027|
+-------+------------------+



In [46]:
# Summary statistics (count, mean, stddev, min, max) of the testing split
test_stats = test_data.describe()
test_stats.show()

+-------+------------------+
|summary|             Price|
+-------+------------------+
|  count|              1474|
|   mean|1221485.8375408286|
| stddev| 356726.6971634834|
|    min|143027.36445248185|
|    max| 2330289.700683439|
+-------+------------------+



In [47]:
# Configure the (not yet fitted) linear regression estimator:
# it learns 'Price' from the assembled 'features' vector
lr = LinearRegression(
    featuresCol='features',
    labelCol='Price',
)

In [48]:
# Fit the linear regression estimator on the training split; fit() returns the trained model
lrModel = lr.fit(train_data)

__Getting the training summary of the created model__

In [49]:
# Summary of the fit on the training data; residuals and error metrics are read from it below
training_summary = lrModel.summary

In [50]:
# Peek at the first 3 residuals on the training data
training_residuals = training_summary.residuals
training_residuals.show(3)

+------------------+
|         residuals|
+------------------+
|-53419.05389178065|
|-76696.00407438986|
|-69976.39531648408|
+------------------+
only showing top 3 rows



In [51]:
# Report the metrics measured on the training data.
# Note: r2 is a goodness-of-fit score (closer to 1 is better), not an error,
# even though the printed label calls it one.
training_metrics = (
    ("Mean Absolute Error: ", training_summary.meanAbsoluteError),
    ("Mean Squared Error: ", training_summary.meanSquaredError),
    ("Root Mean Squared Error: ", training_summary.rootMeanSquaredError),
    ("R Squared Error: ", training_summary.r2),
)
for metric_label, metric_value in training_metrics:
    print(metric_label, metric_value)

Mean Absolute Error:  79602.59809459849
Mean Squared Error:  9852366846.36714
Root Mean Squared Error:  99259.08948991593
R Squared Error:  0.9202595095683607


In [52]:
# Evaluate the fitted model on the held-out test split; residuals and error
# metrics for the test data are read from the result below
test_results = lrModel.evaluate(test_data)

In [53]:
# Report the fitted coefficients (one per input feature) and the intercept
coefficients = lrModel.coefficients
intercept = lrModel.intercept
print('Coeffecients: {}, Intercept: {}'.format(coefficients, intercept))

Coeffecients: [21.419771182355095,165460.09053620198,122259.79058743107,1328.1781826869992,15.091809751001747], Intercept: -2631923.368746153


In [54]:
# Peek at the first 3 residuals on the test data
test_residuals = test_results.residuals
test_residuals.show(3)

+-------------------+
|          residuals|
+-------------------+
| 52175.880512381555|
| -76631.90844711327|
|-183476.56952701052|
+-------------------+
only showing top 3 rows



In [55]:
# Report the metrics measured on the test data.
# Note: r2 is a goodness-of-fit score (closer to 1 is better), not an error,
# even though the printed label calls it one.
test_metrics = (
    ("Mean Absolute Error: ", test_results.meanAbsoluteError),
    ("Mean Squared Error: ", test_results.meanSquaredError),
    ("Root Mean Squared Error: ", test_results.rootMeanSquaredError),
    ("R Squared Error: ", test_results.r2),
)
for metric_label, metric_value in test_metrics:
    print(metric_label, metric_value)

Mean Absolute Error:  85641.8175008853
Mean Squared Error:  11120702946.507998
Root Mean Squared Error:  105454.74359415036
R Squared Error:  0.9125508180589319


__Getting the predictions from the built model without the label column__

In [56]:
# Keep only the feature vectors (dropping the 'Price' label) to mimic unlabelled input
unlabelled_data = test_data.select('features')

In [57]:
# Score the unlabelled rows; the result has a 'prediction' column next to 'features'
predictions = lrModel.transform(unlabelled_data)

In [58]:
# Show the first 5 predictions (long feature vectors are truncated in the display)
predictions.show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[35963.3308090628...|  90851.4839401003|
|[40366.6162912572...|228703.78319667373|
|[48904.9832693168...|385374.65609950665|
|[52588.6836452133...| 483264.5965482895|
|[53562.4035410195...| 329356.9690686646|
+--------------------+------------------+
only showing top 5 rows



In [59]:
# Stop the Spark session created at the top of the notebook now that the analysis is finished
spark.stop()