In [58]:
# Initialize pyspark
import findspark
findspark.init()
import pyspark

In [59]:
# Initialize and create ba spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('carprice').getOrCreate()

In [60]:
# Import statements to setup ML
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors

In [61]:
# Using Spark to read in the car prices csv file
data = spark.read.csv('carprices.csv', header=True, inferSchema=True)

In [62]:
# Printing the first few rows of the dataframe
data.show(4)

+---------+-------+-------------+--------+
|Car Model|Mileage|Sell Price($)|Age(yrs)|
+---------+-------+-------------+--------+
|   BMW X5|  69000|        18000|       6|
|   BMW X5|  35000|        34000|       3|
|   BMW X5|  57000|        26100|       5|
|   BMW X5|  22500|        40000|       2|
+---------+-------+-------------+--------+
only showing top 4 rows



In [63]:
# Printing the schema of the dataframe
data.printSchema()

root
 |-- Car Model: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Sell Price($): integer (nullable = true)
 |-- Age(yrs): integer (nullable = true)



In [64]:
data.groupBy('Car Model').count().show(truncate=False)

+---------------------+-----+
|Car Model            |count|
+---------------------+-----+
|BMW X5               |5    |
|Audi A5              |4    |
|Mercedez Benz C class|4    |
+---------------------+-----+



**Converting the categotical column 'town' from String type to Vector form**

In [65]:
#Using String Indexer to convert categorical string columns to numerical type
carModelIndexer = StringIndexer(inputCol='Car Model', outputCol='carInd')

In [66]:
#Using One Hot Encoder to convert categorical numeric type columns to Vector type
carModelEncoder = OneHotEncoder(inputCol='carInd', outputCol='carVec')

In [67]:
#Assembling all the features to a single vector column "features"
assembler = VectorAssembler(inputCols=['Mileage','Age(yrs)','carVec'], outputCol='features')

__Splitting the resultant data into training data and testing data, Training data is to train the model, Testing data is to test the builted model__

In [68]:
train_data,test_data = data.randomSplit([0.7,0.3])

In [69]:
train_data.describe().show()

+-------+--------------------+------------------+-----------------+------------------+
|summary|           Car Model|           Mileage|    Sell Price($)|          Age(yrs)|
+-------+--------------------+------------------+-----------------+------------------+
|  count|                  11|                11|               11|                11|
|   mean|                null|60681.818181818184|26381.81818181818|5.2727272727272725|
| stddev|                null|20890.515464287528|8666.581584663953|1.7939291563999449|
|    min|             Audi A5|             22500|            12000|                 2|
|    max|Mercedez Benz C c...|             91000|            40000|                 8|
+-------+--------------------+------------------+-----------------+------------------+



In [70]:
test_data.describe().show()

+-------+--------------------+-----------------+-----------------+------------------+
|summary|           Car Model|          Mileage|    Sell Price($)|          Age(yrs)|
+-------+--------------------+-----------------+-----------------+------------------+
|  count|                   2|                2|                2|                 2|
|   mean|                null|          62000.0|          24050.0|               5.5|
| stddev|                null|7071.067811865475|2899.137802864845|0.7071067811865476|
|    min|              BMW X5|            57000|            22000|                 5|
|    max|Mercedez Benz C c...|            67000|            26100|                 6|
+-------+--------------------+-----------------+-----------------+------------------+



In [71]:
#Creating a Linear regression model object
lr = LinearRegression(labelCol='Sell Price($)', featuresCol='features')

In [72]:
#Setting Up the Pipeline
from pyspark.ml import Pipeline

In [73]:
pipeline = Pipeline(stages=[carModelIndexer,carModelEncoder,assembler,lr])

In [74]:
#Fitting the pipeline to training set.
model = pipeline.fit(train_data)

In [75]:
#Getting Results on Test Set
results = model.transform(test_data)

In [76]:
results.printSchema()

root
 |-- Car Model: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Sell Price($): integer (nullable = true)
 |-- Age(yrs): integer (nullable = true)
 |-- carInd: double (nullable = false)
 |-- carVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [77]:
results.select('Car Model','Mileage','Age(yrs)','Sell Price($)','prediction').show(truncate=False)

+---------------------+-------+--------+-------------+------------------+
|Car Model            |Mileage|Age(yrs)|Sell Price($)|prediction        |
+---------------------+-------+--------+-------------+------------------+
|BMW X5               |57000  |5       |26100        |23953.277215549155|
|Mercedez Benz C class|67000  |6       |22000        |28244.675115861515|
+---------------------+-------+--------+-------------+------------------+



### Evaluating the model

In [78]:
output = model.transform(data)

In [79]:
output.count()

13

In [80]:
output.printSchema()

root
 |-- Car Model: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Sell Price($): integer (nullable = true)
 |-- Age(yrs): integer (nullable = true)
 |-- carInd: double (nullable = false)
 |-- carVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [81]:
output = output.select('Sell Price($)','features')

In [82]:
output.count()

13

In [83]:
train_data_2,test_data_2 = output.randomSplit([0.7,0.3])

In [84]:
train_data_2.count()

11

In [85]:
test_data_2.count()

2

In [86]:
#Creating a linear regression model object
lr_2 = LinearRegression(labelCol='Sell Price($)', featuresCol='features')

In [87]:
# Creating a linear regression model and fitting the training data to it
lrModel = lr.fit(train_data_2)

**Getting the training summary of the created model**

In [88]:
training_summary = lrModel.summary

In [89]:
training_summary.residuals.show(3)

+------------------+
|         residuals|
+------------------+
|146.67397260262806|
|-358.8821917808309|
|-2544.980821917001|
+------------------+
only showing top 3 rows



In [90]:
print("Mean Absolute Error: ",training_summary.meanAbsoluteError)
print("Mean Squared Error: ",training_summary.meanSquaredError)
print("Root Mean Squared Error: ",training_summary.rootMeanSquaredError)
print("R Squared Error: ",training_summary.r2)

Mean Absolute Error:  1561.2234122041548
Mean Squared Error:  3656182.665006235
Root Mean Squared Error:  1912.1147102112454
R Squared Error:  0.920978043364779


**Evaluating the model against test data**

In [91]:
# Evaluating the model against test data
test_results_2 = lrModel.evaluate(test_data_2)

In [92]:
# Getting the co-effecients and intercept
print('Coeffecients: {}, Intercept: {}'.format(lrModel.coefficients,lrModel.intercept))

Coeffecients: [-0.380005479452284,-1385.7753424631144,-2501.4630136988053,-7127.578082191823], Intercept: 60021.49041095894


In [93]:
# Getting the residuals
test_results_2.residuals.show(3)

+-------------------+
|          residuals|
+-------------------+
| 1629.4410958904045|
|-1572.2383561644965|
+-------------------+



In [94]:
# Evaluating the model by checking the different types of error

print("Mean Absolute Error: ",test_results_2.meanAbsoluteError)
print("Mean Squared Error: ",test_results_2.meanSquaredError)
print("Root Mean Squared Error: ",test_results_2.rootMeanSquaredError)
print("R Squared Error: ",test_results_2.r2)

Mean Absolute Error:  1600.8397260274505
Mean Squared Error:  2563505.86678568
Root Mean Squared Error:  1601.0952085324845
R Squared Error:  0.8580757997627305


**Getting the predictions from the builted model without label column**

In [95]:
unlabelled_data = test_data_2.select('features')

In [96]:
predictions = lrModel.transform(unlabelled_data)

In [97]:
predictions.show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[46000.0,4.0,0.0,...|29870.558904109595|
|[22500.0,2.0,0.0,...|  41572.2383561645|
+--------------------+------------------+



In [None]:
#Closing spark session
spark.stop()