<a href="https://colab.research.google.com/github/Prajwal718/Car-Price-Predictor/blob/main/Car_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
!pip install pyspark
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import when,lit,count,isnan,col
from pyspark.ml.feature import VectorAssembler




In [42]:
from pyspark import SparkContext, SparkConf
sc =SparkSession.builder.master("local[*]").getOrCreate()


In [43]:
data = sc.read.csv('/content/data.csv', inferSchema = True, header = True)
data.show(5)
data.printSchema()

+----+----------+----+--------------------+---------+----------------+-----------------+----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
|Make|     Model|Year|    Engine Fuel Type|Engine HP|Engine Cylinders|Transmission Type|   Driven_Wheels|Number of Doors|     Market Category|Vehicle Size|Vehicle Style|highway MPG|city mpg|Popularity| MSRP|
+----+----------+----+--------------------+---------+----------------+-----------------+----------------+---------------+--------------------+------------+-------------+-----------+--------+----------+-----+
| BMW|1 Series M|2011|premium unleaded ...|      335|               6|           MANUAL|rear wheel drive|              2|Factory Tuner,Lux...|     Compact|        Coupe|         26|      19|      3916|46135|
| BMW|  1 Series|2011|premium unleaded ...|      300|               6|           MANUAL|rear wheel drive|              2|  Luxury,Performance|     Compact|  Convertible

In [44]:
data.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Make,11914,,,Acura,Volvo
Model,11914,745.5822222222222,1490.8280590623795,1 Series,xD
Year,11914,2010.384337753903,7.5797398875957995,1990,2017
Engine Fuel Type,11911,,,diesel,regular unleaded
Engine HP,11845,249.38607007176023,109.19187025917194,55,1001
Engine Cylinders,11884,5.628828677213059,1.78055934824622,0,16
Transmission Type,11914,,,AUTOMATED_MANUAL,UNKNOWN
Driven_Wheels,11914,,,all wheel drive,rear wheel drive
Number of Doors,11908,3.4360933825999327,0.8813153865835529,2,4


In [45]:
def replace(column, value): return when(column!=value,column).otherwise(lit(None))
data.select([count(when(isnan(c)|col(c).isNull(),c)).alias(c) for c in data.columns]).show()

+----+-----+----+----------------+---------+----------------+-----------------+-------------+---------------+---------------+------------+-------------+-----------+--------+----------+----+
|Make|Model|Year|Engine Fuel Type|Engine HP|Engine Cylinders|Transmission Type|Driven_Wheels|Number of Doors|Market Category|Vehicle Size|Vehicle Style|highway MPG|city mpg|Popularity|MSRP|
+----+-----+----+----------------+---------+----------------+-----------------+-------------+---------------+---------------+------------+-------------+-----------+--------+----------+----+
|   0|    0|   0|               3|       69|              30|                0|            0|              6|              0|           0|            0|          0|       0|         0|   0|
+----+-----+----+----------------+---------+----------------+-----------------+-------------+---------------+---------------+------------+-------------+-----------+--------+----------+----+



In [46]:
#deleting the column Market Category
data = data.drop("Market Category")
# deleting the all null values
data= data.na.drop()

In [47]:
print((data.count(), len(data.columns)))

(11812, 15)


In [48]:
assembler = VectorAssembler(inputCols = ["Year","Engine HP","Engine Cylinders", "Number of Doors","highway MPG","city mpg","Popularity"],
                            outputCol = "InputAttributes")


In [59]:
from pyspark.ml.regression import RandomForestRegressor
regressor = RandomForestRegressor(featuresCol ='InputAttributes', labelCol = "MSRP")


In [60]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = [assembler,regressor])
pipeline.write().overwrite().save("pipeline_save")
pipelineModel = Pipeline.load('./pipeline_save')
updatedPipelineModel = Pipeline(stages=[assembler] + pipelineModel.getStages())

In [62]:
from pyspark.ml.tuning import ParamGridBuilder
paramGrid = ParamGridBuilder() .addGrid(regressor.numTrees,[100,500]) .build()

In [61]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
crossval = CrossValidator(estimator = pipelineModel,estimatorParamMaps = paramGrid, evaluator =
                          RegressionEvaluator(labelCol = "MSRP"), numFolds = 3)


In [63]:
from pyspark.ml.feature import VectorAssembler
train_data , test_data = data.randomSplit([0.8,0.2], seed = 123)
cvModel = crossval.fit(train_data)

In [65]:
bestModel = cvModel.bestModel
print(bestModel.stages)

[VectorAssembler_5756bec609e4, RandomForestRegressionModel: uid=RandomForestRegressor_0bdc6b57f46e, numTrees=20, numFeatures=7]


In [75]:
pred = cvModel.transform(test_data)
pred.select("prediction", "MSRP").show()


+------------------+-----+
|        prediction| MSRP|
+------------------+-----+
| 34421.14519682746|28030|
| 37539.01319791064|30550|
|28208.184533479263|29350|
|27392.471803885237|27900|
|27392.471803885237|34890|
|27392.471803885237|32990|
| 4362.040597442717| 2827|
| 4362.040597442717| 3000|
| 3344.309129987598| 3086|
| 3344.309129987598| 3130|
| 4385.952943880063| 3012|
|  4870.15335499411| 3622|
|23819.720568850073|22300|
|21841.003440303346|19400|
| 2976.970539642926| 2042|
|3140.9587786202756| 2144|
| 39083.79290465493|49440|
| 39083.79290465493|52640|
| 39083.79290465493|47440|
| 39083.79290465493|58400|
+------------------+-----+
only showing top 20 rows



In [70]:
from pyspark.ml.evaluation import RegressionEvaluator

eval = RegressionEvaluator(labelCol='MSRP')
rmse = eval.evaluate(pred, {eval.metricName: 'rmse'})
mae = eval.evaluate(pred, {eval.metricName: "mae"})
rmse = eval.evaluate(pred, {eval.metricName: 'rmse'})
mae = eval.evaluate(pred, {eval.metricName: 'mae'})
r2 = eval.evaluate(pred, {eval.metricName: 'r2'})

print("RMSE: %.3f" %rmse)
print("MAE: %.3f" %mae)
print("R2: %.3f" %r2)

RMSE: 33910.998
MAE: 9698.520
R2: 0.807
