In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
# !pip install pyspark
from pyspark.sql  import SparkSession 

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 65kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.9MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=e9c6d7b1f3e68356ea50271bff1fb2514ef727d23abdb01e625b8fd38ba9bee0
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [14]:
spark = (
    SparkSession.builder.appName("Car_Price_Random_Forest").getOrCreate()
)

In [15]:
car_price = (
    spark.read.format("csv")
    .option("header", True)
    .load("CarPrice_Assignment.csv")
)

In [16]:
from pyspark.sql.functions import col
numeric_columns = ["wheelbase",	"carlength",	"carwidth",	"carheight",	"curbweight",	
                  	"enginesize",	"boreratio",	"stroke",	"compressionratio",	"horsepower",	"peakrpm",	"citympg",	"highwaympg","price"]
numeric_features = car_price.select([col(c).cast("float") for c in numeric_columns])
# car_price.describe().toPandas().transpose()

In [17]:
features = {}
for i in numeric_columns:
  features[i] = numeric_features.stat.corr("price", i)
  print("Correlation to Price for ", i , numeric_features.stat.corr("price", i))

Correlation to Price for  wheelbase 0.577815609013954
Correlation to Price for  carlength 0.6829200061793368
Correlation to Price for  carwidth 0.7593252732789826
Correlation to Price for  carheight 0.11933627096290873
Correlation to Price for  curbweight 0.8353048796203731
Correlation to Price for  enginesize 0.8741448022848783
Correlation to Price for  boreratio 0.5531732639743967
Correlation to Price for  stroke 0.07944309329818429
Correlation to Price for  compressionratio 0.06798351616221464
Correlation to Price for  horsepower 0.8081388231007026
Correlation to Price for  peakrpm -0.0852671497816066
Correlation to Price for  citympg -0.6857513366309157
Correlation to Price for  highwaympg -0.6975990921640883
Correlation to Price for  price 1.0


In [18]:
# peakrpm , stroke 
features = set(numeric_features.columns) -set(['price', 'peakrpm', 'stroke' ])
features = list(features)
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols = features, outputCol = 'features')
car_price = vectorAssembler.transform(numeric_features)
car_price = car_price.select(["features", "price"])


In [19]:
from pyspark.ml.feature import Normalizer
nomalizer = Normalizer(inputCol = "features", outputCol = "normal_features", p = 1.0)
nomaizer_data = nomalizer.transform(car_price)
nomaizer_data.show()

+--------------------+---------+--------------------+
|            features|    price|     normal_features|
+--------------------+---------+--------------------+
|[64.0999984741211...|  13495.0|[0.01990825384693...|
|[64.0999984741211...|  16500.0|[0.01990825384693...|
|[65.5,9.0,2.68000...|  16500.0|[0.01835103999315...|
|[66.1999969482421...|  13950.0|[0.02197809389148...|
|[66.4000015258789...|  17450.0|[0.01884816198252...|
|[66.3000030517578...|  15250.0|[0.02068520209184...|
|[71.4000015258789...|  17710.0|[0.01999277613382...|
|[71.4000015258789...|  18920.0|[0.01939537539296...|
|[71.4000015258789...|  23875.0|[0.01863631300217...|
|[67.9000015258789...|17859.168|[0.01791684408946...|
|[64.8000030517578...|  16430.0|[0.02113916715578...|
|[64.8000030517578...|  16925.0|[0.02113916715578...|
|[64.8000030517578...|  20970.0|[0.01876406305937...|
|[64.8000030517578...|  21105.0|[0.01846990602881...|
|[66.9000015258789...|  24565.0|[0.01754795562038...|
|[66.9000015258789...|  3076

In [20]:
splits = nomaizer_data.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]

In [21]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
dt  = DecisionTreeRegressor(featuresCol = "normal_features", labelCol  = "price")
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(labelCol = "price", predictionCol = "prediction", metricName = "rmse")


In [22]:
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 3375.19


In [24]:
# It is Overfitting, So we must to use RandomForest Model and cross validation
from sklearn.metrics import accuracy_score, r2_score
dt_predictions = dt_model.transform(test_df)
acc = dt_predictions.select("price", "prediction").toPandas()
print("Test_Accuracy",r2_score(acc["price"], acc["prediction"]))
dt_predictions.select("price", "prediction").toPandas().info()
dt_predictions = dt_model.transform(train_df)
from sklearn.metrics import accuracy_score, r2_score
dt_predictions = dt_model.transform(train_df)
acc = dt_predictions.select("price", "prediction").toPandas()
r2_score(acc["price"], acc["prediction"])
print("Train_Accuracy", r2_score(acc["price"], acc["prediction"]))

Test_Accuracy 0.7949995937090956
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   price       38 non-null     float32
 1   prediction  38 non-null     float64
dtypes: float32(1), float64(1)
memory usage: 584.0 bytes
Train_Accuracy 0.9654387422638934


In [54]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

# peakrpm , stroke 
features = set(numeric_features.columns) -set(['price', 'peakrpm', 'stroke' ])
features = list(features)

vectorAssembler = VectorAssembler(inputCols = features, outputCol = 'features')
car_price = vectorAssembler.transform(numeric_features)
# car_price = car_price.select(["features", "price"])

nomalizer_model = Normalizer(inputCol = "features", outputCol = "normal_features", p = 1.0)

random_forest = RandomForestRegressor(featuresCol=nomalizer_model.getOutputCol(), labelCol= "price")
rfevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="rmse")

pipeline = Pipeline(stages = [nomalizer_model, random_forest])
paramGrid =  (
    ParamGridBuilder()
    # .addGrid(random_forest.maxDepth, [2, 5, 10])
    .addGrid(random_forest.numTrees, [1000])
    .build()
)
crossval = (
    CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, numFolds=5,evaluator = rfevaluator)
)

In [55]:
cvModel = crossval.fit(car_price)
rfpredictions = cvModel.transform(test_df)
rfpredictions.summary

In [83]:
splits = car_price.randomSplit([0.8, 0.2])
train_df = splits[0]
test_df = splits[1]

In [89]:
rfpredictions = cvModel.transform(train_df)
print('RMSE:', rfevaluator.evaluate(rfpredictions))
acc = rfpredictions.select("price", "prediction").toPandas()
r2_score(acc["price"], acc["prediction"])
print("Train Accuracy", r2_score(acc["price"], acc["prediction"]))


RMSE: 1773.5091982853005
Train Accuracy 0.9485340043534405


In [87]:
rfpredictions = cvModel.transform(test_df)
print('RMSE:', rfevaluator.evaluate(rfpredictions))
acc = rfpredictions.select("price", "prediction").toPandas()
r2_score(acc["price"], acc["prediction"])
print("Test_ Accuracy", r2_score(acc["price"], acc["prediction"]))


RMSE: 1445.923093233406
Test_ Accuracy 0.970199490102587
