

### Autos Dataset

Linear regression model(Elastic Net) to predict price of a used car taking in the car’s details as input

In [7]:
!pip install pyspark


Defaulting to user installation because normal site-packages is not writeable
Collecting pyspark
  Using cached pyspark-3.5.0.tar.gz (316.9 MB)
Collecting py4j==0.10.9.7
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425366 sha256=c26e2a22288d1e0aa8a976d8d5065151179a1244fe75c22cce681a301ab8cbf8
  Stored in directory: /Users/saiomkarkandukuri/Library/Caches/pip/wheels/57/bd/14/ce9e21f2649298678d011fb8f71ed38ee70b42b94fef0be142
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark import SparkFiles
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer,OneHotEncoder, IndexToString, VectorAssembler
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import FloatType
from pyspark.ml.evaluation import RegressionEvaluator

In [7]:
spark = SparkSession.builder.appName('Spark-Autos-Regression').getOrCreate()

24/01/28 14:04:59 WARN Utils: Your hostname, Sais-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.150.141.237 instead (on interface en0)
24/01/28 14:04:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/28 14:05:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Read Data

In [8]:
#get IRIS data from URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("imports-85.data"), header=False, inferSchema= True)
df = spark.createDataFrame(df.rdd, ["symboling","normalized-losses","make","fuel-type",
                                    "aspiration","num-of-doors","body-style","drive-wheels",
                                    "engine-location","wheel-base","length","width","height",
                                    "curb-weight","engine-type","num-of-cylinders","engine-size",
                                    "fuel-system","bore","stroke","compression-ratio","horsepower",
                                    "peak-rpm","city-mpg","highway-mpg","price"])

In [9]:
df.printSchema()

root
 |-- symboling: long (nullable = true)
 |-- normalized-losses: string (nullable = true)
 |-- make: string (nullable = true)
 |-- fuel-type: string (nullable = true)
 |-- aspiration: string (nullable = true)
 |-- num-of-doors: string (nullable = true)
 |-- body-style: string (nullable = true)
 |-- drive-wheels: string (nullable = true)
 |-- engine-location: string (nullable = true)
 |-- wheel-base: double (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- height: double (nullable = true)
 |-- curb-weight: long (nullable = true)
 |-- engine-type: string (nullable = true)
 |-- num-of-cylinders: string (nullable = true)
 |-- engine-size: long (nullable = true)
 |-- fuel-system: string (nullable = true)
 |-- bore: string (nullable = true)
 |-- stroke: string (nullable = true)
 |-- compression-ratio: double (nullable = true)
 |-- horsepower: string (nullable = true)
 |-- peak-rpm: string (nullable = true)
 |-- city-mpg: long (nullable = tru

In [10]:
df = df.withColumn("price",df.price.cast(FloatType()))

In [14]:
df = df.dropna()

In [15]:
df.show()

24/01/28 14:10:27 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-------+
|symboling|normalized-losses|       make|fuel-type|aspiration|num-of-doors| body-style|drive-wheels|engine-location|wheel-base|length|width|height|curb-weight|engine-type|num-of-cylinders|engine-size|fuel-system|bore|stroke|compression-ratio|horsepower|peak-rpm|city-mpg|highway-mpg|  price|
+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-------+
|        3|                ?|alfa-romero|      gas|       std|         two|convertible|         rwd|          front|      88

### Feature Engineering

In [16]:
#converting relevant categorical features into one hot encoded 
indexer1 = StringIndexer(inputCol="make", outputCol="makeIdx").setHandleInvalid("skip")
indexer2 = StringIndexer(inputCol="fuel-type", outputCol="fueltypeIdx").setHandleInvalid("skip")
indexer3 = StringIndexer(inputCol="aspiration", outputCol="aspirationIdx").setHandleInvalid("skip")
indexer4 = StringIndexer(inputCol="body-style", outputCol="bodystyleIdx").setHandleInvalid("skip")



In [17]:
#gathering all indexers as inputs to the One Hot Encoder
inputs = [indexer1.getOutputCol(), indexer2.getOutputCol(), \
          indexer3.getOutputCol(), indexer4.getOutputCol()]

#creating the one hot encoder
encoder = OneHotEncoder(inputCols=inputs,  \
                                 outputCols=["makeVec", "fueltypeVec", \
                                             "aspirationVec", "bodystyleVec"])

In [18]:

#running it through a pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, indexer4, encoder])
df = pipeline.fit(df).transform(df)

In [19]:
#pipeline = pipeline.na.fill(0) 

#Already dropped all NAs so dont need to impute missing values.


df.show(5)

+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-------+-------+-----------+-------------+------------+---------------+-------------+-------------+-------------+
|symboling|normalized-losses|       make|fuel-type|aspiration|num-of-doors| body-style|drive-wheels|engine-location|wheel-base|length|width|height|curb-weight|engine-type|num-of-cylinders|engine-size|fuel-system|bore|stroke|compression-ratio|horsepower|peak-rpm|city-mpg|highway-mpg|  price|makeIdx|fueltypeIdx|aspirationIdx|bodystyleIdx|        makeVec|  fueltypeVec|aspirationVec| bodystyleVec|
+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+------

In [20]:
# Preparing the data by indexing the classes and putting the features into a vector.
vectorAssembler = VectorAssembler(inputCols=[ "length","width","height","compression-ratio",
                                              "makeVec", "fueltypeVec", "aspirationVec", "bodystyleVec"],
                                  outputCol="features")
df = vectorAssembler.transform(df)

In [21]:
df.show(5)

+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----------+----------------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-------+-------+-----------+-------------+------------+---------------+-------------+-------------+-------------+--------------------+
|symboling|normalized-losses|       make|fuel-type|aspiration|num-of-doors| body-style|drive-wheels|engine-location|wheel-base|length|width|height|curb-weight|engine-type|num-of-cylinders|engine-size|fuel-system|bore|stroke|compression-ratio|horsepower|peak-rpm|city-mpg|highway-mpg|  price|makeIdx|fueltypeIdx|aspirationIdx|bodystyleIdx|        makeVec|  fueltypeVec|aspirationVec| bodystyleVec|            features|
+---------+-----------------+-----------+---------+----------+------------+-----------+------------+---------------+----------+------+-----+------+-----------+-----

In [22]:
train_df, test_df =  df.randomSplit([0.7, 0.3],0.0)

In [23]:
train_df.select('features').show(5, truncate=False)

+----------------------------------------------------------------+
|features                                                        |
+----------------------------------------------------------------+
|(31,[0,1,2,3,12,25,26,27],[188.8,67.2,56.2,9.5,1.0,1.0,1.0,1.0])|
|(31,[0,1,2,3,12,25,27],[188.8,67.2,56.2,7.5,1.0,1.0,1.0])       |
|(31,[0,1,2,3,13,25,26,29],[174.6,64.6,59.8,8.5,1.0,1.0,1.0,1.0])|
|(31,[0,1,2,3,4,27],[175.6,66.5,54.9,22.5,1.0,1.0])              |
|(31,[0,1,2,3,4,25,26,28],[175.6,66.5,53.9,8.7,1.0,1.0,1.0,1.0]) |
+----------------------------------------------------------------+
only showing top 5 rows



### Training the Model

In [24]:
from pyspark.ml.regression import LinearRegression

#Elastic Net 
lr = LinearRegression(featuresCol = 'features', labelCol='price', regParam=0.3, elasticNetParam=0.8, maxIter=10)
lrm = lr.fit(train_df)

24/01/28 14:15:19 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [25]:

print("Coefficients: " + str(lrm.coefficients))
print("Intercept: " + str(lrm.intercept))

#model summary
print("RMSE: %f" % lrm.summary.rootMeanSquaredError)
print("r2: %f" % lrm.summary.r2)

Coefficients: [159.94472041137786,1318.954421643501,-259.7022250994983,-132.3122711416241,-1654.8298844422409,-866.9821577672034,-984.4362163302937,-260.7164068421108,-3360.9498225522707,-2108.5487763976985,-1277.8157846315396,-2358.9654313712954,916.2776833367021,-1958.3838763616945,9848.339405924824,10989.530014672586,-2244.2653033208667,-594.7322057567843,-224.59620735936306,24145.105456258363,2991.3166270032834,1177.6783706209985,13253.968470663876,-698.5041292463088,-5291.53666086014,-313.77071957011054,-1384.3791474640452,-1825.9970359524416,-1457.9548177574407,-2137.1526335798253,-0.0]
Intercept: -83412.02203824767
RMSE: 2298.135676
r2: 0.915649


### Predictions

In [26]:
predictions = lrm.transform(test_df)
predictions.select('features','price','prediction').show(5,truncate=False)

+----------------------------------------------------------------+-------+------------------+
|features                                                        |price  |prediction        |
+----------------------------------------------------------------+-------+------------------+
|(31,[0,1,2,3,12,25,26,27],[188.8,67.2,56.2,9.5,1.0,1.0,1.0,1.0])|12940.0|16959.177463776607|
|(31,[0,1,2,3,8,25,26,27],[172.4,65.4,51.6,7.5,1.0,1.0,1.0,1.0]) |9279.0 |9143.993361923684 |
|(31,[0,1,2,3,4,25,26,27],[175.6,66.5,54.9,8.7,1.0,1.0,1.0,1.0]) |8948.0 |11796.994200959685|
|(31,[0,1,2,3,12,25,26,27],[188.8,68.9,55.5,8.8,1.0,1.0,1.0,1.0])|21485.0|19475.810127939345|
|(31,[0,1,2,3,12,25,27],[188.8,68.8,55.5,8.7,1.0,1.0,1.0])       |19045.0|20741.52506035319 |
+----------------------------------------------------------------+-------+------------------+
only showing top 5 rows



### Model Evaluation

In [27]:
#print evaluation metrics
e = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error
rmse = e.evaluate(predictions)
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = e.evaluate(predictions, {e.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = e.evaluate(predictions, {e.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = e.evaluate(predictions, {e.metricName: "r2"})
print("r2: %.3f" %r2)

RMSE: 3597.465
MSE: 12941753.752
MAE: 2073.240
r2: 0.796
