In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import when
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

In [2]:
# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/02 20:40:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the Auto MPG CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("./auto-mpg.csv")
)

                                                                                

In [4]:
# Preview the first 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+----+---------+------------+----------+------+------------+----------+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|            car name|
+----+---------+------------+----------+------+------------+----------+------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|        70|     1|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|        70|     1|   buick skylark 320|
|18.0|        8|       318.0|       150|  3436|        11.0|        70|     1|  plymouth satellite|
|16.0|        8|       304.0|       150|  3433|        12.0|        70|     1|       amc rebel sst|
|17.0|        8|       302.0|       140|  3449|        10.5|        70|     1|         ford torino|
|15.0|        8|       429.0|       198|  4341|        10.0|        70|     1|    ford galaxie 500|
|14.0|        8|       454.0|       220|  4354|         9.0|        70|     1|    chevrolet impala|


In [5]:
# Print the schema Spark inferred for each column.
df.printSchema()

root
 |-- mpg: double (nullable = true)
 |-- cylinders: integer (nullable = true)
 |-- displacement: double (nullable = true)
 |-- horsepower: string (nullable = true)
 |-- weight: integer (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- model year: integer (nullable = true)
 |-- origin: integer (nullable = true)
 |-- car name: string (nullable = true)



In [6]:
# List the (column name, type) pairs of the loaded frame.
df.dtypes

[('mpg', 'double'),
 ('cylinders', 'int'),
 ('displacement', 'double'),
 ('horsepower', 'string'),
 ('weight', 'int'),
 ('acceleration', 'double'),
 ('model year', 'int'),
 ('origin', 'int'),
 ('car name', 'string')]

In [7]:
# horsepower was read as a string column; cast it to int so it can be used as
# a numeric feature. Entries that cannot be parsed as integers become null.
df = df.withColumn(
    "horsepower",
    col("horsepower").cast("int"),
)

In [8]:
# Re-check the column types after the horsepower cast.
df.dtypes

[('mpg', 'double'),
 ('cylinders', 'int'),
 ('displacement', 'double'),
 ('horsepower', 'int'),
 ('weight', 'int'),
 ('acceleration', 'double'),
 ('model year', 'int'),
 ('origin', 'int'),
 ('car name', 'string')]

In [9]:
# Summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df.describe()
summary_stats.show()

24/12/02 20:40:30 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 5:>                                                          (0 + 1) / 1]

+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+
|summary|               mpg|         cylinders|      displacement|        horsepower|           weight|      acceleration|        model year|            origin|            car name|
+-------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+--------------------+
|  count|               398|               398|               398|               392|              398|               398|               398|               398|                 398|
|   mean|23.514572864321615| 5.454773869346734|193.42587939698493|104.46938775510205|2970.424623115578|15.568090452261291| 76.01005025125629|1.5728643216080402|                NULL|
| stddev| 7.815984312565783|1.7010042445332123|104.26983817119587| 38.49115993282846|846.8

                                                                                

In [10]:
# Report the total number of rows, then preview the first two.
row_total = df.count()
print(f'There are total {row_total} row. Let print first 2 data rows:')
first_two_rows = df.limit(2)
first_two_rows.show()

There are total 398 row. Let print first 2 data rows:
+----+---------+------------+----------+------+------------+----------+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|            car name|
+----+---------+------------+----------+------+------------+----------+------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|        70|     1|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|        70|     1|   buick skylark 320|
+----+---------+------------+----------+------+------------+----------+------+--------------------+



In [34]:
from pyspark.sql.functions import col, sum as _sum

# One aggregate per column: the null flag is cast to 0/1 and summed, which
# yields the number of null entries in that column.
null_count_exprs = [
    _sum(col(name).isNull().cast("int")).alias(name)
    for name in df.columns
]
null_counts = df.select(null_count_exprs)
null_counts.show()

+---+---------+------------+----------+------+------------+----------+------+--------+
|mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|car name|
+---+---------+------------+----------+------+------------+----------+------+--------+
|  0|        0|           0|         6|     0|           0|         0|     0|       0|
+---+---------+------------+----------+------+------------+----------+------+--------+



In [36]:
from pyspark.ml.feature import Imputer

# Replace missing horsepower values with the column mean, writing the filled
# values back into the same column.
imputer = Imputer(
    strategy="mean",
    inputCols=["horsepower"],
    outputCols=["horsepower"],
)

imputer_model = imputer.fit(df)
df_imputed = imputer_model.transform(df)
df_imputed.show()

+----+---------+------------+----------+------+------------+----------+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|            car name|
+----+---------+------------+----------+------+------------+----------+------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|        70|     1|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|        70|     1|   buick skylark 320|
|18.0|        8|       318.0|       150|  3436|        11.0|        70|     1|  plymouth satellite|
|16.0|        8|       304.0|       150|  3433|        12.0|        70|     1|       amc rebel sst|
|17.0|        8|       302.0|       140|  3449|        10.5|        70|     1|         ford torino|
|15.0|        8|       429.0|       198|  4341|        10.0|        70|     1|    ford galaxie 500|
|14.0|        8|       454.0|       220|  4354|         9.0|        70|     1|    chevrolet impala|


In [37]:
from pyspark.sql.functions import col, sum as _sum

# Re-count nulls per column after imputation to confirm horsepower was filled.
# Iterate over df_imputed's own columns (not df's) so the check always matches
# the frame that is actually being inspected.
null_counts = df_imputed.select(
    [
        _sum(col(c).isNull().cast("int")).alias(c)  # null flag as 0/1, summed per column
        for c in df_imputed.columns
    ]
)
null_counts.show()

+---+---------+------------+----------+------+------------+----------+------+--------+
|mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|car name|
+---+---------+------------+----------+------+------------+----------+------+--------+
|  0|        0|           0|         0|     0|           0|         0|     0|       0|
+---+---------+------------+----------+------+------------+----------+------+--------+



The origin column in the Auto MPG dataset represents the region where the car was manufactured. The values in this column are coded as follows:

1: United States

2: Europe

3: Japan

In [12]:
# Translate the numeric origin code into its region name; any code other than
# 1, 2 or 3 is labelled "Unknown".
origin_code = df["origin"]
region_name = (
    when(origin_code == 1, "United States")
    .when(origin_code == 2, "Europe")
    .when(origin_code == 3, "Japan")
    .otherwise("Unknown")
)
df_with_region = df.withColumn("origin", region_name)
df_with_region.show()

+----+---------+------------+----------+------+------------+----------+-------------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model year|       origin|            car name|
+----+---------+------------+----------+------+------------+----------+-------------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|        70|United States|chevrolet chevell...|
|15.0|        8|       350.0|       165|  3693|        11.5|        70|United States|   buick skylark 320|
|18.0|        8|       318.0|       150|  3436|        11.0|        70|United States|  plymouth satellite|
|16.0|        8|       304.0|       150|  3433|        12.0|        70|United States|       amc rebel sst|
|17.0|        8|       302.0|       140|  3449|        10.5|        70|United States|         ford torino|
|15.0|        8|       429.0|       198|  4341|        10.0|        70|United States|    ford galaxie 500|
|14.0|        8|       454.0|       2

In [13]:
# Count the number of cars per manufacturing region.
cars_by_origin = df_with_region.groupBy("origin")
df_grouped = cars_by_origin.count()
df_grouped.show()

+-------------+-----+
|       origin|count|
+-------------+-----+
|       Europe|   70|
|United States|  249|
|        Japan|   79|
+-------------+-----+



StringIndexer: converts a string column into numerical form by assigning a unique numeric index to each category present in that column.

In [14]:
# Disabled StringIndexer experiment, kept for reference only: every line in
# this cell is commented out, so nothing here executes.
# from pyspark.ml.feature import StringIndexer
# indexer=StringIndexer(inputCol='origin',outputCol="origin_cat")
# indexed=indexer.fit(df).transform(df)

# Feature Engineering
The model expects all predictors in a single vector column, so we merge the individual feature columns using VectorAssembler, a feature transformer that combines multiple columns into one vector column. Only the columns listed as inputs are included, so you can choose which features to pass through the VectorAssembler. In our case, we pass all seven input columns to create a single feature vector column.

In [15]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [16]:
# Combine the predictor columns into a single vector column named "features".
# NOTE(review): "origin" is a region code (1/2/3), so it enters the vector as a
# plain number rather than as a category; confirm this is intended.
feature_columns = [
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model year",
    "origin",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

VectorAssembler: a transformer that combines the selected input columns into a single feature vector column, which is then used to train our machine learning models.

In [17]:
# Bare expression: the notebook displays the assembler's repr.
assembler

VectorAssembler_65dba010c770

In [38]:
# Append the assembled "features" vector column to the imputed frame.
df_transformed = assembler.transform(df_imputed)

In [39]:
# Preview the first 20 rows, now including the "features" column.
df_transformed.show(n=20, truncate=True)

+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model year|origin|            car name|            features|
+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
|18.0|        8|       307.0|       130|  3504|        12.0|        70|     1|chevrolet chevell...|[8.0,307.0,130.0,...|
|15.0|        8|       350.0|       165|  3693|        11.5|        70|     1|   buick skylark 320|[8.0,350.0,165.0,...|
|18.0|        8|       318.0|       150|  3436|        11.0|        70|     1|  plymouth satellite|[8.0,318.0,150.0,...|
|16.0|        8|       304.0|       150|  3433|        12.0|        70|     1|       amc rebel sst|[8.0,304.0,150.0,...|
|17.0|        8|       302.0|       140|  3449|        10.5|        70|     1|         ford torino|[8.0,302.0,140.0,...|
|15.0|        8|       429.0|   

Now that we have our features ready and the input variables are known, let’s get them together and dive into the machine-learning technique.

In [40]:
# Show the model input (features) next to the target (mpg) for five rows.
features_and_label = df_transformed.select("features", "mpg")
features_and_label.show(n=5)

+--------------------+----+
|            features| mpg|
+--------------------+----+
|[8.0,307.0,130.0,...|18.0|
|[8.0,350.0,165.0,...|15.0|
|[8.0,318.0,150.0,...|18.0|
|[8.0,304.0,150.0,...|16.0|
|[8.0,302.0,140.0,...|17.0|
+--------------------+----+
only showing top 5 rows



In [41]:
# Column types of df, i.e. the frame as it was before imputation and feature assembly.
df.dtypes

[('mpg', 'double'),
 ('cylinders', 'int'),
 ('displacement', 'double'),
 ('horsepower', 'int'),
 ('weight', 'int'),
 ('acceleration', 'double'),
 ('model year', 'int'),
 ('origin', 'int'),
 ('car name', 'string')]

# Splitting the Dataset
Split the data into training and test sets: the model is trained on the training set, and its performance is then checked on the test set.

In [42]:
# Keep only what the model needs: the feature vector and the mpg label.
final_data = df_transformed.select("features", "mpg")

In [43]:
# Hold out roughly 30% of the rows for testing. The fixed seed makes the split
# reproducible, so the train/test metrics do not change from run to run.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [44]:
# Summary statistics of the training split.
train_summary = train_data.describe()
train_summary.show()

+-------+-----------------+
|summary|              mpg|
+-------+-----------------+
|  count|              280|
|   mean|23.64821428571428|
| stddev|7.946216860317434|
|    min|              9.0|
|    max|             44.6|
+-------+-----------------+



In [45]:
# Summary statistics of the test split, for comparison with the training split.
test_summary = test_data.describe()
test_summary.show()

+-------+-----------------+
|summary|              mpg|
+-------+-----------------+
|  count|              118|
|   mean|23.19745762711864|
| stddev|7.521473099311019|
|    min|             11.0|
|    max|             46.6|
+-------+-----------------+



#  PySpark Linear Regression

In [47]:
from pyspark.ml.regression import LinearRegression

In [48]:
# Linear regression that predicts mpg from the assembled feature vector.
lr = LinearRegression(
    labelCol="mpg",
    featuresCol="features",
)

In [49]:
# Fit the regression on the training split only.
trained_lr_model = lr.fit(train_data)

24/12/02 20:50:20 WARN Instrumentation: [a4981c2e] regParam is zero, which might cause numerical instability and overfitting.
24/12/02 20:50:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [50]:
# Metrics computed on the training rows themselves (in-sample fit).
results = trained_lr_model.evaluate(train_data)

In [53]:
# Report the fit quality measured on the training split.
print(f"Root Mean Squared Error (RMSE): {results.rootMeanSquaredError}")
print(f"Mean Absolute Error (MAE): {results.meanAbsoluteError}")
print(f"R-squared (R²): {results.r2}")

Root Mean Squared Error (RMSE): 3.2645489385835766
Mean Absolute Error (MAE): 2.512042869379488
R-squared (R²): 0.8306132760518332


# Predictions from the Model

In [54]:
# Drop the mpg label so the test rows look like unlabeled input.
unlabeled_data = test_data.select("features")
unlabeled_data.show(n=5)

+--------------------+
|            features|
+--------------------+
|[3.0,70.0,97.0,23...|
|[4.0,68.0,49.0,18...|
|[4.0,71.0,65.0,17...|
|[4.0,79.0,67.0,20...|
|[4.0,86.0,65.0,21...|
+--------------------+
only showing top 5 rows



# PySpark Linear Regression Predict

In [55]:
# Score the unlabeled test rows; transform() appends a "prediction" column.
predictions = trained_lr_model.transform(unlabeled_data)
predictions.show(n=20, truncate=True)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[3.0,70.0,97.0,23...|24.813987240503703|
|[4.0,68.0,49.0,18...|28.044651639568695|
|[4.0,71.0,65.0,17...|28.324082058655737|
|[4.0,79.0,67.0,20...|27.741228281729253|
|[4.0,86.0,65.0,21...|33.230975197964355|
|[4.0,89.0,60.0,19...|34.364272562826116|
|[4.0,89.0,62.0,20...|34.456368033017455|
|[4.0,90.0,48.0,19...|  31.6859917646316|
|[4.0,90.0,48.0,20...|32.579183661818604|
|[4.0,90.0,70.0,19...|28.987999072778198|
|[4.0,90.0,71.0,22...|27.174600887440025|
|[4.0,90.0,75.0,21...| 27.09641938564891|
|[4.0,91.0,53.0,17...|31.595699222859576|
|[4.0,91.0,60.0,18...| 33.81206334420234|
|[4.0,91.0,68.0,20...|    35.49558969876|
|[4.0,91.0,70.0,19...|24.963169266198886|
|[4.0,97.0,60.0,18...|27.095489383720345|
|[4.0,97.0,75.0,21...|28.856434100266423|
|[4.0,97.0,88.0,21...|26.968673321486055|
|[4.0,97.0,92.0,22...|25.674879010442414|
+--------------------+------------

In [58]:
# Metrics computed on the held-out test split (out-of-sample performance).
test_results = trained_lr_model.evaluate(test_data)

In [60]:
# Report the same metrics, this time measured on the test split.
print(f"Test data Root Mean Squared Error (RMSE): {test_results.rootMeanSquaredError}")
print(f"Test data Mean Absolute Error (MAE): {test_results.meanAbsoluteError}")
print(f"Test data R-squared (R²): {test_results.r2}")

Test data Root Mean Squared Error (RMSE): 3.4541729787396753
Test data Mean Absolute Error (MAE): 2.4939016647873093
Test data R-squared (R²): 0.7872946089440453
