In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark=SparkSession.builder.appName("Auto MPG Analysis").getOrCreate()

df=spark.read.csv("auto-mpg.csv",header=True,inferSchema=True)

In [40]:
import pandas as pd

# Convert Spark DataFrame to Pandas DataFrame
pandas_df = df.toPandas()

# Display the Pandas DataFrame
pandas_df.head()  # You can use head() to display the first few rows, or print() for the entire DataFrame


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [41]:

filtered_df=df.filter(
    (col("cylinders")>=8)
    & (col("horsepower")>100)
)

In [42]:
pandas_df=filtered_df.toPandas()
print(len(pandas_df))
pandas_df.head()

102


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [43]:
ordered_df=filtered_df.orderBy(col("mpg").asc())

pandas_df=ordered_df.toPandas()
print(len(pandas_df))
pandas_df.head()

102


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,9.0,8,304.0,193,4732,18.5,70,1,hi 1200d
1,10.0,8,360.0,215,4615,14.0,70,1,ford f250
2,10.0,8,307.0,200,4376,15.0,70,1,chevy c20
3,11.0,8,318.0,210,4382,13.5,70,1,dodge d200
4,11.0,8,429.0,208,4633,11.0,72,1,mercury marquis


In [44]:
smallest_mpg=ordered_df.first()
print(smallest_mpg)

Row(mpg=9.0, cylinders=8, displacement=304.0, horsepower='193', weight=4732, acceleration=18.5, model year=70, origin=1, car name='hi 1200d')


In [None]:
reuslts=spark.sql("SELECT * FROM humans")

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `humans` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [humans], [], false


In [None]:
# from pyspark.sql.functions import col

# # Remove rows with non-numeric 'horsepower' values or handle them as needed
# df = df.filter(df.horsepower.isNotNull() & (df.horsepower != '?'))

# # Convert the 'horsepower' column to numeric
# df = df.withColumn("horsepower", df["horsepower"].cast("double"))

# # Now use the VectorAssembler with a different output column name
# from pyspark.ml.feature import VectorAssembler

# assembler = VectorAssembler(
#     inputCols=["cylinders", "displacement", "horsepower"],
#     outputCol="features"
# )

# assembler_df = assembler.transform(df)


In [55]:

from pyspark.ml.feature import VectorAssembler 
  
assembler = VectorAssembler( 
    inputCols=["cylinders","displacement","weight"], 
    outputCol="features") 
output=assembler.transform(df)
output.select("features").show(5)

+------------------+
|          features|
+------------------+
|[8.0,307.0,3504.0]|
|[8.0,350.0,3693.0]|
|[8.0,318.0,3436.0]|
|[8.0,304.0,3433.0]|
|[8.0,302.0,3449.0]|
+------------------+
only showing top 5 rows



In [58]:
final_data = output.select("features",'mpg') 
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [59]:
train_data.describe().show() 
  
test_data.describe().show()

+-------+------------------+
|summary|               mpg|
+-------+------------------+
|  count|               275|
|   mean|23.351272727272722|
| stddev| 7.459777069168942|
|    min|              10.0|
|    max|              44.6|
+-------+------------------+

+-------+------------------+
|summary|               mpg|
+-------+------------------+
|  count|               123|
|   mean|23.879674796747967|
| stddev| 8.580008782807168|
|    min|               9.0|
|    max|              46.6|
+-------+------------------+



In [60]:
from pyspark.ml.regression import LinearRegression

# # Set labelCol to the actual label column name
lr = LinearRegression(labelCol="mpg")
model = lr.fit(train_data)

24/11/28 14:02:09 WARN Instrumentation: [211aa22c] regParam is zero, which might cause numerical instability and overfitting.
24/11/28 14:02:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [61]:
lr

LinearRegression_32b46148064c

In [64]:
# Print the coefficients and intercept for linear regression 
print("Coefficients: {}".format(model.coefficients)) 
print('Intercept: {}'.format(model.intercept))

Coefficients: [-0.21107478586980558,-0.009997335206186127,-0.005808811942261088]
Intercept: 43.6455110910076


In [66]:
test_results = model.evaluate(test_data) 
  
#Printing Residuals which is the difference between the actua 
#l value and the value predicted by the model (y-ŷ) for any given point 
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
| -4.912531433953131|
|-2.2763412573062674|
|  7.283042492428539|
|0.31576082116944804|
|0.03605350200982471|
+-------------------+
only showing top 5 rows



In [67]:
unlabeled_data = test_data.select('features') 
  
predictions = model.transform(unlabeled_data) 
predictions.show(5)

+-----------------+------------------+
|         features|        prediction|
+-----------------+------------------+
|[3.0,80.0,2720.0]| 26.41253143395313|
|[4.0,68.0,1867.0]|31.276341257306267|
|[4.0,79.0,1755.0]|31.816957507571463|
|[4.0,79.0,1950.0]|30.684239178830552|
|[4.0,79.0,2074.0]|29.963946497990175|
+-----------------+------------------+
only showing top 5 rows



In [69]:
# Print the evaluation metrics
print("Root Mean Squared Error (RMSE): {}".format(test_results.rootMeanSquaredError))
print("Mean Absolute Error (MAE): {}".format(test_results.meanAbsoluteError))
print("R-squared (R²): {}".format(test_results.r2))


Root Mean Squared Error (RMSE): 4.578785753228642
Mean Absolute Error (MAE): 3.3261348091924456
R-squared (R²): 0.7128753613392251
