In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Build the SparkSession used by every cell below; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("LinearRegression.com")
    .getOrCreate()
)


24/09/27 09:47:50 WARN Utils: Your hostname, AI-CJB-LAP-459 resolves to a loopback address: 127.0.1.1; using 192.168.1.164 instead (on interface wlp0s20f3)
24/09/27 09:47:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/27 09:47:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


LOAD THE DATASET ("BOSTON HOUSING")

In [8]:
# Remote location of the Boston Housing CSV.
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"

# Register the URL with the SparkContext, then look the file up by its base name.
spark.sparkContext.addFile(url)
local_csv_path = SparkFiles.get("BostonHousing.csv")

# The first row holds the column names; column types are inferred from the data.
boston_df = spark.read.csv(
    local_csv_path,
    header=True,
    inferSchema=True,
)

# Preview the first 10 rows without truncating cell values.
boston_df.show(10, truncate=False)

24/09/27 10:10:28 WARN SparkContext: The path https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv has been added already. Overwriting of added paths is not supported in the current version.


+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|crim   |zn  |indus|chas|nox  |rm   |age  |dis   |rad|tax|ptratio|b     |lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0|2.31 |0   |0.538|6.575|65.2 |4.09  |1  |296|15.3   |396.9 |4.98 |24.0|
|0.02731|0.0 |7.07 |0   |0.469|6.421|78.9 |4.9671|2  |242|17.8   |396.9 |9.14 |21.6|
|0.02729|0.0 |7.07 |0   |0.469|7.185|61.1 |4.9671|2  |242|17.8   |392.83|4.03 |34.7|
|0.03237|0.0 |2.18 |0   |0.458|6.998|45.8 |6.0622|3  |222|18.7   |394.63|2.94 |33.4|
|0.06905|0.0 |2.18 |0   |0.458|7.147|54.2 |6.0622|3  |222|18.7   |396.9 |5.33 |36.2|
|0.02985|0.0 |2.18 |0   |0.458|6.43 |58.7 |6.0622|3  |222|18.7   |394.12|5.21 |28.7|
|0.08829|12.5|7.87 |0   |0.524|6.012|66.6 |5.5605|5  |311|15.2   |395.6 |12.43|22.9|
|0.14455|12.5|7.87 |0   |0.524|6.172|96.1 |5.9505|5  |311|15.2   |396.9 |19.15|27.1|
|0.21124|12.5|7.87 |0   |0.524|5.631|100.0|6.0821|5  |311|15.2   

PREPARE DATA

In [9]:
# Combine the 13 predictor columns into a single vector column named "features".
# "medv" is deliberately left out: it is the regression label.
assembler = VectorAssembler(
    inputCols=["crim", "zn", "indus", "chas", "nox", "rm", "age", "dis", "rad", "tax", "ptratio", "b", "lstat"],
    outputCol="features")

# Bind the assembled frame to a new name rather than overwriting boston_df.
# Rebinding boston_df made this cell non-idempotent: on a second run the input
# would already contain "features" and the transform would be rejected.
assembled_df = assembler.transform(boston_df)

# Keep only what the estimator needs: the feature vector and the label.
final_data = assembled_df.select("features", "medv")

# 80/20 train/test split with a fixed seed.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

BUILD LINEAR REGRESSION MODEL

In [11]:
# Configure the estimator's column wiring. The prediction column keeps the
# spelling "predicted_mdev" because the evaluation cell refers to it by that
# exact name.
lr = LinearRegression(
    featuresCol="features",
    labelCol="medv",
    predictionCol="predicted_mdev",
)

# Fit on the training split only; the test split is held back for evaluation.
lr_model = lr.fit(train_data)

24/09/27 10:13:21 WARN Instrumentation: [6e91d186] regParam is zero, which might cause numerical instability and overfitting.
24/09/27 10:13:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


MAKE PREDICTIONS AND EVALUATE THE MODEL

In [14]:
# Score the held-out split with the fitted model.
predictions = lr_model.transform(test_data)

# RMSE between the "medv" label and the model's prediction column.
evaluator = RegressionEvaluator(labelCol="medv", predictionCol="predicted_mdev", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error:", rmse)

# R-squared on the same predictions.
evaluator2 = RegressionEvaluator(labelCol="medv", predictionCol="predicted_mdev", metricName="r2")
r2 = evaluator2.evaluate(predictions)
# "{:.3f}" prints three decimal places. The earlier "{:3f}" only requested a
# minimum field width of 3 and therefore printed the default six decimals.
print("R-squared(R2) on test data :{:.3f}".format(r2))

Root Mean Squared Error: 4.671806485171284
R-squared(R2) on test data :0.793152


INSPECT THE MODEL COEFFICIENTS AND INTERCEPT

In [15]:
# Pull the fitted parameters off the model: one coefficient per assembled
# feature, plus the intercept term.
coeff = lr_model.coefficients
intercepts = lr_model.intercept

print(f"Coefficients : {coeff}")
print(f"Intercept : {intercepts}")

Coefficients : [-0.11362203729408954,0.048909186934053925,0.02379542898673389,2.801771998735119,-18.4154245411894,3.5158797633120065,0.0052116821614709204,-1.4163830723539739,0.3317669315937035,-0.013607893704163878,-0.9534143338408072,0.008602677392853256,-0.519503531247664]
Intercept : 38.61699144573437


ANALYZE THE FEATURE IMPORTANCE

In [16]:
# Take the feature names from the assembler itself so they are guaranteed to be
# in the same order as the entries of the feature vector (and therefore of the
# coefficients). Slicing boston_df.columns only lined up by accident: it depended
# on which columns the frame happened to carry and on zip() silently truncating.
feature_names = assembler.getInputCols()
assert len(feature_names) == len(coeff), "feature/coefficient count mismatch"

# Rank features by absolute coefficient, largest first.
# NOTE: the inputs are not rescaled anywhere in this notebook, so these
# magnitudes reflect each column's units as well as its influence.
feature_importance = sorted(
    zip(feature_names, (abs(c) for c in coeff)),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Feature Importance:")
for feature, importance in feature_importance:
    print("  {}: {:.3f}".format(feature, importance))

Feature Importance:
  nox: 18.415
  rm: 3.516
  chas: 2.802
  dis: 1.416
  ptratio: 0.953
  lstat: 0.520
  rad: 0.332
  crim: 0.114
  zn: 0.049
  indus: 0.024
  tax: 0.014
  b: 0.009
  age: 0.005


SAVE AND LOAD THE MODEL

In [17]:
from pyspark.ml.regression import LinearRegressionModel

# Save the model. Going through write().overwrite() replaces an existing
# "lr_model" directory, so this cell can be re-run; a plain save() raises an
# error when the target path already exists.
lr_model.write().overwrite().save("lr_model")

# Load the model back from the same path.
loaded_model = LinearRegressionModel.load("lr_model")