In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Obtain the Spark session for this notebook, reusing an active one if it exists
# (getOrCreate) and otherwise starting a new one under the given application name.
spark = (
    SparkSession.builder
    .appName("LinearRegressionExample")
    .getOrCreate()
)

In [0]:

# Load the housing data as CSV into a Spark DataFrame.
#   header=True      -> the first line supplies the column names
#   inferSchema=True -> column types are inferred from the data instead of all-string
df = spark.read.options(header=True, inferSchema=True).csv(
    "dbfs:/FileStore/tables/Housing.txt"
)

In [0]:
# Preview the raw table: first 20 rows, long cell values truncated (the show() defaults).
df.show(n=20, truncate=True)

+-------+-------+--------+-------+-------+--------+-------+--------+-----+-----+--------+--------+
|  price|lotsize|bedrooms|bathrms|stories|driveway|recroom|fullbase|gashw|airco|garagepl|prefarea|
+-------+-------+--------+-------+-------+--------+-------+--------+-----+-----+--------+--------+
|42000.0|   5850|       3|      1|      2|     yes|     no|     yes|   no|   no|       1|      no|
|38500.0|   4000|       2|      1|      1|     yes|     no|      no|   no|   no|       0|      no|
|49500.0|   3060|       3|      1|      1|     yes|     no|      no|   no|   no|       0|      no|
|60500.0|   6650|       3|      1|      2|     yes|    yes|      no|   no|   no|       0|      no|
|61000.0|   6360|       2|      1|      1|     yes|     no|      no|   no|   no|       0|      no|
|66000.0|   4160|       3|      1|      1|     yes|    yes|     yes|   no|  yes|       0|      no|
|66000.0|   3880|       3|      2|      2|     yes|     no|     yes|   no|   no|       2|      no|
|69000.0| 

In [0]:

# Keep only the single predictor (lotsize) and the label (price).
data = df.select("lotsize", "price")

In [0]:
# Preview the selected columns: first 20 rows, long cell values truncated (the show() defaults).
data.show(n=20, truncate=True)

+-------+-------+
|lotsize|  price|
+-------+-------+
|   5850|42000.0|
|   4000|38500.0|
|   3060|49500.0|
|   6650|60500.0|
|   6360|61000.0|
|   4160|66000.0|
|   3880|66000.0|
|   4160|69000.0|
|   4800|83800.0|
|   5500|88500.0|
|   7200|90000.0|
|   3000|30500.0|
|   1700|27000.0|
|   2880|36000.0|
|   3600|37000.0|
|   3185|37900.0|
|   3300|40500.0|
|   5200|40750.0|
|   3450|45000.0|
|   3986|45000.0|
+-------+-------+
only showing top 20 rows



In [0]:
# Assemble the features into a single vector column.
# VectorAssembler.transform raises if its output column already exists, so re-running
# this cell against an already-assembled `data` would fail. Dropping the output column
# first (a no-op when the column is absent) keeps the cell safe to re-run.
assembler = VectorAssembler(inputCols=["lotsize"], outputCol="features")
data = assembler.transform(data.drop(assembler.getOutputCol()))

In [0]:
# Preview the frame after feature assembly: first 20 rows, long cell values truncated
# (the show() defaults).
data.show(n=20, truncate=True)

+-------+-------+--------+
|lotsize|  price|features|
+-------+-------+--------+
|   5850|42000.0|[5850.0]|
|   4000|38500.0|[4000.0]|
|   3060|49500.0|[3060.0]|
|   6650|60500.0|[6650.0]|
|   6360|61000.0|[6360.0]|
|   4160|66000.0|[4160.0]|
|   3880|66000.0|[3880.0]|
|   4160|69000.0|[4160.0]|
|   4800|83800.0|[4800.0]|
|   5500|88500.0|[5500.0]|
|   7200|90000.0|[7200.0]|
|   3000|30500.0|[3000.0]|
|   1700|27000.0|[1700.0]|
|   2880|36000.0|[2880.0]|
|   3600|37000.0|[3600.0]|
|   3185|37900.0|[3185.0]|
|   3300|40500.0|[3300.0]|
|   5200|40750.0|[5200.0]|
|   3450|45000.0|[3450.0]|
|   3986|45000.0|[3986.0]|
+-------+-------+--------+
only showing top 20 rows



In [0]:
# Randomly partition the rows into train/test subsets using 0.7/0.3 weights.
# The seed is fixed so the split uses the same random seed on every run.
train_data, test_data = data.randomSplit(weights=[0.7, 0.3], seed=42)

In [0]:
# Configure the (not yet fitted) linear regression estimator:
# it reads inputs from the "features" vector column and the target from "price".
lr = LinearRegression(
    labelCol="price",
    featuresCol="features",
)

In [0]:

# Train the estimator on the training split; the result is the fitted model.
lr_model = lr.fit(dataset=train_data)

In [0]:

# Score the held-out test split; the fitted model appends a "prediction" column.
predictions = lr_model.transform(dataset=test_data)

In [0]:
# Preview actual price next to the predicted price: first 20 rows, long cell values
# truncated (the show() defaults).
predictions.show(n=20, truncate=True)

+-------+-------+--------+------------------+
|lotsize|  price|features|        prediction|
+-------+-------+--------+------------------+
|   1836|32500.0|[1836.0]|47823.993798690906|
|   2000|38000.0|[2000.0]| 48819.86765921677|
|   2135|50000.0|[2135.0]| 49639.64187367404|
|   2145|47000.0|[2145.0]| 49700.36588955976|
|   2145|56000.0|[2145.0]| 49700.36588955976|
|   2145|60000.0|[2145.0]| 49700.36588955976|
|   2160|44000.0|[2160.0]|49791.451913388344|
|   2325|60000.0|[2325.0]|50793.398175502785|
|   2400|25245.0|[2400.0]| 51248.82829464571|
|   2430|38000.0|[2430.0]| 51431.00034230288|
|   2475|34000.0|[2475.0]| 51704.25841378864|
|   2610|49000.0|[2610.0]| 52524.03262824591|
|   2610|60000.0|[2610.0]| 52524.03262824591|
|   2640|40500.0|[2640.0]| 52706.20467590308|
|   2684|46000.0|[2684.0]| 52973.39034580026|
|   2700|47900.0|[2700.0]| 53070.54877121742|
|   2747|60000.0|[2747.0]| 53355.95164588032|
|   2800|70800.0|[2800.0]|53677.788930074654|
|   2850|52000.0|[2850.0]| 53981.4

In [0]:
# Score the test-set predictions with the R2 metric, comparing the "price" label
# column against the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="price",
    predictionCol="prediction",
)
r2 = evaluator.evaluate(predictions)

# Report the metric rounded to two decimal places.
print("R2 Score on test data: {:.2f}".format(r2))


R2 Score on test data: 0.32
