In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=c4bc630b0f4da6483040a0aaf543de3e593c4da93d1605156953914ba7b76e80
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
# Create the local-mode Spark session used by every cell below
# (getOrCreate returns the already-running session if one exists).
spark = (
    SparkSession.builder
    .master("local")
    .appName("linear_regression_model")
    .getOrCreate()
)

In [None]:
# Read the real-estate CSV: the first row is the header, and column types are
# inferred from the data rather than defaulting to strings.
real_estate = spark.read.csv(
    "/content/Real estate.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Check the inferred column names and types before building features.
real_estate.printSchema()

root
 |-- No: integer (nullable = true)
 |-- X1 transaction date: double (nullable = true)
 |-- X2 house age: double (nullable = true)
 |-- X3 distance to the nearest MRT station: double (nullable = true)
 |-- X4 number of convenience stores: integer (nullable = true)
 |-- X5 latitude: double (nullable = true)
 |-- X6 longitude: double (nullable = true)
 |-- Y house price of unit area: double (nullable = true)



In [None]:
# Peek at the first two rows to sanity-check the parsed values.
real_estate.show(n=2)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  2|           2012.917|        19.5|                              306.5947|                              9|   24.98034|   121.53951|                      42.2|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
only showing top 2 rows



In [None]:
# Summary statistics for every column of the raw data.
real_estate_summary = real_estate.describe()
real_estate_summary.show()

+-------+-----------------+-------------------+------------------+--------------------------------------+-------------------------------+--------------------+--------------------+--------------------------+
|summary|               No|X1 transaction date|      X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|         X5 latitude|        X6 longitude|Y house price of unit area|
+-------+-----------------+-------------------+------------------+--------------------------------------+-------------------------------+--------------------+--------------------+--------------------------+
|  count|              414|                414|               414|                                   414|                            414|                 414|                 414|                       414|
|   mean|            207.5| 2013.1489710144933| 17.71256038647343|                    1083.8856889130436|              4.094202898550725|  24.969030072463745|  121.53336108

In [None]:
from pyspark.ml.feature import VectorAssembler
# Predictor columns packed into the single "features" vector column.
# The "No" and "Y house price of unit area" columns are deliberately left out.
feature_columns = [
    "X1 transaction date",
    "X2 house age",
    "X3 distance to the nearest MRT station",
    "X4 number of convenience stores",
    "X5 latitude",
    "X6 longitude",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [None]:
# Append the assembled "features" vector to every row, then preview it next to
# the target column.
data_set = assembler.transform(real_estate)
features_and_label = data_set.select("features", "Y house price of unit area")
features_and_label.show(n=2)

+--------------------+--------------------------+
|            features|Y house price of unit area|
+--------------------+--------------------------+
|[2012.917,32.0,84...|                      37.9|
|[2012.917,19.5,30...|                      42.2|
+--------------------+--------------------------+
only showing top 2 rows



In [None]:
# Split the rows roughly 70/30 into training and hold-out sets.
# A fixed seed is passed so the split, and therefore every metric computed
# from it, stays the same across Restart & Run All instead of changing per run.
train_data, test_data = data_set.randomSplit([0.7, 0.3], seed=42)
train_data.show(truncate=False)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+------------------------------------------------+
|No |X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|features                                        |
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+------------------------------------------------+
|1  |2012.917           |32.0        |84.87882                              |10                             |24.98298   |121.54024   |37.9                      |[2012.917,32.0,84.87882,10.0,24.98298,121.54024]|
|3  |2013.583           |13.3        |561.9845                              |5                              |24.98746   |121.54391   |47.3                  

In [None]:
# Display the hold-out rows with full (untruncated) column values.
test_data.show(truncate=False)

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+-----------------------------------------------+
|No |X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|features                                       |
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+-----------------------------------------------+
|2  |2012.917           |19.5        |306.5947                              |9                              |24.98034   |121.53951   |42.2                      |[2012.917,19.5,306.5947,9.0,24.98034,121.53951]|
|5  |2012.833           |5.0         |390.5684                              |5                              |24.97937   |121.54245   |43.1                      

In [None]:
from pyspark.ml.regression import LinearRegression
# Configure a linear regression that predicts the unit-area price from the
# assembled feature vector, then fit it on the training split only.
lr = LinearRegression(
    featuresCol="features",
    labelCol="Y house price of unit area",
)
lrmodel = lr.fit(train_data)

In [None]:
# Score the fitted model on the hold-out split and report its error metrics,
# one per line.
test_stats = lrmodel.evaluate(test_data)
print(
    f"RMSE: {test_stats.rootMeanSquaredError}",
    f"R2: {test_stats.r2}",
    f"MSE: {test_stats.meanSquaredError}",
    sep="\n",
)

RMSE: 7.917835894061269
R2: 0.6553717938761325
MSE: 62.69212524528501
