In [0]:
import pandas as pd
import pyspark

In [0]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running locally with 3 worker threads.
spark = (
    SparkSession.builder
    .master('local[3]')
    .appName('YBIF')
    .getOrCreate()
)

In [0]:
# Display the session object to confirm it was created.
spark

In [0]:
# Load the Boston housing CSV from the YBI Foundation dataset repository
# into a pandas DataFrame (requires network access).
boston_csv_url = 'https://github.com/YBIFoundation/Dataset/raw/main/Boston.csv'
df = pd.read_csv(boston_csv_url)


In [0]:
# Convert the pandas DataFrame into a Spark DataFrame for use with pyspark.ml.
house = spark.createDataFrame(df)

In [0]:
# Print the first rows of the Spark DataFrame as a text table.
house.show()

+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|              CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|
+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+
|           0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|
|           0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|
|           0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|
|0.0323699999999999| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|
|           0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|
|           0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222.0|   18.7|394.12| 5.21|28.7|
|           0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311.0|   15.2| 395.6|12.43|22.9|
|           0.14455|

In [0]:
# List the column names; MEDV is used as the label later, the rest as features.
house.columns

Out[31]: ['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'MEDV']

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# Combine the 13 predictor columns (every column except the MEDV label)
# into a single vector column named 'Features'.
featureassembler = VectorAssembler(
    inputCols=[
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ],
    outputCol='Features',
)

In [0]:
# Apply the assembler: keeps the original columns and appends 'Features'.
output = featureassembler.transform(house)

In [0]:
# Inspect the assembled frame, including the new 'Features' vector column.
output.show()

+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+
|              CRIM|  ZN|INDUS|CHAS|   NX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|MEDV|            Features|
+------------------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+----+--------------------+
|           0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296.0|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|           0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242.0|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|           0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242.0|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.0323699999999999| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222.0|   18.7|394.63| 2.94|33.4|[0.03236999999999...|
|           0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222.0|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|           0.02985| 0.0| 2.18|   0|0.458| 6.43|

In [0]:
# Keep only what the model needs: the feature vector and the MEDV label.
modeldata = output.select('Features', 'MEDV')

In [0]:
# Preview the two-column modelling frame (feature vector, MEDV).
modeldata.show()

+--------------------+----+
|            Features|MEDV|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03236999999999...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796000000000...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



In [0]:
# Step 4: split into training (~80%) and test (~20%) sets.
# randomSplit is stochastic; a fixed seed keeps the split repeatable between
# runs, so the rows shown and the model fitted below do not change on every
# re-execution of the notebook.
train_data, test_data = modeldata.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Preview the training partition produced by the random split.
train_data.show()

+--------------------+----+
|            Features|MEDV|
+--------------------+----+
|[0.01311,90.0,1.2...|35.4|
|[0.0136,75.0,4.0,...|18.9|
|[0.01432,100.0,1....|31.6|
|[0.02055,85.0,0.7...|24.7|
|[0.02731,0.0,7.07...|21.6|
|[0.02763,75.0,2.9...|30.8|
|[0.02985,0.0,2.18...|28.7|
|[0.03236999999999...|33.4|
|[0.03358999999999...|34.9|
|[0.04337,21.0,5.6...|20.5|
|[0.04981,21.0,5.6...|23.4|
|[0.0536,21.0,5.64...|25.0|
|[0.06417,0.0,5.96...|18.9|
|[0.06905,0.0,2.18...|36.2|
|[0.08014,0.0,5.96...|21.0|
|[0.08829,12.5,7.8...|22.9|
|[0.08872999999999...|19.7|
|[0.09378,12.5,7.8...|21.7|
|[0.09744,0.0,5.96...|20.0|
|[0.12269,0.0,6.91...|21.2|
+--------------------+----+
only showing top 20 rows



In [0]:
# Step 5: select model

from pyspark.ml.regression import LinearRegression

In [0]:
reg= LinearRegression(featuresCol='Features', labelCol='MEDV')

In [0]:
reg= reg.fit(train_data)

In [0]:
# Fitted weights: one coefficient per entry of the 'Features' vector.
reg.coefficients

Out[43]: DenseVector([-0.1245, 0.0369, 0.029, 1.2021, -16.3852, 4.3897, -0.0093, -1.3746, 0.2906, -0.0144, -0.9412, 0.0098, -0.406])

In [0]:
# Fitted intercept (bias term) of the linear model.
reg.intercept

Out[44]: 31.51852010645979

In [0]:
# Shut down the Spark session; Spark objects above are unusable afterwards.
spark.stop()