In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 32 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 58.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=03585554dce9ab578f51f3c5c2b201a06b616109890bef523ff23f14b09ea70e
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
from pyspark.sql import SparkSession

In [4]:
session = SparkSession.builder.appName("multiclass").master("local").getOrCreate()
data = session.read.csv("energy_cooling.csv", header = True, inferSchema = True)

In [5]:
data.show()

+----+-----+-----+------+---+---+---+---+-----+
|  X1|   X2|   X3|    X4| X5| X6| X7| X8|   Y2|
+----+-----+-----+------+---+---+---+---+-----+
|0.98|514.5|294.0|110.25|7.0|  2|0.0|  0|21.33|
|0.98|514.5|294.0|110.25|7.0|  3|0.0|  0|21.33|
|0.98|514.5|294.0|110.25|7.0|  4|0.0|  0|21.33|
|0.98|514.5|294.0|110.25|7.0|  5|0.0|  0|21.33|
| 0.9|563.5|318.5| 122.5|7.0|  2|0.0|  0|28.28|
| 0.9|563.5|318.5| 122.5|7.0|  3|0.0|  0|25.38|
| 0.9|563.5|318.5| 122.5|7.0|  4|0.0|  0|25.16|
| 0.9|563.5|318.5| 122.5|7.0|  5|0.0|  0| 29.6|
|0.86|588.0|294.0| 147.0|7.0|  2|0.0|  0| 27.3|
|0.86|588.0|294.0| 147.0|7.0|  3|0.0|  0|21.97|
|0.86|588.0|294.0| 147.0|7.0|  4|0.0|  0|23.49|
|0.86|588.0|294.0| 147.0|7.0|  5|0.0|  0|27.87|
|0.82|612.5|318.5| 147.0|7.0|  2|0.0|  0|23.77|
|0.82|612.5|318.5| 147.0|7.0|  3|0.0|  0|21.46|
|0.82|612.5|318.5| 147.0|7.0|  4|0.0|  0|21.16|
|0.82|612.5|318.5| 147.0|7.0|  5|0.0|  0|24.93|
|0.79|637.0|343.0| 147.0|7.0|  2|0.0|  0|37.73|
|0.79|637.0|343.0| 147.0|7.0|  3|0.0|  0

In [6]:
data.describe()

DataFrame[summary: string, X1: string, X2: string, X3: string, X4: string, X5: string, X6: string, X7: string, X8: string, Y2: string]

In [7]:
data.columns

['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'Y2']

In [9]:
data.dtypes

[('X1', 'double'),
 ('X2', 'double'),
 ('X3', 'double'),
 ('X4', 'double'),
 ('X5', 'double'),
 ('X6', 'int'),
 ('X7', 'double'),
 ('X8', 'int'),
 ('Y2', 'double')]

In [10]:
data.printSchema()

root
 |-- X1: double (nullable = true)
 |-- X2: double (nullable = true)
 |-- X3: double (nullable = true)
 |-- X4: double (nullable = true)
 |-- X5: double (nullable = true)
 |-- X6: integer (nullable = true)
 |-- X7: double (nullable = true)
 |-- X8: integer (nullable = true)
 |-- Y2: double (nullable = true)



In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
vec_ass = VectorAssembler(inputCols = ["X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], outputCol = "allfeatures")

# Linear Regression

In [15]:
from pyspark.ml.regression import LinearRegression
model = LinearRegression(featuresCol = "allfeatures", labelCol="Y2")

In [16]:
from pyspark.ml import Pipeline

In [17]:
mypipeline = Pipeline(stages = [vec_ass, model])

In [18]:
training, test = data.randomSplit([.8,.2])

In [19]:
model1 = mypipeline.fit(training)

In [20]:
result = model1.transform(test)
result.show(2)

+----+-----+-----+-----+---+---+---+---+-----+--------------------+------------------+
|  X1|   X2|   X3|   X4| X5| X6| X7| X8|   Y2|         allfeatures|        prediction|
+----+-----+-----+-----+---+---+---+---+-----+--------------------+------------------+
|0.62|808.5|367.5|220.5|3.5|  2|0.1|  1|14.34|[0.62,808.5,367.5...|15.452197410952223|
|0.62|808.5|367.5|220.5|3.5|  2|0.1|  5|14.24|[0.62,808.5,367.5...|15.567381596829804|
+----+-----+-----+-----+---+---+---+---+-----+--------------------+------------------+
only showing top 2 rows



In [22]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(predictionCol="prediction", labelCol="Y2")
eval.evaluate(result)

2.890978875110251

In [23]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(predictionCol="prediction", labelCol="Y2", metricName="rmse")
eval.evaluate(result)

2.890978875110251

In [24]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(predictionCol="prediction", labelCol="Y2", metricName = "mae")
eval.evaluate(result)

2.118836785750091

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(predictionCol="prediction", labelCol="Y2", metricName = "r2")
eval.evaluate(result)

0.9095707612732573

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(predictionCol="prediction", labelCol="Y2", metricName = "mse")
eval.evaluate(result)

8.357758856333732

# Decision Tree Regressor

In [41]:
from pyspark.ml.regression import DecisionTreeRegressor
tree = DecisionTreeRegressor(featuresCol = "allfeatures", labelCol="Y2")

In [42]:
from pyspark.ml import Pipeline

In [43]:
mypipeline1 = Pipeline(stages = [vec_ass, tree])

In [44]:
training, test = data.randomSplit([.8,.2])

In [45]:
model2 = mypipeline1.fit(training)

In [46]:
result1 = model2.transform(test)
result1.show(2)

+----+-----+-----+-----+---+---+----+---+-----+--------------------+------------------+
|  X1|   X2|   X3|   X4| X5| X6|  X7| X8|   Y2|         allfeatures|        prediction|
+----+-----+-----+-----+---+---+----+---+-----+--------------------+------------------+
|0.62|808.5|367.5|220.5|3.5|  2| 0.1|  2|14.37|[0.62,808.5,367.5...|14.268750000000002|
|0.62|808.5|367.5|220.5|3.5|  2|0.25|  2|15.32|[0.62,808.5,367.5...|15.370714285714286|
+----+-----+-----+-----+---+---+----+---+-----+--------------------+------------------+
only showing top 2 rows



In [47]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(predictionCol="prediction", labelCol="Y2")
eval.evaluate(result1)

2.1093686213366194

In [48]:
from pyspark.ml.evaluation import RegressionEvaluator
eval = RegressionEvaluator(predictionCol="prediction", labelCol="Y2", metricName = "r2")
eval.evaluate(result1)

0.9555068667755069