In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the Spark session used by every later cell (application name "ML").
# Note: `sc` holds a SparkSession here, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("ML")
    .getOrCreate()
)

In [0]:
# Read the power-plant CSV: fields are ';'-separated, the first row holds the
# column names, and Spark is asked to infer the column types.
data = sc.read.csv(
    '/FileStore/tables/combined_cycle_power_plant.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [0]:
# Peek at the first five rows of the loaded data.
data.show(n=5)

+-----------+--------------+----------------+-----------------+-------------+
|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|energy_output|
+-----------+--------------+----------------+-----------------+-------------+
|       9.59|         38.56|         1017.01|             60.1|        481.3|
|      12.04|         42.34|         1019.72|            94.67|       465.36|
|      13.87|         45.08|         1024.42|            81.69|       465.48|
|      13.72|          54.3|         1017.89|            79.08|       467.05|
|      15.14|         49.64|         1023.78|             75.0|       463.58|
+-----------+--------------+----------------+-----------------+-------------+
only showing top 5 rows



In [0]:
from pyspark.sql.functions import corr

In [0]:
# Correlation of temperature with the target column energy_output.
data.select(
    corr('temperature', 'energy_output')
).show()

+--------------------------------+
|corr(temperature, energy_output)|
+--------------------------------+
|             -0.9481284704167571|
+--------------------------------+



In [0]:
# Correlation of exhaust_vacuum with the target column energy_output.
data.select(
    corr('exhaust_vacuum', 'energy_output')
).show()

+-----------------------------------+
|corr(exhaust_vacuum, energy_output)|
+-----------------------------------+
|                -0.8697803096577876|
+-----------------------------------+



In [0]:
# Correlation of relative_humidity with the target column energy_output.
data.select(
    corr('relative_humidity', 'energy_output')
).show()

+--------------------------------------+
|corr(relative_humidity, energy_output)|
+--------------------------------------+
|                    0.3897940997901016|
+--------------------------------------+



In [0]:
# Correlation of ambient_pressure with the target column energy_output.
data.select(
    corr('ambient_pressure', 'energy_output')
).show()

+-------------------------------------+
|corr(ambient_pressure, energy_output)|
+-------------------------------------+
|                    0.518429027361572|
+-------------------------------------+



In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names of the loaded data (used below to choose the assembler inputs).
data.columns

Out[15]: ['temperature',
 'exhaust_vacuum',
 'ambient_pressure',
 'relative_humidity',
 'energy_output']

In [0]:
# Assembler that packs the four predictor columns into one vector column
# called 'feature'; energy_output is left out because it is the label.
ass = VectorAssembler(
    outputCol='feature',
    inputCols=[
        'temperature',
        'exhaust_vacuum',
        'ambient_pressure',
        'relative_humidity',
    ],
)

In [0]:
# Apply the assembler: the result keeps the original columns and adds the 'feature' vector column.
finaldata=ass.transform(data)

In [0]:
# Inspect the first five assembled rows (original columns plus 'feature').
finaldata.show(n=5)

+-----------+--------------+----------------+-----------------+-------------+--------------------+
|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|energy_output|             feature|
+-----------+--------------+----------------+-----------------+-------------+--------------------+
|       9.59|         38.56|         1017.01|             60.1|        481.3|[9.59,38.56,1017....|
|      12.04|         42.34|         1019.72|            94.67|       465.36|[12.04,42.34,1019...|
|      13.87|         45.08|         1024.42|            81.69|       465.48|[13.87,45.08,1024...|
|      13.72|          54.3|         1017.89|            79.08|       467.05|[13.72,54.3,1017....|
|      15.14|         49.64|         1023.78|             75.0|       463.58|[15.14,49.64,1023...|
+-----------+--------------+----------------+-----------------+-------------+--------------------+
only showing top 5 rows



In [0]:
# Keep only what the regression needs: the feature vector and the label.
finaldata = finaldata.select(
    'feature',
    'energy_output',
)

In [0]:
# Confirm that only the 'feature' and 'energy_output' columns remain.
finaldata.show(n=5)

+--------------------+-------------+
|             feature|energy_output|
+--------------------+-------------+
|[9.59,38.56,1017....|        481.3|
|[12.04,42.34,1019...|       465.36|
|[13.87,45.08,1024...|       465.48|
|[13.72,54.3,1017....|       467.05|
|[15.14,49.64,1023...|       463.58|
+--------------------+-------------+
only showing top 5 rows



In [0]:
# Split the data roughly 70/30 into training and test sets.
# A fixed seed keeps the partition (and therefore the fitted model and every
# prediction derived from it) the same on each Restart & Run All; without it
# each run samples a different split.
# NOTE: outputs captured in this notebook before the seed was added (row
# counts, coefficients, predictions) will differ after a re-run.
train_data, test_data = finaldata.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Total number of rows before the split.
finaldata.count()

Out[23]: 9568

In [0]:
# Number of rows that landed in the training split.
train_data.count()

Out[24]: 6726

In [0]:
# Number of rows that landed in the test split.
test_data.count()

Out[25]: 2842

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Linear-regression estimator: predicts 'energy_output' from the assembled
# 'feature' vector; all other settings are left at their defaults.
algo = LinearRegression(
    labelCol='energy_output',
    featuresCol='feature',
)

In [0]:
# Fit the regression on the training split only; the test split is held out for the predictions below.
model=algo.fit(train_data)

In [0]:
# Coefficient vector of the fitted model (four entries, matching the four assembled inputs).
model.coefficients

Out[32]: DenseVector([-1.9808, -0.2362, 0.0606, -0.1552])

In [0]:
# Intercept term of the fitted model.
model.intercept

Out[33]: 456.08391273741415

In [0]:
# Score the held-out test split; the result keeps 'feature' and 'energy_output' and adds a 'prediction' column.
ypred=model.transform(test_data)

In [0]:
# Compare actual energy_output with the model's prediction for ten test rows.
ypred.show(n=10)

+--------------------+-------------+------------------+
|             feature|energy_output|        prediction|
+--------------------+-------------+------------------+
|[2.34,39.42,1028....|       490.34| 493.6547389494764|
|[2.58,39.42,1028....|       488.69| 493.2929788071793|
|[2.64,39.64,1011....|       481.29| 489.5351671307614|
|[2.71,39.42,1026....|        489.3|491.03761979901026|
|[3.21,38.44,1016....|       491.35| 488.8752033861091|
|[3.26,41.31,996.3...|       489.38|484.73005118534303|
|[3.51,35.47,1017....|       489.07|488.98663257454524|
|[3.63,38.44,1016....|       487.87|487.83694029363596|
|[3.74,35.19,1018....|        490.5| 486.7543645807982|
|[3.85,35.47,1016....|       489.78| 488.4617520387491|
+--------------------+-------------+------------------+
only showing top 10 rows



In [0]:
# Look at the first five rows of the test split (features plus label).
test_data.show(n=5)

+--------------------+-------------+
|             feature|energy_output|
+--------------------+-------------+
|[2.34,39.42,1028....|       490.34|
|[2.58,39.42,1028....|       488.69|
|[2.64,39.64,1011....|       481.29|
|[2.71,39.42,1026....|        489.3|
|[3.21,38.44,1016....|       491.35|
+--------------------+-------------+
only showing top 5 rows



In [0]:
# Drop the label so the model is scored on the feature vectors alone.
test=test_data.select('feature')

In [0]:
# Confirm the label-free frame holds only the 'feature' column.
test.show(n=5)

+--------------------+
|             feature|
+--------------------+
|[2.34,39.42,1028....|
|[2.58,39.42,1028....|
|[2.64,39.64,1011....|
|[2.71,39.42,1026....|
|[3.21,38.44,1016....|
+--------------------+
only showing top 5 rows



In [0]:
# Predict from the label-free test frame and display the result
# (show() with no argument prints at most 20 rows).
(
    model
    .transform(test)
    .show()
)

+--------------------+------------------+
|             feature|        prediction|
+--------------------+------------------+
|[2.34,39.42,1028....| 493.6547389494764|
|[2.58,39.42,1028....| 493.2929788071793|
|[2.64,39.64,1011....| 489.5351671307614|
|[2.71,39.42,1026....|491.03761979901026|
|[3.21,38.44,1016....| 488.8752033861091|
|[3.26,41.31,996.3...|484.73005118534303|
|[3.51,35.47,1017....|488.98663257454524|
|[3.63,38.44,1016....|487.83694029363596|
|[3.74,35.19,1018....| 486.7543645807982|
|[3.85,35.47,1016....| 488.4617520387491|
|[3.92,41.31,999.2...|484.33435449525655|
|[3.96,35.47,1016....| 488.4773406540468|
|[3.99,39.64,1011....| 487.1496640374085|
|[4.0,39.9,1009.64...| 484.8455864018986|
|[4.47,35.19,1018....| 486.2768141888738|
|[4.56,40.27,1011....|486.36604192090766|
|[4.65,35.19,1018....| 485.5609032708584|
|[4.7,39.9,1007.45...| 484.2794958279254|
|[4.81,42.85,1013....|484.39267488070027|
|[4.84,38.5,1011.9...|486.06562541967565|
+--------------------+------------------+

In [0]:
# A single hand-written observation to score, built with the same four
# predictor column names the assembler expects.
test2 = sc.createDataFrame(
    [[15, 78, 1585, 795]],
    schema=[
        'temperature',
        'exhaust_vacuum',
        'ambient_pressure',
        'relative_humidity',
    ],
)

In [0]:
# Add the assembled feature vector to the hand-written observation.
# The assembler refuses to run when its output column already exists, so
# re-running this cell used to fail on the second execution. Dropping the
# output column first makes the cell idempotent: drop() is a no-op when the
# column is absent (first run) and removes the stale vector otherwise.
test2 = ass.transform(test2.drop(ass.getOutputCol()))

In [0]:
# Score the hand-written observation and display its prediction.
(
    model
    .transform(test2)
    .show()
)

+-----------+--------------+----------------+-----------------+--------------------+-----------------+
|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|             feature|       prediction|
+-----------+--------------+----------------+-----------------+--------------------+-----------------+
|         15|            78|            1585|              795|[15.0,78.0,1585.0...|380.5891300335195|
+-----------+--------------+----------------+-----------------+--------------------+-----------------+

