In [0]:
from pyspark.sql import SparkSession

In [0]:
# Get (or reuse) the Spark session for this notebook.
# NOTE: despite the name, `sc` is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("ML")
    .getOrCreate()
)

In [0]:
# Load the semicolon-delimited CSV; the first line supplies the column names
# and the column types are inferred from the values.
data = sc.read.csv(
    '/FileStore/tables/combined_cycle_power_plant.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the loaded DataFrame to check the header and the parsed columns.
data.show()

+-----------+--------------+----------------+-----------------+-------------+
|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|energy_output|
+-----------+--------------+----------------+-----------------+-------------+
|       9.59|         38.56|         1017.01|             60.1|        481.3|
|      12.04|         42.34|         1019.72|            94.67|       465.36|
|      13.87|         45.08|         1024.42|            81.69|       465.48|
|      13.72|          54.3|         1017.89|            79.08|       467.05|
|      15.14|         49.64|         1023.78|             75.0|       463.58|
|      23.63|         60.93|         1006.41|            83.06|        439.9|
|      22.99|         61.02|         1009.88|            81.56|       440.85|
|      12.85|          40.0|         1015.89|            68.85|       463.74|
|      29.88|         68.08|         1011.14|            55.78|       429.33|
|      23.22|         66.56|         1002.47|            85.39| 

In [0]:
from pyspark.sql.functions import corr

In [0]:
# Correlation between the temperature predictor and the energy_output label.
data.select(
    corr('temperature', 'energy_output')
).show()

+--------------------------------+
|corr(temperature, energy_output)|
+--------------------------------+
|             -0.9481284704167571|
+--------------------------------+



In [0]:
# Correlation between the exhaust_vacuum predictor and the energy_output label.
data.select(
    corr('exhaust_vacuum', 'energy_output')
).show()

+-----------------------------------+
|corr(exhaust_vacuum, energy_output)|
+-----------------------------------+
|                -0.8697803096577876|
+-----------------------------------+



In [0]:
# Correlation between the ambient_pressure predictor and the energy_output label.
data.select(
    corr('ambient_pressure', 'energy_output')
).show()

+-------------------------------------+
|corr(ambient_pressure, energy_output)|
+-------------------------------------+
|                    0.518429027361572|
+-------------------------------------+



In [0]:
# Correlation between the relative_humidity predictor and the energy_output label.
data.select(
    corr('relative_humidity', 'energy_output')
).show()

+--------------------------------------+
|corr(relative_humidity, energy_output)|
+--------------------------------------+
|                    0.3897940997901016|
+--------------------------------------+



In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# Assembler that packs the four predictor columns into a single vector column
# named 'feature', which is the column the regression model reads.
ass = VectorAssembler(
    inputCols=[
        'temperature',
        'exhaust_vacuum',
        'ambient_pressure',
        'relative_humidity',
    ],
    outputCol='feature',
)

In [0]:
# Append the assembled 'feature' vector column to the raw data.
finaldata = ass.transform(data)

In [0]:
# Preview the assembled frame: the original columns plus the 'feature' vector.
finaldata.show()

+-----------+--------------+----------------+-----------------+-------------+--------------------+
|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|energy_output|             feature|
+-----------+--------------+----------------+-----------------+-------------+--------------------+
|       9.59|         38.56|         1017.01|             60.1|        481.3|[9.59,38.56,1017....|
|      12.04|         42.34|         1019.72|            94.67|       465.36|[12.04,42.34,1019...|
|      13.87|         45.08|         1024.42|            81.69|       465.48|[13.87,45.08,1024...|
|      13.72|          54.3|         1017.89|            79.08|       467.05|[13.72,54.3,1017....|
|      15.14|         49.64|         1023.78|             75.0|       463.58|[15.14,49.64,1023...|
|      23.63|         60.93|         1006.41|            83.06|        439.9|[23.63,60.93,1006...|
|      22.99|         61.02|         1009.88|            81.56|       440.85|[22.99,61.02,1009...|
|      12.

In [0]:
# Keep only what the model needs: the feature vector and the label.
# Note that this rebinds `finaldata`, so the raw predictor columns are no
# longer available on it after this cell.
finaldata = finaldata.select('feature', 'energy_output')

In [0]:
# Preview the reduced frame (feature vector and label only).
finaldata.show()

+--------------------+-------------+
|             feature|energy_output|
+--------------------+-------------+
|[9.59,38.56,1017....|        481.3|
|[12.04,42.34,1019...|       465.36|
|[13.87,45.08,1024...|       465.48|
|[13.72,54.3,1017....|       467.05|
|[15.14,49.64,1023...|       463.58|
|[23.63,60.93,1006...|        439.9|
|[22.99,61.02,1009...|       440.85|
|[12.85,40.0,1015....|       463.74|
|[29.88,68.08,1011...|       429.33|
|[23.22,66.56,1002...|       437.11|
|[9.02,38.08,1019....|       478.32|
|[26.12,75.6,1017....|       439.68|
|[16.66,36.71,1013...|       466.07|
|[29.56,52.84,1006...|       436.46|
|[30.0,61.5,1009.4...|        435.6|
|[30.39,70.98,1007...|       435.48|
|[20.88,47.45,1007...|       449.34|
|[28.14,61.47,1009...|       435.35|
|[18.86,46.48,1007...|        452.0|
|[31.85,68.3,1014....|       428.72|
+--------------------+-------------+
only showing top 20 rows



In [0]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore
# the fitted coefficients and the predictions, are reproducible on a fresh
# run instead of changing on every execution.
train_data, test_data = finaldata.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Total number of rows, for comparison with the train/test sizes.
finaldata.count()

Out[20]: 9568

In [0]:
# Number of rows in the training split (weight 0.7 in the split).
train_data.count()

Out[21]: 6655

In [0]:
# Number of rows in the held-out test split (weight 0.3 in the split).
test_data.count()

Out[22]: 2913

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Linear regression estimator: reads predictors from the 'feature' vector
# column and uses 'energy_output' as the label.
algo = LinearRegression(
    featuresCol='feature',
    labelCol='energy_output',
)

In [0]:
# Fit the regression on the training split only.
model = algo.fit(train_data)

In [0]:
# Fitted weights, one per entry of the 'feature' vector.
model.coefficients

Out[28]: DenseVector([-1.9788, -0.234, 0.0597, -0.1588])

In [0]:
# Fitted intercept (bias) term of the regression.
model.intercept

Out[29]: 457.1244944549918

In [0]:
# Score the held-out split; transform() appends a 'prediction' column next to
# the existing feature and label columns.
ypred = model.transform(test_data)

In [0]:
# Compare the actual energy_output with the prediction for the first 20 test rows.
ypred.show(20)

+--------------------+-------------+------------------+
|             feature|energy_output|        prediction|
+--------------------+-------------+------------------+
|[1.81,39.42,1026....|       490.55| 493.4144615530893|
|[2.34,39.42,1028....|       490.34|493.61560787261044|
|[2.58,39.42,1028....|       488.69|493.25643120387076|
|[3.2,41.31,997.67...|       489.86| 485.0031583015412|
|[3.21,38.44,1016....|       491.35| 488.7876262083676|
|[3.38,39.64,1011....|       488.92| 488.6310042664904|
|[3.4,39.64,1011.1...|       459.86|488.24654105617276|
|[3.91,35.47,1016....|       488.67| 488.1478598074893|
|[3.94,39.9,1008.0...|       488.81|  484.703486261275|
|[3.95,35.47,1017....|       488.64| 488.2775532091493|
|[3.98,35.47,1017....|       489.64| 487.9478774311593|
|[3.99,39.64,1011....|       492.06| 487.0809170469885|
|[4.04,35.47,1017....|       486.86| 487.7162828395649|
|[4.08,35.19,1018....|       489.44|486.24071260675873|
|[4.15,39.9,1007.6...|        489.8|484.54743622

In [0]:
# Features-only view of the test split (label removed), to show the model
# can predict without the label column being present.
test = test_data.select('feature')

In [0]:
# Preview the first 5 rows of the features-only test frame.
test.show(5)

+--------------------+
|             feature|
+--------------------+
|[1.81,39.42,1026....|
|[2.34,39.42,1028....|
|[2.58,39.42,1028....|
|[3.2,41.31,997.67...|
|[3.21,38.44,1016....|
+--------------------+
only showing top 5 rows



In [0]:
# Predict from the features-only frame; only the 'feature' column is required.
model.transform(test).show()

+--------------------+------------------+
|             feature|        prediction|
+--------------------+------------------+
|[1.81,39.42,1026....| 493.4144615530893|
|[2.34,39.42,1028....|493.61560787261044|
|[2.58,39.42,1028....|493.25643120387076|
|[3.2,41.31,997.67...| 485.0031583015412|
|[3.21,38.44,1016....| 488.7876262083676|
|[3.38,39.64,1011....| 488.6310042664904|
|[3.4,39.64,1011.1...|488.24654105617276|
|[3.91,35.47,1016....| 488.1478598074893|
|[3.94,39.9,1008.0...|  484.703486261275|
|[3.95,35.47,1017....| 488.2775532091493|
|[3.98,35.47,1017....| 487.9478774311593|
|[3.99,39.64,1011....| 487.0809170469885|
|[4.04,35.47,1017....| 487.7162828395649|
|[4.08,35.19,1018....|486.24071260675873|
|[4.15,39.9,1007.6...|484.54743622598437|
|[4.27,39.64,1010....|486.29298971154685|
|[4.49,38.44,1015....| 487.6585587673985|
|[4.49,40.27,1012....|  487.096780796097|
|[4.56,40.27,1011....|486.31196233477135|
|[4.59,39.33,1010....|487.79894345738603|
+--------------------+------------

In [0]:
# Single hand-written observation to score, with the same predictor column
# names the assembler expects.
# NOTE(review): 4585 (ambient_pressure) and 895 (relative_humidity) are far
# outside the values visible in the data preview (about 1000-1025 and below
# 100 respectively), so the prediction for this row is an extrapolation.
# Confirm these values are intended.
test2 = sc.createDataFrame(
    [[45, 78, 4585, 895]],
    schema=[
        'temperature',
        'exhaust_vacuum',
        'ambient_pressure',
        'relative_humidity',
    ],
)

In [0]:
# Build the 'feature' vector for the hand-written observation.
# Any existing 'feature' column is dropped first so this cell can be re-run:
# without the drop, a second execution fails because the assembler's output
# column already exists on `test2`. drop() is a no-op when the column is
# absent, so the first run is unaffected.
test2 = ass.transform(test2.drop('feature'))

In [0]:
# Score the hand-written observation and display its prediction.
model.transform(test2).show()

+-----------+--------------+----------------+-----------------+--------------------+-----------------+
|temperature|exhaust_vacuum|ambient_pressure|relative_humidity|             feature|       prediction|
+-----------+--------------+----------------+-----------------+--------------------+-----------------+
|         45|            78|            4585|              895|[45.0,78.0,4585.0...|481.4994184087565|
+-----------+--------------+----------------+-----------------+--------------------+-----------------+

