### Solar Radiation Prediction Task from NASA Hackathon by Implementing Linear Regression

###  By: Sara Khosravi
-----------------------------------------
###############################################################################################################################

In [1]:
#set environment
import os
import sys
 
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
# In below two lines, use /usr/bin/python2.7 if you want to use Python 2
os.environ["PYSPARK_PYTHON"] = "/usr/local/anaconda/bin/python" 
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/local/anaconda/bin/python"
sys.path.insert(0, os.environ["PYLIB"] +"/py4j-0.10.4-src.zip")
sys.path.insert(0, os.environ["PYLIB"] +"/pyspark.zip")

In [2]:
#import Sparksession driver
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("Regression of Solar Of Prediction") \
    .getOrCreate()

In [5]:
df = spark.read.csv('data/Solar_Prediction.csv',inferSchema=True, header=True)
df.show()


+--------------------+----------+---------+-----------+--------+--------+----------------------+-----+--------------------+--------------------+
|                Date|  UNIXTime|Radiation|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|        sunrise_time|         sunset_time|
+--------------------+----------+---------+-----------+--------+--------+----------------------+-----+--------------------+--------------------+
|2016-09-29 23:55:...|1475229326|     1.21|         48|   30.46|      59|                177.39| 5.62|2016-09-29 06:13:...|2016-09-29 18:13:...|
|2016-09-29 23:50:...|1475229023|     1.21|         48|   30.46|      58|                176.78| 3.37|2016-09-29 06:13:...|2016-09-29 18:13:...|
|2016-09-29 23:45:...|1475228726|     1.23|         48|   30.46|      57|                158.75| 3.37|2016-09-29 06:13:...|2016-09-29 18:13:...|
|2016-09-29 23:40:...|1475228421|     1.21|         48|   30.46|      60|                137.71| 3.37|2016-09-29 06:13:...|2016-09

In [7]:
#renaming the columns
#df = df.toDF('UNIXTime','Data','Time','Radiation','Temperature','Pressure','Humidity','WindDirection','Speed','TimeSunRise','TimeSunSet')
#df.show(5)

In [8]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- UNIXTime: integer (nullable = true)
 |-- Radiation: double (nullable = true)
 |-- Temperature: integer (nullable = true)
 |-- Pressure: double (nullable = true)
 |-- Humidity: integer (nullable = true)
 |-- WindDirection(Degrees): double (nullable = true)
 |-- Speed: double (nullable = true)
 |-- sunrise_time: string (nullable = true)
 |-- sunset_time: string (nullable = true)



In [9]:
#Check for missing values
for col in df.columns:
    print("no. of cells in column", col, "with null values:", df.filter(df[col].isNull()).count())

no. of cells in column Date with null values: 0
no. of cells in column UNIXTime with null values: 0
no. of cells in column Radiation with null values: 0
no. of cells in column Temperature with null values: 0
no. of cells in column Pressure with null values: 0
no. of cells in column Humidity with null values: 0
no. of cells in column WindDirection(Degrees) with null values: 0
no. of cells in column Speed with null values: 0
no. of cells in column sunrise_time with null values: 0
no. of cells in column sunset_time with null values: 0


In [10]:
# Create a vector from indepenndent variables 
# and save it in only one variable, here is features
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Temperature','Pressure','Humidity','WindDirection(Degrees)','Speed'], 
                            outputCol="features")
# Create a vector using independent variables and a new column as features containing the vector
feature_vec=assembler.transform(df)
# Show 5 items from features column
feature_vec.select("features",'Radiation').take(5)

[Row(features=DenseVector([48.0, 30.46, 59.0, 177.39, 5.62]), Radiation=1.21),
 Row(features=DenseVector([48.0, 30.46, 58.0, 176.78, 3.37]), Radiation=1.21),
 Row(features=DenseVector([48.0, 30.46, 57.0, 158.75, 3.37]), Radiation=1.23),
 Row(features=DenseVector([48.0, 30.46, 60.0, 137.71, 3.37]), Radiation=1.21),
 Row(features=DenseVector([48.0, 30.46, 62.0, 104.95, 5.62]), Radiation=1.17)]

In [11]:
# Shows 5 first rows of the feature_vec
feature_vec.show(5)

+--------------------+----------+---------+-----------+--------+--------+----------------------+-----+--------------------+--------------------+--------------------+
|                Date|  UNIXTime|Radiation|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|        sunrise_time|         sunset_time|            features|
+--------------------+----------+---------+-----------+--------+--------+----------------------+-----+--------------------+--------------------+--------------------+
|2016-09-29 23:55:...|1475229326|     1.21|         48|   30.46|      59|                177.39| 5.62|2016-09-29 06:13:...|2016-09-29 18:13:...|[48.0,30.46,59.0,...|
|2016-09-29 23:50:...|1475229023|     1.21|         48|   30.46|      58|                176.78| 3.37|2016-09-29 06:13:...|2016-09-29 18:13:...|[48.0,30.46,58.0,...|
|2016-09-29 23:45:...|1475228726|     1.23|         48|   30.46|      57|                158.75| 3.37|2016-09-29 06:13:...|2016-09-29 18:13:...|[48.0,30.46,57.0,...|
|201

In [12]:
# Split the data into train and test sets
train_data, test_data = feature_vec.randomSplit([.8,.2],seed=1234)

In [13]:
train_data.show(5)

+--------------------+----------+---------+-----------+--------+--------+----------------------+-----+--------------------+--------------------+--------------------+
|                Date|  UNIXTime|Radiation|Temperature|Pressure|Humidity|WindDirection(Degrees)|Speed|        sunrise_time|         sunset_time|            features|
+--------------------+----------+---------+-----------+--------+--------+----------------------+-----+--------------------+--------------------+--------------------+
|2016-09-01 00:25:...|1472725505|     2.21|         51|   30.43|     103|                144.12| 18.0|2016-09-01 06:07:...|2016-09-01 18:38:...|[51.0,30.43,103.0...|
|2016-09-01 00:30:...|1472725809|     2.25|         51|   30.43|     103|                 67.42|11.25|2016-09-01 06:07:...|2016-09-01 18:38:...|[51.0,30.43,103.0...|
|2016-09-01 00:45:...|1472726704|     2.15|         51|   30.43|     103|                 67.85|  4.5|2016-09-01 06:07:...|2016-09-01 18:38:...|[51.0,30.43,103.0...|
|201

In [14]:
# Import Linear Regression class
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(labelCol='Radiation', featuresCol="features",
                     regParam=0.0001, elasticNetParam=0.0)

# Train model with Training Data
lrModel = lr.fit(train_data)
print(lrModel.intercept, lrModel.coefficients)

20341.206320837908 [38.279965202842774,-725.8491917678133,-0.26848256836304957,-0.2628535399108965,7.946736742829567]


In [15]:
# Predict model using test data
predictions = lrModel.transform(test_data)
predictions.printSchema()

root
 |-- Date: string (nullable = true)
 |-- UNIXTime: integer (nullable = true)
 |-- Radiation: double (nullable = true)
 |-- Temperature: integer (nullable = true)
 |-- Pressure: double (nullable = true)
 |-- Humidity: integer (nullable = true)
 |-- WindDirection(Degrees): double (nullable = true)
 |-- Speed: double (nullable = true)
 |-- sunrise_time: string (nullable = true)
 |-- sunset_time: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: double (nullable = true)



In [16]:
predictions.select('Radiation','prediction').show()

+---------+------------------+
|Radiation|        prediction|
+---------+------------------+
|     2.58| 247.3300314748558|
|     2.83| 209.4283196684737|
|     2.16|203.44503750406147|
|     3.55| 191.7087484505646|
|     2.87|  278.437792071938|
|     2.52| 216.9318812570782|
|     2.12| 176.8115612832662|
|     2.26|185.03025588023593|
|     2.12|  206.908010499581|
|     2.03|148.05639166726542|
|     3.35| 168.5431965679236|
|     2.02| 76.58680410242596|
|    87.65| 177.9261320596852|
|   183.46|184.93815789514338|
|   555.78|315.88391002389835|
|   386.25| 312.9828828396676|
|   838.66|  323.199151029723|
|   724.35|304.87072942216037|
|   999.94| 436.4168536997531|
|  1069.16| 576.0632839798709|
+---------+------------------+
only showing top 20 rows



In [17]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Radiation', metricName='mse')
evaluator.evaluate(predictions)

43848.879177361196

In [18]:
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Radiation', metricName='r2')
evaluator.evaluate(predictions)

0.5708200456871915

In [19]:
#Grid Search
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Using Grid Search we use various parameters of the model to 
# test and get the best model
paramGrid = (ParamGridBuilder()\
             .addGrid(lr.regParam,[0.001,0.01,0.1,1])\
             .addGrid(lr.elasticNetParam,[0.0,0.5,1.0])\
             .build())

# Create 4-fold CrossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=4)

cvModel = cv.fit(train_data)

In [23]:
list(zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps()))

[(0.564015777053631,
  {Param(parent='LinearRegression_4d9f93170f1a6316188a', name='regParam', doc='regularization parameter (>= 0).'): 0.001,
   Param(parent='LinearRegression_4d9f93170f1a6316188a', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0}),
 (0.5640157745818117,
  {Param(parent='LinearRegression_4d9f93170f1a6316188a', name='regParam', doc='regularization parameter (>= 0).'): 0.001,
   Param(parent='LinearRegression_4d9f93170f1a6316188a', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5}),
 (0.5640157734169582,
  {Param(parent='LinearRegression_4d9f93170f1a6316188a', name='regParam', doc='regularization parameter (>= 0).'): 0.001,
   Param(parent='LinearRegression_4d9f93170f1a6316188a', name='elasticNetParam', doc='the ElasticNet mixing paramet

In [53]:
#Best Model Params
score_params_list = list(zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps()))
max(score_params_list,key=lambda item:item[0])

(0.5025963341525772,
 {Param(parent='LinearRegression_432b92b7d486ff2e6f29', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='LinearRegression_432b92b7d486ff2e6f29', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0})

In [27]:
# Since the Accuracy score is low, we should test another models
# Here we try Random forest Regression
from pyspark.ml.regression import RandomForestRegressor

# Create a RandomForest model
rf = RandomForestRegressor(labelCol='Radiation', featuresCol='features',
                           maxDepth=15, minInfoGain=0.001, 
                           seed=0, numTrees=110)

rfModel = cv.fit(train_data)

# Get prediction
predictions = lrModel.transform(test_data)

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Radiation', metricName='r2')
evaluator.evaluate(predictions)

0.5708200456871915

In [29]:
# Using Grid Search we use various parameters of the model to test and get the best model
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

model = RandomForestRegressor(labelCol='Radiation', featuresCol='features',
                              minInfoGain=0.001, seed=0)
paramGrid = (ParamGridBuilder()\
             .addGrid(rf.maxDepth,[13,14,15])\
             .addGrid(rf.numTrees,[100,110,120])\
             .build())

evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Radiation', metricName='r2')

# Create 4-fold CrossValidator
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=4)

cvModel = cv.fit(train_data)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1035, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 883, in send_command
    response = connection.send_command(command)
  File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1040, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/hdp/current/spark2-client/python/lib/py4j-0.10.4-src.zip/

Py4JError: org does not exist in the JVM

In [30]:
#Best Model Params
score_params_list = list(zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps()))
max(score_params_list,key=lambda item:item[0])

(0.5640158020462434,
 {Param(parent='LinearRegression_4d9f93170f1a6316188a', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='LinearRegression_4d9f93170f1a6316188a', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0})

In [None]:
# BY implementiong Random Forest Regression and Linear Regression Models, we got 57% accuracy.