In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("yarn") \
    .appName("pantheon") \
    .getOrCreate()

In [24]:
stazioneDf = spark.read.format('com.databricks.spark.csv')\
.options(delimiter=';',header='true', inferschema='true').load('input/pantheon20190612-stazione.csv')
stazioneDf.take(1)

[Row(id_dato=1811253, data_ora=datetime.datetime(2018, 10, 12, 14, 25, 15), temp1_media=25.99, temp1_min=25.93, temp1_max=26.08, ur1_media=56.69, ur1_min=52.56, ur1_max=58.97, temp1_ur1_n_letture=5, pioggia_mm=0.0, rad W/mq=75.0, rad_n_letture=5, wind_dir=6, wind_dir_n_letture=5, wind_speed_media=0.3, wind_speed_max=15.62, wind_speed_n_letture=25, pressione_mbar=1085.94, pressione_standard_mbar=1118.68, pressione_n_letture=5, rad W/mq array='{}')]

In [3]:
stazioneDf.printSchema()

root
 |-- id_dato: integer (nullable = true)
 |-- data_ora: timestamp (nullable = true)
 |-- temp1_media: double (nullable = true)
 |-- temp1_min: double (nullable = true)
 |-- temp1_max: double (nullable = true)
 |-- ur1_media: double (nullable = true)
 |-- ur1_min: double (nullable = true)
 |-- ur1_max: double (nullable = true)
 |-- temp1_ur1_n_letture: integer (nullable = true)
 |-- pioggia_mm: double (nullable = true)
 |-- rad W/mq: double (nullable = true)
 |-- rad_n_letture: integer (nullable = true)
 |-- wind_dir: integer (nullable = true)
 |-- wind_dir_n_letture: integer (nullable = true)
 |-- wind_speed_media: double (nullable = true)
 |-- wind_speed_max: double (nullable = true)
 |-- wind_speed_n_letture: integer (nullable = true)
 |-- pressione_mbar: double (nullable = true)
 |-- pressione_standard_mbar: double (nullable = true)
 |-- pressione_n_letture: integer (nullable = true)
 |-- rad W/mq array: string (nullable = true)



## Verifica dei valori nulli

In [25]:
stazioneDf.filter(stazioneDf.id_dato.isNotNull()).count()

30802

In [26]:
stazioneDf.filter(stazioneDf.temp1_media.isNotNull()).count()

30802

In [27]:
stazioneDf.filter(stazioneDf.ur1_media.isNotNull()).count()

30802

In [28]:
stazioneDf.filter(stazioneDf.wind_speed_media.isNotNull()).count()

22525

In [29]:
stazioneDf.filter(stazioneDf['rad W/mq'].isNotNull()).count()

20206

## Calcolo valori medi

In [30]:
from pyspark.sql.functions import avg
def fill_with_mean(df, include=set()): 
    stats = df.agg(*(
        avg(c).alias(c) for c in df.columns if c in include
    ))
    return df.na.fill(stats.first().asDict())

In [31]:
stazioneDf = fill_with_mean(stazioneDf, ['wind_speed_media'])

In [32]:
stazioneDf.filter(stazioneDf.wind_speed_media.isNotNull()).count()

30802

In [33]:
#n.b per esprimere più condizioni in and o or bisogna racchiudere le condizioni tra parentesi
from pyspark.sql.functions import hour
avgT=stazioneDf.filter((stazioneDf['rad W/mq'].isNotNull())& \
                   ((hour(stazioneDf['data_ora'])<=18)|(hour(stazioneDf['data_ora'])>5)))
avgT=stazioneDf.filter((stazioneDf['rad W/mq'].isNotNull())& \
                   ((hour(stazioneDf['data_ora'])<=18)|(hour(stazioneDf['data_ora'])>5)))
from pyspark.sql.functions import mean
stats=avgT.select([mean('rad W/mq')]).first()
avgRadValue= stats.asDict()
avgRadValue = avgRadValue['avg(rad W/mq)']

In [34]:
avgRadValue

113.87738295555782

In [35]:
from pyspark.sql.functions import when,hour

stazioneDf = stazioneDf.withColumn('rad W/mq',when((stazioneDf['rad W/mq'].isNull())& ((hour(stazioneDf['data_ora'])>18)|\
                                               (hour(stazioneDf['data_ora'])<=5)),0)\
                               .when((stazioneDf['rad W/mq'].isNull())& ((hour(stazioneDf['data_ora'])<=18)|\
                                               (hour(stazioneDf['data_ora'])>5)),avgRadValue)\
                               .otherwise(stazioneDf['rad W/mq']))

In [36]:
stazioneDf.filter((stazioneDf['rad W/mq'].isNull())).count()

0

In [37]:
stazioneDf.select(stazioneDf['rad W/mq'],stazioneDf['data_ora']).show()

+--------+-------------------+
|rad W/mq|           data_ora|
+--------+-------------------+
|    75.0|2018-10-12 14:25:15|
|   104.1|2018-10-12 14:30:12|
|   234.4|2018-10-12 14:35:11|
|   247.2|2018-10-12 14:40:13|
|   206.6|2018-10-12 14:45:13|
|   250.3|2018-10-12 14:50:10|
|   307.5|2018-10-12 14:55:13|
|   360.5|2018-10-12 15:00:15|
|   307.5|2018-10-12 15:05:11|
|   388.4|2018-10-12 15:10:13|
|   367.5|2018-10-12 15:15:13|
|   348.9|2018-10-12 15:20:10|
|   340.4|2018-10-12 15:25:13|
|   321.8|2018-10-12 15:30:15|
|   304.4|2018-10-12 15:35:11|
|   294.4|2018-10-12 15:40:13|
|   277.0|2018-10-12 15:45:13|
|   259.8|2018-10-12 15:50:12|
|   231.3|2018-10-12 16:00:13|
|   195.2|2018-10-12 16:15:10|
+--------+-------------------+
only showing top 20 rows



## Aggiunta della colonna evapotraspirazione

In [None]:
#Et0 = 0.0393 Rs * sqrt(T+9.5)-0.19* Rs^0.6 * lat^0.15 +0.048 *(T+20)(1-UMI/100)* u2^0.7
LATITUDE=0.73

In [58]:
from pyspark.sql.functions import to_date,hour
from pyspark.sql.functions import sum,avg 
# con questa operazione mi calcolo un dataframe in cui ho per ogni giorno la temperatura minima e massima
maxMinTemperaterByDay = stazioneDf.select(to_date(stazioneDf['data_ora']).alias('giorno'),stazioneDf['temp1_min'],stazioneDf['temp1_max'])\
    .groupBy('giorno').agg({"temp1_max":"max","temp1_min":"min"})\
    .withColumnRenamed("max(temp1_max)","t_max").withColumnRenamed("min(temp1_min)","t_min")

In [84]:

eachHourDf = stazioneDf.select(["data_ora","wind_speed_media","temp1_media","rad W/mq","ur1_media"])\
.groupBy(to_date(stazioneDf['data_ora']).alias("data"),hour(stazioneDf['data_ora'])).agg({"wind_speed_media":"avg","rad W/mq":"avg","ur1_media":"avg"\
                                               ,"temp1_media":"avg"}).\
    withColumnRenamed('avg(rad W/mq)','Rs').withColumnRenamed('avg(temp1_media)','T')\
    .withColumnRenamed('avg(ur1_media)','RH').withColumnRenamed('avg(wind_speed_media)','U2')

In [79]:
type(eachHourDf)

pyspark.sql.dataframe.DataFrame

In [66]:
# join tra tabella oraria e tabella delle temperature minime e massime, per avere per ogni record della prima
# tabella la temperatura giornaliera massima e minima, utile per il calcolo di Et0
eachHourDf.join(maxMinTemperaterByDay,eachHourDf.data==maxMinTemperaterByDay.giorno,'inner')\
.show(10)

+----------+--------------+-----------------+-------------------+---------------------+------------------+----------+-----+-----+
|      data|hour(data_ora)|   avg(ur1_media)|      avg(rad W/mq)|avg(wind_speed_media)|  avg(temp1_media)|    giorno|t_max|t_min|
+----------+--------------+-----------------+-------------------+---------------------+------------------+----------+-----+-----+
|2018-10-17|            18|90.60000000000001|                0.3|        4.27066381798|           17.1825|2018-10-17|23.01|12.09|
|2018-10-18|            18|           89.125|               0.45|    5.654218423973333|             16.18|2018-10-18|27.01| 12.3|
|2018-10-20|            20|           78.815|                0.5|   1.9810546059933332|           13.4375|2018-10-20|25.51| 8.98|
|2018-10-29|             4|86.75999999999999|               0.65|                 7.67|16.830000000000002|2018-10-29|21.19| 9.98|
|2018-11-01|            18|94.44250000000001|0.44999999999999996|               0.6025|   

In [None]:
from pyspark.sql.functions import col
eachHourDf = eachHourDf.select(col("avg(ur1_media)").alias("RH"),col("avg(temp1_media)").alias("T"),
                              col("avg(wind_speed_media)").alias("u2"),col())
eachHourDf.show()
eachHourDf.printSchema()


In [80]:
import math
eachHourDf.withColumn('delta',4098*eachHourDf['avg(temp1_media)']/((eachHourDf['avg(temp1_media)']+237.3)**2))

AnalysisException: 'Cannot resolve column name "avg(temp1_media)" among (data, hour(data_ora), RH, Rs, U2, T);'

## Stima della Et0 con la formula di Valiantzas

In [85]:
eachHourDf = eachHourDf.withColumn('Rs',eachHourDf['Rs']*0.0864)

In [90]:
eachHourDf = eachHourDf.withColumn('Et0',0.0393*eachHourDf['Rs']*((eachHourDf['T']+9.5)**0.5)-0.19*(eachHourDf['Rs']**0.6)\
                                   *(0.73**0.15)+0.048*(eachHourDf['T']+20)\
                                   *(1-(eachHourDf['RH']/100))*(eachHourDf['U2']**0.7))

In [95]:
eachHourDf.select('data','Et0').count()

5823

In [94]:
from pyspark.ml.feature import VectorAssembler

# prendiamo le temperature medie, minime emassime come feature

vectorAssembler = VectorAssembler(inputCols = ['RH','Rs','T','U2'], outputCol = 'features')
vstazione_df = vectorAssembler.transform(eachHourDf)
vstazione_df = vstazione_df.select(['features', 'Et0'])
vstazione_df.show(3)

+--------------------+-------------------+
|            features|                Et0|
+--------------------+-------------------+
|[90.6000000000000...|0.44851653104635747|
|[89.125,0.0388800...|  0.616953535945681|
|[78.815,0.0432,13...| 0.5293128453593872|
+--------------------+-------------------+
only showing top 3 rows



In [96]:
# divisione training set e test set

splits = vstazione_df.randomSplit([0.8,0.2])
train_df = splits[0]
test_df = splits[1]

In [101]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='Et0', maxIter=20, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [-0.04690388917603679,0.15422079514605752,0.02891093063249502,0.0796713754970911]
Intercept: 4.016408716508319


In [102]:
# calcolo del root means square. Siccome stiamo lavorando con dati della radiazione di ordini di grandezza
# molto alti, riteniamo accettabile un errore di circa 150

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.706690
r2: 0.958735


In [103]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 0.725374


## Uso di random forest per il modello di regressione

In [106]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features",labelCol="Et0")
rf_model = rf.fit(train_df)
predictions = rf_model.transform(test_df)

In [107]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    labelCol="Et0", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.578568


# Stima dei valori di radiazione mancante

In [None]:
stazioneDfWithNoNull=stazioneDf.filter(stazioneDf['rad W/mq'].isNotNull())
stazioneDfWithNoNull.select(stazioneDf['rad W/mq'].isNotNull()).count()

In [None]:
from pyspark.ml.feature import VectorAssembler

# prendiamo le temperature medie, minime emassime come feature

vectorAssembler = VectorAssembler(inputCols = ['temp1_media','temp1_min','temp1_max'], outputCol = 'features')
vstazione_df = vectorAssembler.transform(stazioneDfWithNoNull)
vstazione_df = vstazione_df.select(['features', 'rad W/mq'])
vstazione_df.show(3)

In [None]:
# divisione training set e test set

splits = vstazione_df.randomSplit([0.8,0.2])
train_df = splits[0]
test_df = splits[1]

In [None]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='rad W/mq', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

In [None]:
# calcolo del root means square. Siccome stiamo lavorando con dati della radiazione di ordini di grandezza
# molto alti, riteniamo accettabile un errore di circa 150

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

In [None]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","rad W/mq","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="MV",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

In [None]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'rad W/mq')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="rad W/mq", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
stazioneDf.select('rad W/mq').rdd.max()[0]

In [None]:
trainingglrSummary = glr_model.summary
trainingglrSummary.coefficientStandardErrors