## Lettura del dataset da MongoDB

In [8]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("pantheon") \
    .config('spark.jars.packages','org.mongodb.spark:mongo-spark-connector_2.11:2.4.0,org.postgresql:postgresql:42.1.1') \
    .config("spark.mongodb.input.uri","mongodb://root:mongodb@mongodb/pantheon.station?authSource=admin")\
    .getOrCreate()

In [2]:
MONGODB_URI='mongodb://mongodb/'
def getCollection(sparksession):
    df = sparksession.read.format("com.mongodb.spark.sql.DefaultSource").load()
    return df

In [5]:
stazioneDf = getCollection(spark)

In [6]:
stazioneDf.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- data_ora: string (nullable = true)
 |-- id_dato: integer (nullable = true)
 |-- pioggia_mm: double (nullable = true)
 |-- pressione_mbar: double (nullable = true)
 |-- pressione_n_letture: integer (nullable = true)
 |-- pressione_standard_mbar: double (nullable = true)
 |-- rad W/mq: double (nullable = true)
 |-- rad W/mq array: string (nullable = true)
 |-- rad_n_letture: double (nullable = true)
 |-- temp1_max: double (nullable = true)
 |-- temp1_media: double (nullable = true)
 |-- temp1_min: double (nullable = true)
 |-- temp1_ur1_n_letture: integer (nullable = true)
 |-- ur1_max: double (nullable = true)
 |-- ur1_media: double (nullable = true)
 |-- ur1_min: double (nullable = true)
 |-- wind_dir: integer (nullable = true)
 |-- wind_dir_n_letture: integer (nullable = true)
 |-- wind_speed_max: double (nullable = true)
 |-- wind_speed_media: double (nullable = true)
 |-- wind_speed_n_letture: doubl

# Verifica dei valori nulli

In [3]:
stazioneDf.filter(stazioneDf.id_dato.isNotNull()).count()

30802

In [9]:
stazioneDf.filter(stazioneDf.temp1_media.isNotNull()).count()

30802

In [10]:
stazioneDf.filter(stazioneDf.ur1_media.isNotNull()).count()

30802

In [11]:
stazioneDf.filter(stazioneDf.wind_speed_media.isNotNull()).count()

22525

In [12]:
stazioneDf.filter(stazioneDf['rad W/mq'].isNotNull()).count()

20206

In [13]:
stazioneDf.filter(stazioneDf['pressione_mbar'].isNotNull()).count()

30802

## Riempimento dei valori nulli del dataframe

In [14]:
def fill_with_mean(df, include=set()): 
    stats = df.agg(*(
        F.avg(c).alias(c) for c in df.columns if c in include
    ))
    return df.na.fill(stats.first().asDict())

In [15]:
stazioneDf = fill_with_mean(stazioneDf, ['wind_speed_media'])

In [16]:
# Per verificare se le righe sono state riempite correttamente
stazioneDf.filter(stazioneDf.wind_speed_media.isNull()).count()

0

In [18]:
#n.b per esprimere più condizioni in and o or bisogna racchiudere le condizioni tra parentesi tonde
from pyspark.sql.functions import hour
avgT=stazioneDf.filter((stazioneDf['rad W/mq'].isNotNull())& \
                   ((hour(stazioneDf['data_ora'])<=18)|(hour(stazioneDf['data_ora'])>5)))
avgT=stazioneDf.filter((stazioneDf['rad W/mq'].isNotNull())& \
                   ((hour(stazioneDf['data_ora'])<=18)|(hour(stazioneDf['data_ora'])>5)))
from pyspark.sql.functions import mean
stats=avgT.select([mean('rad W/mq')]).first()
avgRadValue= stats.asDict()
avgRadValue = avgRadValue['avg(rad W/mq)']

In [19]:
avgRadValue

113.87738295555782

In [20]:
from pyspark.sql.functions import when,hour

stazioneDf = stazioneDf.withColumn('rad W/mq',when((stazioneDf['rad W/mq'].isNull())& ((hour(stazioneDf['data_ora'])>18)|\
                                               (hour(stazioneDf['data_ora'])<=5)),0)\
                               .when((stazioneDf['rad W/mq'].isNull())& ((hour(stazioneDf['data_ora'])<=18)|\
                                               (hour(stazioneDf['data_ora'])>5)),avgRadValue)\
                               .otherwise(stazioneDf['rad W/mq']))

In [23]:
stazioneDf.filter((stazioneDf['rad W/mq'].isNull())).count()

0

In [24]:
# con questa opzione della Spark Session impostata a True, non c'è più bisogno di usare show per vedere i dataframe;
# inoltre li posso vedere in modo non sballato
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [25]:
#Controllo per verificare che non siano sovrascritti valori non nulli
stazioneDf.select(stazioneDf['rad W/mq'],stazioneDf['data_ora'])

rad W/mq,data_ora
104.1,2018-10-12 14:30:12
234.4,2018-10-12 14:35:11
206.6,2018-10-12 14:45:13
75.0,2018-10-12 14:25:15
307.5,2018-10-12 14:55:13
250.3,2018-10-12 14:50:10
360.5,2018-10-12 15:00:15
367.5,2018-10-12 15:15:13
307.5,2018-10-12 15:05:11
388.4,2018-10-12 15:10:13


## Aggiunta della colonna evapotraspirazione

In [26]:
#Et0 = 0.0393 Rs * sqrt(T+9.5)-0.19* Rs^0.6 * lat^0.15 +0.048 *(T+20)(1-UMI/100)* u2^0.7
LATITUDE=0.73

In [48]:
from pyspark.sql.functions import to_date,hour,month,dayofyear
from pyspark.sql.functions import sum,avg 
# con questa operazione mi calcolo un dataframe in cui ho per ogni mese la temperatura minima e massima
# utile per l'eventuale stima di radiazione solare con la formula di Hargreaves
maxMinTemperaterByMonth = stazioneDf.select(to_date(stazioneDf['data_ora']).alias('data')
                                            ,dayofyear(to_date(stazioneDf['data_ora'])).alias('giorno'),stazioneDf['temp1_media'])
#aggiungo una colonna per fare due diverse aggregazioni sullo stesso valore
maxMinTemperaterByMonth = maxMinTemperaterByMonth.withColumn('temp_media_copia',maxMinTemperaterByMonth['temp1_media'])

In [49]:
maxMinTemperaterByMonth = maxMinTemperaterByMonth\
    .groupBy(month('data')).agg({"temp1_media":"max","temp_media_copia":"min"})\

In [50]:
maxMinTemperaterByMonth

month(data),max(temp1_media),min(temp_media_copia)
12,17.32,-5.28
1,13.8,-4.95
6,32.13,7.27
3,23.02,-2.48
5,22.63,1.33
4,26.61,-0.67
10,26.36,4.0
11,19.63,0.36
2,19.94,-1.84


In [52]:
#Questa cella serve aggregare i valori per ora, in base da avere valori più realistici di evapotraspirazione
eachHourDf = stazioneDf.select(["data_ora","wind_speed_media","temp1_media","rad W/mq","ur1_media"])\
.groupBy(to_date(stazioneDf['data_ora']).alias("data"),hour(stazioneDf['data_ora'])).agg({"wind_speed_media":"avg","rad W/mq":"avg","ur1_media":"avg"\
                                               ,"temp1_media":"avg"}).\
    withColumnRenamed('avg(rad W/mq)','Rs').withColumnRenamed('avg(temp1_media)','T')\
    .withColumnRenamed('avg(ur1_media)','RH').withColumnRenamed('avg(wind_speed_media)','u2')

In [56]:
eachHourDf = eachHourDf.select('data','hour(data_ora)','RH','Rs','u2','T')

In [57]:
import math
eachHourDf.withColumn('delta',4098*eachHourDf['T']/((eachHourDf['T']+237.3)**2))

data,hour(data_ora),RH,Rs,u2,T,delta
2018-10-17,18,90.6,0.3,4.27066381798,17.1825,1.0872826588951532
2018-10-18,18,89.125,0.45,5.654218423973333,16.18,1.0319605003978205
2018-10-20,20,78.815,0.5,1.9810546059933327,13.4375,0.8758945993908427
2018-10-29,4,86.75999999999999,0.65,7.670000000000001,16.83,1.0679334680651833
2018-11-01,18,94.4425,0.4499999999999999,0.6025,14.47,0.9354757316096938
2018-11-07,12,79.975,168.425,5.06,15.43,0.9899751321433904
2018-11-08,11,76.39,469.75,2.9975,16.08,1.0263921807940448
2018-11-11,3,96.07,0.95,5.654218423973333,5.827500000000001,0.4040043743559911
2018-11-27,11,82.28,87.1,7.5625,9.447499999999998,0.6358919576004304
2018-12-09,22,75.84,0.95,4.8775,10.955,0.7284308451562326


## Stima della Et0 con la formula di Valiantzas

In [58]:
#La radiazione solare viene convertita in MJ/day che è l'unita di misura richiesta per la formula di Valiantzas
eachHourDf = eachHourDf.withColumn('Rs',eachHourDf['Rs']*0.0864)

In [59]:
eachHourDf = eachHourDf.withColumn('Et0',0.0393*eachHourDf['Rs']*((eachHourDf['T']+9.5)**0.5)-0.19*(eachHourDf['Rs']**0.6)\
                                   *(0.73**0.15)+0.048*(eachHourDf['T']+20)\
                                   *(1-(eachHourDf['RH']/100))*(eachHourDf['U2']**0.7))

In [60]:
eachHourDf.select('data','Et0').count()

5824

In [61]:
from pyspark.ml.feature import VectorAssembler

# prendiamo le temperature medie, minime emassime come feature

vectorAssembler = VectorAssembler(inputCols = ['RH','Rs','T','u2'], outputCol = 'features')
vstazione_df = vectorAssembler.transform(eachHourDf)
vstazione_df = vstazione_df.select(['features', 'Et0'])
vstazione_df

features,Et0
[90.6000000000000...,0.4485165310463574
"[89.125,0.0388800...",0.616953535945681
"[78.815,0.0432,13...",0.5293128453593872
[86.7599999999999...,0.9534100609984608
[94.4425000000000...,0.0461489337432351
"[79.975,14.551920...",3.0112619427221468
"[76.39,40.5864000...",7.276769233111957
[96.0700000000000...,0.1360142453285036
"[82.28,7.52544,9....",1.7113178838919438
"[75.84,0.08208000...",1.0625971517701909


In [62]:
# divisione training set e test set
splits = vstazione_df.randomSplit([0.8,0.2])
train_df = splits[0]
test_df = splits[1]

In [63]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features', labelCol='Et0', maxIter=20, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [-0.04696848381153545,0.154029170408527,0.029911748997163274,0.07910808026162172]
Intercept: 4.018688765017087


In [102]:
# calcolo del root means square. Siccome stiamo lavorando con dati della radiazione di ordini di grandezza
# molto alti, riteniamo accettabile un errore di circa 150

trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.706690
r2: 0.958735


In [49]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 0.726763


In [50]:
#il modello viene salvato automaticamente su hdfs essendo configurato spark su yarn
lr_model.save("output/linear_model_evapotranspiration")

## Uso di random forest per il modello di regressione

In [53]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features",labelCol="Et0")
rf_model = rf.fit(train_df)
predictions = rf_model.transform(test_df)

In [52]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    labelCol="Et0", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.607774


In [54]:
rf_model.save("output/random_forest_model_evapotranspiration")

## Uso di gradient boost regression 

In [57]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer

featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(train_df)
gbt = GBTRegressor(featuresCol="features",labelCol = 'Et0', maxIter=10)
pipeline = Pipeline(stages=[featureIndexer, gbt])
gbt_model = pipeline.fit(train_df)

In [58]:
predictions = gbt_model.transform(test_df)

In [60]:
evaluator = RegressionEvaluator(
    labelCol="Et0", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = gbt_model.stages[1]
print(gbtModel)  # summary only

Root Mean Squared Error (RMSE) on test data = 0.606528
GBTRegressionModel (uid=GBTRegressor_b9a8f013af8c) with 10 trees


In [61]:
gbt_model.save('output/gbt_model_evapotranspiration')