In [None]:
import os
# Pass `--packages` with the PostgreSQL JDBC driver coordinates to the PySpark launcher.
# NOTE(review): presumably this must run before the SparkSession is created to have any
# effect, and the same package is also listed in `spark.jars.packages` below — confirm
# whether both are needed.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.postgresql:postgresql:42.1.1 pyspark-shell'

In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the notebook's Spark session: local master, MongoDB connector and
# PostgreSQL JDBC driver requested as packages, and `pantheon.station` as the default
# MongoDB input collection.
# NOTE(review): the input URI embeds the MongoDB user and password in clear text —
# consider reading them from the environment instead of saving them in the notebook.
jar_packages = ",".join([
    'org.mongodb.spark:mongo-spark-connector_2.11:2.4.0',
    'org.postgresql:postgresql:42.1.1',
])
mongo_input_uri = "mongodb://root:mongodb@mongodb/pantheon.station?authSource=admin"

spark = (
    SparkSession.builder
    .master("local")
    .appName("pantheon")
    .config('spark.jars.packages', jar_packages)
    .config("spark.mongodb.input.uri", mongo_input_uri)
    .getOrCreate()
)

In [2]:
# NOTE(review): this constant is not used by getCollection below.
MONGODB_URI = 'mongodb://mongodb/'


def getCollection(sparksession):
    """Load a DataFrame through the MongoDB Spark connector.

    No reader options are set here, so the connector relies on whatever
    configuration is already present on the given session.

    Args:
        sparksession: the Spark session whose reader is used.

    Returns:
        The object returned by the reader's ``load()`` call.
    """
    mongo_reader = sparksession.read.format("com.mongodb.spark.sql.DefaultSource")
    return mongo_reader.load()

In [3]:
# Load the weather-station readings into a DataFrame using the helper defined above.
stazioneDf = getCollection(spark)

In [4]:
# Print the column names and types of the loaded DataFrame.
stazioneDf.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- data_ora: string (nullable = true)
 |-- id_dato: integer (nullable = true)
 |-- pioggia_mm: double (nullable = true)
 |-- pressione_mbar: double (nullable = true)
 |-- pressione_n_letture: integer (nullable = true)
 |-- pressione_standard_mbar: double (nullable = true)
 |-- rad W/mq: double (nullable = true)
 |-- rad W/mq array: string (nullable = true)
 |-- rad_n_letture: double (nullable = true)
 |-- temp1_max: double (nullable = true)
 |-- temp1_media: double (nullable = true)
 |-- temp1_min: double (nullable = true)
 |-- temp1_ur1_n_letture: integer (nullable = true)
 |-- ur1_max: double (nullable = true)
 |-- ur1_media: double (nullable = true)
 |-- ur1_min: double (nullable = true)
 |-- wind_dir: integer (nullable = true)
 |-- wind_dir_n_letture: integer (nullable = true)
 |-- wind_speed_max: double (nullable = true)
 |-- wind_speed_media: double (nullable = true)
 |-- wind_speed_n_letture: double (nullable = true)

In [8]:
# With this Spark session option set to True there is no need to call show() to look at a
# DataFrame: leaving it as the last expression of a cell renders it directly,
# and with a cleaner layout.
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [9]:
stazioneDf

_id,data_ora,id_dato,pioggia_mm,pressione_mbar,pressione_n_letture,pressione_standard_mbar,rad W/mq,rad W/mq array,rad_n_letture,temp1_max,temp1_media,temp1_min,temp1_ur1_n_letture,ur1_max,ur1_media,ur1_min,wind_dir,wind_dir_n_letture,wind_speed_max,wind_speed_media,wind_speed_n_letture
[5d31ceae4bcedc5c...,2018-10-12 14:35:11,1811283,0.0,1085.7,5,1118.44,234.4,{},5.0,24.98,24.34,23.88,5,54.04,52.91,51.54,52,5,9.43,3.89,322.0
[5d31ceae4bcedc5c...,2018-10-12 14:45:13,1811304,0.0,1086.11,5,1118.85,206.6,{},5.0,23.86,23.74,23.66,5,56.69,55.22,53.6,72,5,5.7,1.93,160.0
[5d31ceae4bcedc5c...,2018-10-12 14:25:15,1811253,0.0,1085.94,5,1118.68,75.0,{},5.0,26.08,25.99,25.93,5,58.97,56.69,52.56,6,5,15.62,0.3,25.0
[5d31ceae4bcedc5c...,2018-10-12 14:40:13,1811284,0.0,1088.63,5,1121.37,247.2,{},5.0,23.89,23.82,23.76,5,54.91,54.17,53.44,44,5,8.19,2.37,196.0
[5d31ceae4bcedc5c...,2018-10-12 14:50:10,1811314,0.0,1086.41,5,1119.15,250.3,{},5.0,23.92,23.84,23.76,5,58.4,57.18,55.32,68,5,3.41,1.3,108.0
[5d31ceae4bcedc5c...,2018-10-12 14:55:13,1811315,0.0,1086.35,5,1119.09,307.5,{},5.0,23.97,23.92,23.84,5,56.99,56.15,55.42,42,5,8.1,2.88,239.0
[5d31ceae4bcedc5c...,2018-10-12 15:00:15,1811334,0.0,1086.35,5,1119.09,360.5,{},5.0,24.71,24.45,24.1,5,57.76,54.74,52.34,72,5,6.32,2.63,218.0
[5d31ceae4bcedc5c...,2018-10-12 15:05:11,1811345,0.0,1086.29,5,1119.03,307.5,{},5.0,24.43,24.37,24.3,5,54.93,54.01,53.14,53,5,7.23,2.91,241.0
[5d31ceae4bcedc5c...,2018-10-12 15:20:10,1811376,0.0,1085.94,5,1118.68,348.9,{},5.0,25.33,25.27,25.24,5,48.39,46.8,45.04,60,5,4.69,2.02,167.0
[5d31ceae4bcedc5c...,2018-10-12 15:10:13,1811346,0.0,1086.29,5,1119.03,388.4,{},5.0,24.86,24.71,24.58,5,54.2,52.08,48.88,69,5,8.12,3.13,259.0


## Calcolo somma di pioggia ogni mese

In [5]:
# Preview of the two columns used by the monthly rain aggregation in the next cell.
stazioneDf.select('data_ora', 'pioggia_mm')

DataFrame[data_ora: string, pioggia_mm: double]

In [6]:
from pyspark.sql.functions import month, year
from pyspark.sql.functions import sum, avg
# show() only prints a DataFrame, it does not give back an object to work with; so when the
# result has to be kept and reused, show() must not be called in the middle of the chain.
# Monthly rain total: extract year and month from the reading timestamp, then sum the rain.
# Note: `sum` here is pyspark.sql.functions.sum imported above, not the Python builtin.
df1 = (
    stazioneDf
    .select(
        year('data_ora').alias("year"),
        month('data_ora').alias("month"),
        "pioggia_mm",
    )
    .groupBy("year", "month")
    .agg(sum("pioggia_mm").alias("total_rain"))
    .orderBy("year", "month")
)

#df2 = stazioneDf.select(year('data_ora').alias("year"), month('data_ora').alias("month"))

#df_rain = df2.join(df1, 'month').show()

#.groupBy("month").agg(avg("year")).show()
#print(df1)

In [10]:
df1

year,month,total_rain
2018,10,13.15999999999998
2018,11,40.67999999999985
2018,12,9.079999999999988
2019,1,23.59999999999991
2019,2,16.839999999999968
2019,3,1.4
2019,4,13.15999999999996
2019,5,37.5599999999997
2019,6,0.0


## Rilevamento di anomalie

In [15]:
from pyspark.sql import SparkSession
# Expose the monthly rain totals (df1) to Spark SQL under the name `table1`.
df1.createOrReplaceTempView("table1")
# NOTE(review): the draft query below references `pioggia_mm`, but df1 only carries
# year, month and total_rain — it would have to run against a view of stazioneDf instead.
# spark.sql("SELECT year, month, sum(pioggia_mm) AS total_rain FROM table1 GROUP BY year, month")

In [None]:
# Job that detects whether one or more columns hold null values over a certain period of time
# and raises an alert (or at least returns TRUE). This can help to spot broken sensors.

In [22]:
# Split the readings, ordered by time, into consecutive blocks that share one property:
# here, whether the radiation value ('rad W/mq') is null or not.
#   - lag(..., 1) reads the value of the previous row in the window order, so comparing a row
#     with its lag tells whether a new block starts on that row;
#   - coalesce returns the first non-null argument: the first row has no previous row, its
#     comparison is null, and it is treated as "no change";
#   - a running sum of the change flags gives every row the index of its block;
#   - count ignores nulls, so a block whose count of radiation values is 0 is entirely null.
# NOTE(review): the window has no partitionBy, so all rows are ordered as a single group.
import pyspark.sql.functions as F
from pyspark.sql.window import Window

time_order = Window.orderBy("data_ora")

nullRadSlot = (
    stazioneDf
    .withColumn("isnull", F.col("rad W/mq").isNull())
    .withColumn("lag_isnull", F.lag("isnull", 1).over(time_order))
    .withColumn("change", F.coalesce(F.col("isnull") != F.col("lag_isnull"), F.lit(False)))
    .withColumn("block", F.sum(F.col("change").cast("int")).over(time_order))
    .groupBy("block")
    .agg(
        F.min("data_ora").alias('mindata'),
        F.max("data_ora").alias('maxdata'),
        (F.count("rad W/mq") == 0).alias('blocco_isnull'),
    )
)

In [23]:
nullRadSlot

block,mindata,maxdata,blocco_isnull
0,2018-10-12 14:25:15,2019-05-06 08:55:28,False
1,2019-05-06 13:40:10,2019-06-12 09:25:15,True


In [30]:
timeFmt = "yyyy-MM-dd HH:mm:ss"
# Parse both block boundaries with the format above and store their difference
# (unix_timestamp yields seconds) as the length of each block.
block_start = F.unix_timestamp('mindata', format=timeFmt)
block_end = F.unix_timestamp('maxdata', format=timeFmt)
nullRadSlot = nullRadSlot.withColumn("Duration", block_end - block_start)

In [32]:
# Display the block durations converted from seconds to days.
# NOTE(review): the result is only displayed, not assigned, so `nullRadSlot` (and anything
# derived from it afterwards) still holds Duration in seconds — confirm this is intended.
seconds_per_day = 24 * 3600
nullRadSlot.withColumn("Duration", nullRadSlot["Duration"] / seconds_per_day)

block,mindata,maxdata,blocco_isnull,Duration
0,2018-10-12 14:25:15,2019-05-06 08:55:28,False,205.7709837962963
1,2019-05-06 13:40:10,2019-06-12 09:25:15,True,36.82297453703704


In [36]:
# Keep only the blocks in which every radiation reading is null (candidate sensor faults),
# with their start, end and duration.
guasti = (
    nullRadSlot
    .filter(nullRadSlot['blocco_isnull'])
    .select('mindata', 'maxdata', 'Duration')
)

## Calcolo temperatura media ed umidità settimanale

In [None]:
from pyspark.sql.functions import weekofyear
# Preview of the inputs for the weekly averages: week number of each reading plus the
# mean temperature and mean humidity columns. Display only; nothing is assigned.
stazioneDf.select(weekofyear('data_ora'), 'temp1_media', 'ur1_media')

In [None]:
# Weekly mean temperature and humidity.
# weekofyear follows ISO-8601 numbering, so the last days of December can fall in week 1 of
# the next year and the first days of January in week 52/53 of the previous one. The calendar
# year is adjusted accordingly so that each (year, week) pair identifies one real week and
# readings from different years are never averaged together.
reading_year = F.year("data_ora")
reading_month = F.month("data_ora")
iso_week = F.weekofyear("data_ora")
iso_year = (
    F.when((iso_week == 1) & (reading_month == 12), reading_year + 1)
    .when((iso_week >= 52) & (reading_month == 1), reading_year - 1)
    .otherwise(reading_year)
)

df2 = (
    stazioneDf
    .select(
        iso_year.alias("year"),
        iso_week.alias("week"),
        "temp1_media",
        "ur1_media",
    )
    .groupBy("year", "week")
    .agg(
        F.avg("temp1_media").alias("mean_temp1"),
        F.avg("ur1_media").alias("mean_ur1"),
    )
    .orderBy("year", "week")
)

In [None]:
df2

## Connessione a postgresql

In [13]:
# Connection settings for the PostgreSQL `report` database, then a read of table `prova`.
# `driver` is the single source for the JDBC driver class used in `properties`, so it
# matches the jdbc:postgresql URL.
driver = "org.postgresql.Driver"
url="jdbc:postgresql://postgres/report"
tablename = "prova"
# NOTE(review): user and password are hardcoded here — consider reading them from the
# environment instead of saving them in the notebook.
properties = {
    "driver": driver,
    "user": "postgres",
    "password": "password"
}
dbDataFrame = spark.read.jdbc(url=url,table=tablename,properties=properties)

In [None]:
# Print the first rows of the table read from PostgreSQL.
dbDataFrame.show()

In [14]:
mode = 'overwrite'
# Save the monthly rain totals to the `monthly_rain` table, overwriting existing data.
monthly_rain_target = dict(url=url, table="monthly_rain", mode=mode, properties=properties)
df1.write.jdbc(**monthly_rain_target)

In [37]:
mode = 'overwrite'
# Save the all-null radiation blocks to the `guasti` table, overwriting existing data.
guasti_target = dict(url=url, table="guasti", mode=mode, properties=properties)
guasti.write.jdbc(**guasti_target)