In [54]:
from pyspark import SparkContext
from pyspark.sql import SQLContext,SparkSession
from pyspark.sql import functions as F

import os
import subprocess
import pandas as pd

from functools import reduce
from datetime import date

import findspark

In [55]:
# Point findspark at the local Spark installation before the session is created.
findspark.init(spark_home='/opt/spark')

In [56]:
# Directory that holds the input CSV files. It can be overridden through the
# AIRE_CSV_DIR environment variable; when unset, the original location is used.
ruta = os.environ.get('AIRE_CSV_DIR', '/home/jose/Documentos/bd/aire_csv/')

In [57]:
# Reuse the active session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("bigdatita")
    .getOrCreate()
)

In [58]:
# One file:// URI per entry found in the input directory.
lst = list(map(lambda nombre: 'file://' + os.path.join(ruta, nombre), os.listdir(ruta)))
print(len(lst))
lst[:2]

107


['file:///home/jose/Documentos/bd/aire_csv/2018NO.csv',
 'file:///home/jose/Documentos/bd/aire_csv/2019NO.csv']

In [59]:
# Read every CSV into a single DataFrame; types are not inferred here and are
# cast explicitly afterwards.
df = spark.read.csv(
    path=lst,
    header=True,
    inferSchema=False,
)

In [60]:
# Inspect the schema right after loading (no type inference was requested).
df.printSchema()

root
 |-- FECHA: string (nullable = true)
 |-- HORA: string (nullable = true)
 |-- ESTACION: string (nullable = true)
 |-- VALOR: string (nullable = true)
 |-- CONTAMINANTE: string (nullable = true)



In [61]:
# Give the raw string columns their working types in one pass.
df = (
    df
    .withColumn('FECHA', F.to_timestamp(F.col('FECHA')))
    .withColumn('HORA', F.col('HORA').cast('int'))
    .withColumn('VALOR', F.col('VALOR').cast('double'))
)

In [62]:
# Check the schema again after the casts.
df.printSchema()

root
 |-- FECHA: timestamp (nullable = true)
 |-- HORA: integer (nullable = true)
 |-- ESTACION: string (nullable = true)
 |-- VALOR: double (nullable = true)
 |-- CONTAMINANTE: string (nullable = true)



In [63]:
# Catalogue of distinct FECHA values, collected to the driver and numbered
# 1..N in ascending order so that time windows can be expressed as id ranges.
catfh = (
    df.select('FECHA')
      .drop_duplicates()
      .toPandas()
      .sort_values(by=['FECHA'])
      .reset_index(drop=True)
)
catfh['id'] = catfh.index + 1
# First and last ids present in the catalogue.
anclai = catfh['id'].min()
anclaf = catfh['id'].max()
# Back to Spark so the catalogue can be joined with the measurements.
catfh = spark.createDataFrame(catfh)

In [64]:
# Attach the sequential id of each FECHA to every measurement row.
df = df.join(catfh, on=['FECHA'], how='inner')

In [65]:
# From here on rows are located in time through `id` only.
df = df.drop(*['FECHA', 'HORA'])

In [66]:
# Schema after adding `id` and dropping FECHA/HORA.
df.printSchema()

root
 |-- ESTACION: string (nullable = true)
 |-- VALOR: double (nullable = true)
 |-- CONTAMINANTE: string (nullable = true)
 |-- id: long (nullable = true)



In [67]:
# vobs: upper bound of the look-back window sizes used by the feature cells.
# vdes: offset (in ids) from an anchor to the id whose O3 value is the target.
vobs = 30
vdes = 1
# Anchor range derived from the data: enough history behind the first anchor
# and a target available after the last one.
anclai = anclai + vobs - 1
anclaf = anclaf - vdes
# Hard-coded override: the derived range above is discarded and only the ids
# 3977..4047 are used as anchors.
# NOTE(review): presumably done to bound the run time -- confirm before reuse.
anclai = 3977
anclaf = anclai + 70
anclai, anclaf

(3977, 4047)

In [68]:
# Peek at a few rows of the indexed measurements.
df.show(5)

+--------+-----+------------+---+
|ESTACION|VALOR|CONTAMINANTE| id|
+--------+-----+------------+---+
|     VAL|  2.0|         SO2| 50|
|     VAL|  2.0|         SO2| 50|
|     VAL|  1.0|         SO2| 50|
|     VAL|  1.0|         SO2| 50|
|     VAL|  2.0|         SO2| 50|
+--------+-----+------------+---+
only showing top 5 rows



In [69]:
def ing(df,k,ancla,contaminantes=None):
    """Summarise VALOR per station over the ``k`` ids that end at ``ancla``.

    Rows with ``ancla-k+1 <= id <= ancla`` are grouped by ``ESTACION`` and
    pivoted on ``CONTAMINANTE``. For every pivoted value the minimum, maximum,
    mean and standard deviation (``F.stddev``) of ``VALOR`` are computed, giving
    columns named ``<CONTAMINANTE>_x_<minimo|maximo|media|desv>_<k>``.

    Parameters
    ----------
    df : Spark DataFrame with columns ``id``, ``ESTACION``, ``CONTAMINANTE``
        and ``VALOR``.
    k : int
        Number of consecutive ids in the window (the anchor included).
    ancla : int
        Last id of the window; also written to the ``ancla`` column.
    contaminantes : iterable of str, optional
        Explicit pivot values. With the default ``None`` Spark discovers the
        distinct values inside the window, which costs an extra job per call
        and makes the output columns depend on the data. Passing a fixed list
        pins the output columns regardless of what the window contains.

    Returns
    -------
    Spark DataFrame with one row per ``ESTACION`` found in the window, the
    pivoted statistics and a constant ``ancla`` column.
    """
    en_ventana = (df['id'] >= (ancla - k + 1)) & (df['id'] <= ancla)
    estadisticos = [
        (F.min, 'minimo'),
        (F.max, 'maximo'),
        (F.mean, 'media'),
        (F.stddev, 'desv'),
    ]
    expr = [fn(F.col('VALOR')).alias(f'x_{nombre}_{k}') for fn, nombre in estadisticos]
    # pivot() expects a list (or None); accept any iterable from the caller.
    valores = list(contaminantes) if contaminantes is not None else None
    return (
        df.filter(en_ventana)
          .groupBy('ESTACION')
          .pivot('CONTAMINANTE', valores)
          .agg(*expr)
          .withColumn('ancla', F.lit(ancla))
    )

In [70]:
def tgt(df,ancla):
    """Return the target ``y`` per station for a given anchor.

    Keeps the rows whose ``CONTAMINANTE`` is ``'O3'`` and whose ``id`` equals
    ``ancla + vdes`` (``vdes`` is read from the notebook's global scope), then
    averages ``VALOR`` per ``ESTACION`` into the column ``y``. A constant
    ``ancla`` column is appended so the result can be joined with the features.
    """
    es_objetivo = (df['id'] == (ancla + vdes)) & (df['CONTAMINANTE'] == 'O3')
    promedio = F.mean(F.col('VALOR')).alias('y')
    return (
        df.filter(es_objetivo)
          .select('ESTACION', 'VALOR')
          .groupby('ESTACION')
          .agg(promedio)
          .withColumn('ancla', F.lit(ancla))
    )

In [71]:
# Key columns present in both the feature frames and the target frame;
# used as the join keys in the cells below.
um = ['ESTACION','ancla']

In [72]:
# Spacing between the look-back window sizes: step, 2*step, ... up to vobs.
step = 10
# Single-anchor run: outer-join the feature frames of every window size, then
# keep only the stations that have a target for that anchor. The first anchor
# of the configured range is used instead of repeating its literal value, so
# this cell follows any change made to `anclai`.
aux = reduce(
    lambda izq, der: izq.join(der, um, 'outer'),
    [ing(df, k, anclai) for k in range(step, vobs + step, step)],
).join(tgt(df, anclai), um, how='inner')

In [73]:
# Build the full table: for every anchor in [anclai, anclaf] outer-join the
# feature frames of all window sizes, inner-join the target, and stack the
# per-anchor results.
# The stacking uses unionByName rather than union: the pivoted columns produced
# by ing() depend on which CONTAMINANTE values appear in each window, and union
# matches columns purely by position, which could silently put values under the
# wrong column. unionByName matches by name and fails loudly if the column sets
# of two anchors differ.
tad = reduce(
    lambda acumulado, parte: acumulado.unionByName(parte),
    [
        reduce(
            lambda izq, der: izq.join(der, um, 'outer'),
            [ing(df, k, ancla) for k in range(step, vobs + step, step)],
        ).join(tgt(df, ancla), um, how='inner')
        for ancla in range(anclai, anclaf + 1)
    ],
)

In [74]:
# Inspect the columns of the assembled table.
tad.printSchema()

root
 |-- ESTACION: string (nullable = true)
 |-- ancla: integer (nullable = true)
 |-- CO_x_minimo_10: double (nullable = true)
 |-- CO_x_maximo_10: double (nullable = true)
 |-- CO_x_media_10: double (nullable = true)
 |-- CO_x_desv_10: double (nullable = true)
 |-- NO_x_minimo_10: double (nullable = true)
 |-- NO_x_maximo_10: double (nullable = true)
 |-- NO_x_media_10: double (nullable = true)
 |-- NO_x_desv_10: double (nullable = true)
 |-- NO2_x_minimo_10: double (nullable = true)
 |-- NO2_x_maximo_10: double (nullable = true)
 |-- NO2_x_media_10: double (nullable = true)
 |-- NO2_x_desv_10: double (nullable = true)
 |-- NOX_x_minimo_10: double (nullable = true)
 |-- NOX_x_maximo_10: double (nullable = true)
 |-- NOX_x_media_10: double (nullable = true)
 |-- NOX_x_desv_10: double (nullable = true)
 |-- O3_x_minimo_10: double (nullable = true)
 |-- O3_x_maximo_10: double (nullable = true)
 |-- O3_x_media_10: double (nullable = true)
 |-- O3_x_desv_10: double (nullable = true)
 |--

In [75]:
# Collect the assembled table to the driver as a pandas DataFrame.
# NOTE(review): this rebinds `df`, which until now was the Spark DataFrame that
# ing()/tgt() are called with; re-running those cells after this one would hand
# them a pandas object instead.
df = tad.toPandas()

In [76]:
# Preview the first rows of the pandas table.
df.head()

Unnamed: 0,ESTACION,ancla,CO_x_minimo_10,CO_x_maximo_10,CO_x_media_10,CO_x_desv_10,NO_x_minimo_10,NO_x_maximo_10,NO_x_media_10,NO_x_desv_10,...,PM25_x_desv_30,PMCO_x_minimo_30,PMCO_x_maximo_30,PMCO_x_media_30,PMCO_x_desv_30,SO2_x_minimo_30,SO2_x_maximo_30,SO2_x_media_30,SO2_x_desv_30,y
0,TLI,3977,,,,,,,,,...,,,,,,,,,,
1,LLA,3977,,,,,,,,,...,,,,,,,,,,
2,AJM,3977,,,,,,,,,...,,,,,,,,,,
3,SAG,3977,0.0,2.0,0.418103,0.300389,1.0,178.0,14.831897,23.742709,...,13.210507,4.0,175.0,38.13093,19.44046,1.0,109.0,4.057803,6.807428,14.095238
4,FAR,3977,0.0,1.0,0.281435,0.185928,,,,,...,9.985451,,,,,0.0,46.0,2.893895,4.974938,21.857143


In [77]:
# Persist the pandas table as a pickle; the relative path resolves against the
# current working directory.
df.to_pickle('tad_aire.pkl')

In [78]:
# (rows, columns) of the final table.
df.shape

(2556, 111)