# Etapa 03 Evaluación de estacionalidad y causalidad

Se requiere tener series de tiempo sin faltantes para evaluar estas características

Para ello se puede emplear la base generada con el relleno de datos faltantes por perfiles horarios o la serie rellenada con la modelación de Prophet.

In [7]:
#Carga paquetes generales
import numpy as np
import pandas as pd


# visual libraries
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import gridspec
import seaborn as sns
import plotly.express as px
import os
from mpl_toolkits.mplot3d import Axes3D 
plt.style.use('ggplot')
from seaborn import kdeplot

# sklearn
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


# pruebas estadisticas de estacionalidad y causalidad
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import grangercausalitytests



In [2]:
#Carga Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Define la ruta raiz del proyecto
ruta="/content/drive/MyDrive/AIre/BD/Sitios/"

In [5]:
#Carga datos de una estación con relleno de faltantes
estac='MER'
df=pd.read_csv(ruta + estac +'_rell00.csv')
df.date2=pd.to_datetime(df['date2'],format="%Y-%m-%d %H:%M:%S")
df=df.set_index('date2')
df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 41107 entries, 2015-01-01 00:00:00 to 2019-09-30 00:00:00
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        41107 non-null  object 
 1   Date        41107 non-null  object 
 2   Hour        41107 non-null  int64  
 3   year        41107 non-null  int64  
 4   id_station  41107 non-null  object 
 5   CO          41107 non-null  float64
 6   NO          41107 non-null  float64
 7   NO2         41107 non-null  float64
 8   NOX         41107 non-null  float64
 9   O3          41107 non-null  float64
 10  PA          41107 non-null  float64
 11  PBa         41107 non-null  float64
 12  PM10        41107 non-null  float64
 13  PM2.5       41107 non-null  float64
 14  PMCO        41107 non-null  float64
 15  RH          41107 non-null  float64
 16  SO2         41107 non-null  float64
 17  TMP         41107 non-null  float64
 18  WDR         41107 non-null  flo

# Evalua si la series son estacionarias y presentan causalidad

dada la variabilidad entre las unidades de medición se standarizan las variables

In [8]:
#Funcion para estandarizar datos

class MyScaler(TransformerMixin, BaseEstimator):            #función para estandarizar #https://stackoverflow.com/questions/59434397/standardscaler-valueerror-operands-could-not-be-broadcast-together-with-shapes
    def fit(self, X, y=None):
        self.means_ = X.mean(axis=0)
        self.std_dev_ = X.std(axis=0)
        return self
    def transform(self, X, y=None):
        return (X - self.means_[:X.shape[1]]) / self.std_dev_[:X.shape[1]]

In [13]:
#Función para evaluar estacionalidad
#Null Hypothesis - Series possesses a unit root and hence is not stationary
#Alternate Hypothesis - Series is stationary

def adfuller_test(variable):
  result = adfuller(df[variable])
  print(f'Test Statistics: {result[0]}')
  print(f'p-value: {result[1]}')
  print(f'Critial Values: {result[4]}')
  print(variable)
  if result[1] > 0.05 :
    print('Series is not Stationary')
  else:
    print('Series is Stationary')
    print('')

In [14]:
#Función para evaluar causalidad entre X y Y
# H0: Xt does not granger causes Yt
# H1: Xt granger causes Yt  

def grangercausality_tests(df, variable1, variable2, max_lags):
  results=grangercausalitytests(df[[variable1,variable2]], max_lags, verbose=False)
  p_values=[round(results[i+1][0]['ssr_ftest'][1],4) for i in range(max_lags)]
  print('p values por lag entre {} y {}'.format(variable1,variable2))
  print(p_values)
  print('')



In [15]:
#Estandariza la base

df_e=df.copy()
scaler = MyScaler()                                               #función para estandarizar

#solo se transforman los contaminantes y meteorolgía
df_e[df_e.columns[~df_e.columns.isin(['date','Date','Hour', 'Month', 'year', 'id_station'])]]=scaler.fit_transform(df_e[df_e.columns[~df_e.columns.isin(['date','Date','Hour', 'Month', 'year', 'id_station'])]])
   

In [21]:
#Correlación entre parámetros

df_corr=df_e[df_e.columns[~df_e.columns.isin(['date','Date','Hour', 'Month', 'year', 'id_station'])]].corr(method='spearman')

#listado de variables con correlación mayor o igual que 0.25
var_corr=df_corr[df_corr['PM2.5'].abs() >= 0.25]['PM2.5'].sort_values(ascending=False).index

#listado de variables con correlación menor que 0.25
var_no_corr=df.columns[~df.columns.isin(var_corr)][7:]

print("Variables correlacionadas (>=0.25): ",var_corr)
print("Variables no correlacionadas (<0.25): ",var_no_corr)
print("")
print("Matriz de correlación")
df_corr

Variables correlacionadas (>=0.25):  Index(['PM2.5', 'PM10', 'PMCO', 'SO2', 'NO2', 'CO', 'NOX', 'NO'], dtype='object')
Variables no correlacionadas (<0.25):  Index(['PBa', 'RH', 'TMP', 'WDR', 'WSP', 'Month', 'X', 'Y'], dtype='object')

Matriz de correlación


Unnamed: 0,CO,NO,NO2,NOX,O3,PA,PBa,PM10,PM2.5,PMCO,RH,SO2,TMP,WDR,WSP,X,Y
CO,1.0,0.72361,0.693004,0.800328,-0.395949,0.035855,0.279012,0.408681,0.411255,0.326043,0.186128,0.395769,-0.316522,-0.056435,-0.207349,0.017266,-0.223279
NO,0.72361,1.0,0.62533,0.915562,-0.63986,0.211991,0.342131,0.290342,0.266864,0.275269,0.342723,0.433687,-0.512859,0.018806,-0.218953,-0.099137,-0.110658
NO2,0.693004,0.62533,1.0,0.856476,-0.407699,0.158868,0.299207,0.438195,0.428727,0.36719,0.156263,0.420157,-0.331907,-0.020886,-0.16956,-0.017061,-0.15684
NOX,0.800328,0.915562,0.856476,1.0,-0.618064,0.208674,0.355827,0.392987,0.374388,0.349356,0.320244,0.477363,-0.505176,-0.010679,-0.227938,-0.051418,-0.163892
O3,-0.395949,-0.63986,-0.407699,-0.618064,1.0,-0.26535,-0.355249,0.07994,0.075146,0.042044,-0.661987,-0.208248,0.791341,-0.115395,0.195652,0.192493,0.147165
PA,0.035855,0.211991,0.158868,0.208674,-0.26535,1.0,0.641698,-0.018869,0.032631,-0.055211,0.393257,0.1459,-0.421009,0.079031,0.002237,-0.142043,0.16198
PBa,0.279012,0.342131,0.299207,0.355827,-0.355249,0.641698,1.0,0.047078,0.112186,-0.016913,0.36364,0.200965,-0.447104,0.056607,-0.070711,-0.101903,-0.003533
PM10,0.408681,0.290342,0.438195,0.392987,0.07994,-0.018869,0.047078,1.0,0.873341,0.862213,-0.2619,0.463282,0.060829,-0.14676,-0.039204,0.176273,-0.065525
PM2.5,0.411255,0.266864,0.428727,0.374388,0.075146,0.032631,0.112186,0.873341,1.0,0.543443,-0.099771,0.467222,0.004804,-0.143672,-0.090026,0.168621,-0.095235
PMCO,0.326043,0.275269,0.36719,0.349356,0.042044,-0.055211,-0.016913,0.862213,0.543443,1.0,-0.356693,0.370214,0.085202,-0.113288,0.007556,0.133964,-0.025654


In [24]:

#Evaluar estacionalidad en variables con mayor correlación
for i in var_corr:
  adfuller_test(i)


Test Statistics: -16.865990646433577
p-value: 1.0903741912637672e-29
Critial Values: {'1%': -3.430509299172611, '5%': -2.8616104066245764, '10%': -2.5668074751765704}
PM2.5
Series is Stationary

Test Statistics: -14.185462467663573
p-value: 1.8963047063639885e-26
Critial Values: {'1%': -3.430509295292133, '5%': -2.8616104049095394, '10%': -2.5668074742637033}
PM10
Series is Stationary

Test Statistics: -12.71356405243925
p-value: 1.017839685670432e-23
Critial Values: {'1%': -3.430509299172611, '5%': -2.8616104066245764, '10%': -2.5668074751765704}
PMCO
Series is Stationary

Test Statistics: -18.565113754106388
p-value: 2.0854047398012933e-30
Critial Values: {'1%': -3.430509299172611, '5%': -2.8616104066245764, '10%': -2.5668074751765704}
SO2
Series is Stationary

Test Statistics: -17.41901104423733
p-value: 4.8377984016005705e-30
Critial Values: {'1%': -3.430509306934134, '5%': -2.8616104100549014, '10%': -2.566807477002438}
NO2
Series is Stationary

Test Statistics: -13.28670867919054

In [25]:

#Evaluar causalidad entre PM25 y variables con mayor correlación
for j in var_corr[1:]:
  grangercausality_tests(df, 'PM2.5', j, 24)
for j in var_corr[1:]:
  grangercausality_tests(df, j , 'PM2.5', 24)



p values por lag entre PM2.5 y PM10
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y PMCO
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y SO2
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y NO2
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y CO
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y NOX
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y NO
[0.0, 0.0, 0.0, 0.0, 0.

In [26]:

#Evaluar estacionalidad en variables con menor correlación
for i in var_no_corr:
  adfuller_test(i)


Test Statistics: -13.389790034978997
p-value: 4.792753252003455e-25
Critial Values: {'1%': -3.430509306934134, '5%': -2.8616104100549014, '10%': -2.566807477002438}
PBa
Series is Stationary

Test Statistics: -12.345746343597801
p-value: 5.978054141778128e-23
Critial Values: {'1%': -3.430509306934134, '5%': -2.8616104100549014, '10%': -2.566807477002438}
RH
Series is Stationary

Test Statistics: -10.955446972307124
p-value: 8.585534597665999e-20
Critial Values: {'1%': -3.430509306934134, '5%': -2.8616104100549014, '10%': -2.566807477002438}
TMP
Series is Stationary

Test Statistics: -20.426733111426607
p-value: 0.0
Critial Values: {'1%': -3.4305092797721124, '5%': -2.861610398050226, '10%': -2.5668074706126798}
WDR
Series is Stationary

Test Statistics: -18.345114234641777
p-value: 2.2421279160776707e-30
Critial Values: {'1%': -3.430509306934134, '5%': -2.8616104100549014, '10%': -2.566807477002438}
WSP
Series is Stationary

Test Statistics: -3.5594177497421575
p-value: 0.00658598640100

In [27]:

#Evaluar causalidad entrePM2.5 y variables con menor correlación  
for j in var_no_corr:
  grangercausality_tests(df, 'PM2.5', j, 24)
for j in var_no_corr:
  grangercausality_tests(df, j, 'PM2.5', 24)      

p values por lag entre PM2.5 y PBa
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y RH
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y TMP
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y WDR
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y WSP
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y Month
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

p values por lag entre PM2.5 y X
[0.0, 0.0011, 0.0057, 0.