In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, rankdata
from sklearn.impute import KNNImputer
import seaborn as sns


In [2]:
def percentual_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def calcula_metricas(values_true, values_predicted):
    # Calcular métricas
    mae = mean_absolute_error(values_true, values_predicted)
    rmse = np.sqrt(mean_squared_error(values_true, values_predicted))
    correlation, _ = pearsonr(values_true, values_predicted)
    mape = percentual_error(values_true, values_predicted)

    # Exibir resultados
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"Mean Absolute Percentual Error (MAPE): {mape:.4f}")

    return {'mae': mae, 'rmse': rmse, 'corr' : correlation, 'mape' : mape}

In [4]:
df = pd.read_csv('data/vazoes_CA_20_23.csv')

In [5]:
df['timestamp'] = pd.to_datetime(df['Data'])

In [6]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp'],
      dtype='object')

# Imputação Estática de Dados

In [7]:
df['Media'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].mean())

In [8]:
df['Moda'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].mode().iloc[0])

In [9]:
df['Mediana'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].median())

In [10]:
df['LOCF'] = df['Vazao2_CA_1d'].fillna(method='ffill')

In [11]:
df['BOCF'] = df['Vazao2_CA_1d'].fillna(method='bfill')

# Imputação de Dados com Médias Móveis e Interpolação

In [12]:
df['Media_Movel_3dias'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].rolling(window=3, min_periods=1).mean().shift(1))


In [13]:
df['Media_Movel_7dias'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].rolling(window=7, min_periods=1).mean().shift(1))


In [14]:
df['Media_Movel_15dias'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].rolling(window=15, min_periods=1).mean().shift(1))


In [15]:
df['InterpolacaoLinear'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].interpolate(method='linear'))


In [16]:
df['InterpolacaoSpline_ordem2'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].interpolate(method='spline', order=2))

In [17]:
df['InterpolacaoSpline_ordem3'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].interpolate(method='spline', order=3))

In [18]:
df['InterpolacaoPolinomial_ordem2'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].interpolate(method='polynomial', order=2))

In [19]:
df['InterpolacaoPolinomial_ordem3'] = df['Vazao2_CA_1d'].fillna(df['Vazao2_CA_1d'].interpolate(method='polynomial', order=3))

# Imputações KNN

In [20]:
df['timestamp_numeric'] = (df['timestamp'] - df['timestamp'].min()) / pd.Timedelta(days=1)

In [21]:
imputer = KNNImputer(n_neighbors=1) 
df['KNN_1k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [22]:
imputer = KNNImputer(n_neighbors=2) 
df['KNN_2k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [23]:
imputer = KNNImputer(n_neighbors=3) 
df['KNN_3k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [24]:
imputer = KNNImputer(n_neighbors=4) 
df['KNN_4k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [25]:
imputer = KNNImputer(n_neighbors=5) 
df['KNN_5k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [26]:
imputer = KNNImputer(n_neighbors=6) 
df['KNN_6k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [27]:
imputer = KNNImputer(n_neighbors=7) 
df['KNN_7k']  = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [28]:
imputer = KNNImputer(n_neighbors=8) 
df['KNN_8k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [29]:
imputer = KNNImputer(n_neighbors=9) 
df['KNN_9k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [30]:
imputer = KNNImputer(n_neighbors=10) 
df['KNN_10k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [31]:
imputer = KNNImputer(n_neighbors=15) 
df['KNN_15k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [32]:
imputer = KNNImputer(n_neighbors=30) 
df['KNN_30k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [33]:
imputer = KNNImputer(n_neighbors=90) 
df['KNN_90k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [34]:
imputer = KNNImputer(n_neighbors=180) 
df['KNN_180k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [35]:
imputer = KNNImputer(n_neighbors=365) 
df['KNN_365k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [36]:
imputer = KNNImputer(n_neighbors=1095) 
df['KNN_1095k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

In [37]:
imputer = KNNImputer(n_neighbors=1825) 
df['KNN_1825k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_1d', 'timestamp_numeric']])))

# Métricas


In [38]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp', 'Media', 'Moda',
       'Mediana', 'LOCF', 'BOCF', 'Media_Movel_3dias', 'Media_Movel_7dias',
       'Media_Movel_15dias', 'InterpolacaoLinear', 'InterpolacaoSpline_ordem2',
       'InterpolacaoSpline_ordem3', 'InterpolacaoPolinomial_ordem2',
       'InterpolacaoPolinomial_ordem3', 'timestamp_numeric', 'KNN_1k',
       'KNN_2k', 'KNN_3k', 'KNN_4k', 'KNN_5k', 'KNN_6k', 'KNN_7k', 'KNN_8k',
       'KNN_9k', 'KNN_10k', 'KNN_15k', 'KNN_30k', 'KNN_90k', 'KNN_180k',
       'KNN_365k', 'KNN_1095k', 'KNN_1825k'],
      dtype='object')

In [39]:
colunas = df.columns
nulos = df[df['Vazao2_CA_1d'].isnull()].index

In [41]:
mae = []
rmse = []
corr = []
mape = []
for col in colunas[12:]:
    print('------------------')
    print('Método: ', col)
    
    dict_metrics = calcula_metricas(df['Vazao_CA'][nulos], df[col][nulos])  
    mae.append(dict_metrics['mae'])
    rmse.append(dict_metrics['rmse'])
    corr.append(dict_metrics['corr'])
    mape.append(dict_metrics['mape'])

------------------
Método:  Media
Mean Absolute Error (MAE): 159.3092
Root Mean Squared Error (RMSE): 424.8391
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 258.2716
------------------
Método:  Moda
Mean Absolute Error (MAE): 135.5700
Root Mean Squared Error (RMSE): 444.0783
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 64.7059
------------------
Método:  Mediana
Mean Absolute Error (MAE): 126.7900
Root Mean Squared Error (RMSE): 431.2234
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 118.9377
------------------
Método:  LOCF
Mean Absolute Error (MAE): 65.7900
Root Mean Squared Error (RMSE): 266.5662
Pearson Correlation: 0.7893
Mean Absolute Percentual Error (MAPE): 23.3191
------------------
Método:  BOCF
Mean Absolute Error (MAE): 54.9900
Root Mean Squared Error (RMSE): 217.9721
Pearson Correlation: 0.9195
Mean Absolute Percentual Error (MAPE): 23.5082
------------------
Método:  Media_Movel_3dias
Mean Absolute Error (MAE): 90.9300



In [42]:
metodos = colunas[12:].to_list()

In [43]:
mae.append(46.77236)
rmse.append(225.2246)
corr.append(0.9082725)
mape.append(16.14101)
metodos.append('kalman_struct')

In [44]:
mae.append(43.8381)
rmse.append(211.16)
corr.append(0.9315108)
mape.append(16.60978)
metodos.append('kalman_arima')

In [45]:
metricas = pd.DataFrame({'mae' : mae, 'rmse' : rmse, 'corr' : corr, 'mape' : mape, 'metodos' : metodos})


In [46]:
metricas.sort_values(by='mae').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
9,41.880677,186.719402,0.943062,15.020281,InterpolacaoSpline_ordem2
11,41.94034,186.59895,0.943156,15.170232,InterpolacaoPolinomial_ordem2
12,42.510599,181.368753,0.947591,17.11125,InterpolacaoPolinomial_ordem3
10,42.575414,181.396388,0.947551,17.424848,InterpolacaoSpline_ordem3
32,43.8381,211.16,0.931511,16.60978,kalman_arima
8,46.52,224.081326,0.909426,16.006915,InterpolacaoLinear
31,46.77236,225.2246,0.908273,16.14101,kalman_struct
15,47.215,224.233712,0.908751,16.391106,KNN_2k
4,54.99,217.972085,0.919491,23.508242,BOCF
16,55.83,270.330286,0.858203,20.907765,KNN_3k


In [47]:
metricas.sort_values(by='rmse').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
12,42.510599,181.368753,0.947591,17.11125,InterpolacaoPolinomial_ordem3
10,42.575414,181.396388,0.947551,17.424848,InterpolacaoSpline_ordem3
11,41.94034,186.59895,0.943156,15.170232,InterpolacaoPolinomial_ordem2
9,41.880677,186.719402,0.943062,15.020281,InterpolacaoSpline_ordem2
32,43.8381,211.16,0.931511,16.60978,kalman_arima
4,54.99,217.972085,0.919491,23.508242,BOCF
8,46.52,224.081326,0.909426,16.006915,InterpolacaoLinear
15,47.215,224.233712,0.908751,16.391106,KNN_2k
31,46.77236,225.2246,0.908273,16.14101,kalman_struct
14,65.23,266.467428,0.789683,22.991631,KNN_1k


In [48]:
metricas.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
12,42.510599,181.368753,0.947591,17.11125,InterpolacaoPolinomial_ordem3
10,42.575414,181.396388,0.947551,17.424848,InterpolacaoSpline_ordem3
11,41.94034,186.59895,0.943156,15.170232,InterpolacaoPolinomial_ordem2
9,41.880677,186.719402,0.943062,15.020281,InterpolacaoSpline_ordem2
32,43.8381,211.16,0.931511,16.60978,kalman_arima
4,54.99,217.972085,0.919491,23.508242,BOCF
8,46.52,224.081326,0.909426,16.006915,InterpolacaoLinear
15,47.215,224.233712,0.908751,16.391106,KNN_2k
31,46.77236,225.2246,0.908273,16.14101,kalman_struct
16,55.83,270.330286,0.858203,20.907765,KNN_3k


In [49]:
metricas.sort_values(by='mape').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
9,41.880677,186.719402,0.943062,15.020281,InterpolacaoSpline_ordem2
11,41.94034,186.59895,0.943156,15.170232,InterpolacaoPolinomial_ordem2
8,46.52,224.081326,0.909426,16.006915,InterpolacaoLinear
31,46.77236,225.2246,0.908273,16.14101,kalman_struct
15,47.215,224.233712,0.908751,16.391106,KNN_2k
32,43.8381,211.16,0.931511,16.60978,kalman_arima
12,42.510599,181.368753,0.947591,17.11125,InterpolacaoPolinomial_ordem3
10,42.575414,181.396388,0.947551,17.424848,InterpolacaoSpline_ordem3
16,55.83,270.330286,0.858203,20.907765,KNN_3k
17,60.1675,292.074703,0.798686,22.747529,KNN_4k
