In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, rankdata
from sklearn.impute import KNNImputer
import seaborn as sns


In [2]:
def percentual_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def calcula_metricas(values_true, values_predicted):
    # Calcular métricas
    mae = mean_absolute_error(values_true, values_predicted)
    rmse = np.sqrt(mean_squared_error(values_true, values_predicted))
    correlation, _ = pearsonr(values_true, values_predicted)
    mape = percentual_error(values_true, values_predicted)

    # Exibir resultados
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"Mean Absolute Percentual Error (MAPE): {mape:.4f}")

    return {'mae': mae, 'rmse': rmse, 'corr' : correlation, 'mape' : mape}

In [4]:
df = pd.read_csv('data/vazoes_CA_20_23.csv')

In [5]:
df['timestamp'] = pd.to_datetime(df['Data'])

In [6]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp'],
      dtype='object')

# Imputação Estática de Dados

In [7]:
df['Media'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].mean())

In [8]:
df['Moda'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].mode().iloc[0])

In [9]:
df['Mediana'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].median())

In [10]:
df['LOCF'] = df['Vazao1_CA_30d'].fillna(method='ffill')

In [11]:
df['BOCF'] = df['Vazao1_CA_30d'].fillna(method='bfill')

# Imputação de Dados com Médias Móveis e Interpolação

In [12]:
#df['Media_Movel_3dias'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].rolling(window=3, min_periods=1).mean().shift(1))


In [13]:
#df['Media_Movel_7dias'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].rolling(window=7, min_periods=1).mean().shift(1))


In [14]:
#df['Media_Movel_15dias'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].rolling(window=15, min_periods=1).mean().shift(1))


In [15]:
df['InterpolacaoLinear'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].interpolate(method='linear'))


In [16]:
df['InterpolacaoSpline_ordem2'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].interpolate(method='spline', order=2))

In [17]:
df['InterpolacaoSpline_ordem3'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].interpolate(method='spline', order=3))

In [18]:
df['InterpolacaoPolinomial_ordem2'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].interpolate(method='polynomial', order=2))

In [19]:
df['InterpolacaoPolinomial_ordem3'] = df['Vazao1_CA_30d'].fillna(df['Vazao1_CA_30d'].interpolate(method='polynomial', order=3))

# Imputações KNN

In [20]:
df['timestamp_numeric'] = (df['timestamp'] - df['timestamp'].min()) / pd.Timedelta(days=1)

In [21]:
imputer = KNNImputer(n_neighbors=1) 
df['KNN_1k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [22]:
imputer = KNNImputer(n_neighbors=2) 
df['KNN_2k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [23]:
imputer = KNNImputer(n_neighbors=3) 
df['KNN_3k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [24]:
imputer = KNNImputer(n_neighbors=4) 
df['KNN_4k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [25]:
imputer = KNNImputer(n_neighbors=5) 
df['KNN_5k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [26]:
imputer = KNNImputer(n_neighbors=6) 
df['KNN_6k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [27]:
imputer = KNNImputer(n_neighbors=7) 
df['KNN_7k']  = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [28]:
imputer = KNNImputer(n_neighbors=8) 
df['KNN_8k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [29]:
imputer = KNNImputer(n_neighbors=9) 
df['KNN_9k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [30]:
imputer = KNNImputer(n_neighbors=10) 
df['KNN_10k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [31]:
imputer = KNNImputer(n_neighbors=15) 
df['KNN_15k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [32]:
imputer = KNNImputer(n_neighbors=30) 
df['KNN_30k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [33]:
imputer = KNNImputer(n_neighbors=90) 
df['KNN_90k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [34]:
imputer = KNNImputer(n_neighbors=180) 
df['KNN_180k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [35]:
imputer = KNNImputer(n_neighbors=365) 
df['KNN_365k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [36]:
imputer = KNNImputer(n_neighbors=1095) 
df['KNN_1095k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

In [37]:
imputer = KNNImputer(n_neighbors=1825) 
df['KNN_1825k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_30d', 'timestamp_numeric']])))

# Métricas


In [38]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp', 'Media', 'Moda',
       'Mediana', 'LOCF', 'BOCF', 'InterpolacaoLinear',
       'InterpolacaoSpline_ordem2', 'InterpolacaoSpline_ordem3',
       'InterpolacaoPolinomial_ordem2', 'InterpolacaoPolinomial_ordem3',
       'timestamp_numeric', 'KNN_1k', 'KNN_2k', 'KNN_3k', 'KNN_4k', 'KNN_5k',
       'KNN_6k', 'KNN_7k', 'KNN_8k', 'KNN_9k', 'KNN_10k', 'KNN_15k', 'KNN_30k',
       'KNN_90k', 'KNN_180k', 'KNN_365k', 'KNN_1095k', 'KNN_1825k'],
      dtype='object')

In [39]:
colunas = df.columns
nulos = df[df['Vazao1_CA_30d'].isnull()].index

In [40]:
nulos

Int64Index([3635, 3636, 3637, 3638, 3639, 3640, 3641, 3642, 3643, 3644, 3645,
            3646, 3647, 3648, 3649, 3650, 3651, 3652, 3653, 3654, 3655, 3656,
            3657, 3658, 3659, 3660, 3661, 3662, 3663, 3664, 4288, 4289, 4290,
            4291, 4292, 4293, 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301,
            4302, 4303, 4304, 4305, 4306, 4307, 4308, 4309, 4310, 4311, 4312,
            4313, 4314, 4315, 4316, 4317, 5580, 5581, 5582, 5583, 5584, 5585,
            5586, 5587, 5588, 5589, 5590, 5591, 5592, 5593, 5594, 5595, 5596,
            5597, 5598, 5599, 5600, 5601, 5602, 5603, 5604, 5605, 5606, 5607,
            5608, 5609],
           dtype='int64')

In [41]:
mae = []
rmse = []
corr = []
mape = []
for col in colunas[12:]:
    print('------------------')
    print('Método: ', col)
    
    dict_metrics = calcula_metricas(df['Vazao_CA'][nulos], df[col][nulos])  
    mae.append(dict_metrics['mae'])
    rmse.append(dict_metrics['rmse'])
    corr.append(dict_metrics['corr'])
    mape.append(dict_metrics['mape'])

------------------
Método:  Media
Mean Absolute Error (MAE): 169.2443
Root Mean Squared Error (RMSE): 271.4835
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 249.4077
------------------
Método:  Moda
Mean Absolute Error (MAE): 181.4889
Root Mean Squared Error (RMSE): 320.0556
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 74.0201
------------------
Método:  Mediana
Mean Absolute Error (MAE): 164.2222
Root Mean Squared Error (RMSE): 294.0905
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 130.2007
------------------
Método:  LOCF
Mean Absolute Error (MAE): 149.3778
Root Mean Squared Error (RMSE): 281.3447
Pearson Correlation: 0.2338
Mean Absolute Percentual Error (MAPE): 57.6322
------------------
Método:  BOCF
Mean Absolute Error (MAE): 129.4889
Root Mean Squared Error (RMSE): 255.3878
Pearson Correlation: 0.4257
Mean Absolute Percentual Error (MAPE): 53.2467
------------------
Método:  InterpolacaoLinear
Mean Absolute Error (MAE): 121.



In [42]:
metodos = colunas[12:].to_list()

In [43]:
mae.append(121.9127)
rmse.append(256.559)
corr.append(0.4573518)
mape.append(43.56375)
metodos.append('kalman_struct')

In [44]:
mae.append(129.4882)
rmse.append(261.6962)
corr.append(0.3755889)
mape.append(53.82489)
metodos.append('kalman_arima')

In [45]:
metricas = pd.DataFrame({'mae' : mae, 'rmse' : rmse, 'corr' : corr, 'mape' : mape, 'metodos' : metodos})


In [46]:
metricas.sort_values(by='mae').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
28,121.9127,256.559,0.457352,43.56375,kalman_struct
5,121.913978,256.559503,0.457307,43.555129,InterpolacaoLinear
29,129.4882,261.6962,0.375589,53.82489,kalman_arima
4,129.488889,255.38781,0.425749,53.246693,BOCF
23,134.651235,243.276444,0.472699,134.179587,KNN_90k
12,140.85,270.664671,0.31569,60.117357,KNN_2k
13,141.548148,263.807983,0.359342,63.641005,KNN_3k
11,143.333333,274.824227,0.287153,54.837402,KNN_1k
3,149.377778,281.344708,0.233765,57.632188,LOCF
24,155.990556,245.905444,0.437552,188.934391,KNN_180k


In [47]:
metricas.sort_values(by='rmse').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
23,134.651235,243.276444,0.472699,134.179587,KNN_90k
24,155.990556,245.905444,0.437552,188.934391,KNN_180k
25,184.038174,255.185214,0.449081,280.404415,KNN_365k
4,129.488889,255.38781,0.425749,53.246693,BOCF
28,121.9127,256.559,0.457352,43.56375,kalman_struct
5,121.913978,256.559503,0.457307,43.555129,InterpolacaoLinear
26,170.329031,258.907865,0.405114,238.319529,KNN_1095k
29,129.4882,261.6962,0.375589,53.82489,kalman_arima
13,141.548148,263.807983,0.359342,63.641005,KNN_3k
12,140.85,270.664671,0.31569,60.117357,KNN_2k


In [48]:
metricas.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
23,134.651235,243.276444,0.472699,134.179587,KNN_90k
28,121.9127,256.559,0.457352,43.56375,kalman_struct
5,121.913978,256.559503,0.457307,43.555129,InterpolacaoLinear
25,184.038174,255.185214,0.449081,280.404415,KNN_365k
24,155.990556,245.905444,0.437552,188.934391,KNN_180k
4,129.488889,255.38781,0.425749,53.246693,BOCF
26,170.329031,258.907865,0.405114,238.319529,KNN_1095k
22,157.052963,277.316305,0.38929,119.356282,KNN_30k
29,129.4882,261.6962,0.375589,53.82489,kalman_arima
13,141.548148,263.807983,0.359342,63.641005,KNN_3k


In [49]:
metricas.sort_values(by='mape').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
5,121.913978,256.559503,0.457307,43.555129,InterpolacaoLinear
28,121.9127,256.559,0.457352,43.56375,kalman_struct
4,129.488889,255.38781,0.425749,53.246693,BOCF
29,129.4882,261.6962,0.375589,53.82489,kalman_arima
11,143.333333,274.824227,0.287153,54.837402,KNN_1k
3,149.377778,281.344708,0.233765,57.632188,LOCF
12,140.85,270.664671,0.31569,60.117357,KNN_2k
13,141.548148,263.807983,0.359342,63.641005,KNN_3k
1,181.488889,320.055551,,74.020074,Moda
14,187.144444,315.406026,0.205785,94.675387,KNN_4k
