In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, rankdata
from sklearn.impute import KNNImputer
import seaborn as sns


In [2]:
def percentual_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def calcula_metricas(values_true, values_predicted):
    # Calcular métricas
    mae = mean_absolute_error(values_true, values_predicted)
    rmse = np.sqrt(mean_squared_error(values_true, values_predicted))
    correlation, _ = pearsonr(values_true, values_predicted)
    mape = percentual_error(values_true, values_predicted)

    # Exibir resultados
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"Mean Absolute Percentual Error (MAPE): {mape:.4f}")

    return {'mae': mae, 'rmse': rmse, 'corr' : correlation, 'mape' : mape}

In [4]:
df = pd.read_csv('data/vazoes_CA_20_23.csv')

In [5]:
df['timestamp'] = pd.to_datetime(df['Data'])

In [6]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp'],
      dtype='object')

# Imputação Estática de Dados

In [7]:
df['Media'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].mean())

In [8]:
df['Moda'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].mode().iloc[0])

In [9]:
df['Mediana'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].median())

In [10]:
df['LOCF'] = df['Vazao2_CA_15d'].fillna(method='ffill')

In [11]:
df['BOCF'] = df['Vazao2_CA_15d'].fillna(method='bfill')

# Imputação de Dados com Médias Móveis e Interpolação

In [12]:
#df['Media_Movel_3dias'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].rolling(window=3, min_periods=1).mean().shift(1))


In [13]:
#df['Media_Movel_7dias'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].rolling(window=7, min_periods=1).mean().shift(1))


In [14]:
df['Media_Movel_15dias'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].rolling(window=15, min_periods=1).mean().shift(1))


In [15]:
df['InterpolacaoLinear'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].interpolate(method='linear'))


In [16]:
df['InterpolacaoSpline_ordem2'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].interpolate(method='spline', order=2))

In [17]:
df['InterpolacaoSpline_ordem3'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].interpolate(method='spline', order=3))

In [18]:
df['InterpolacaoPolinomial_ordem2'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].interpolate(method='polynomial', order=2))

In [19]:
df['InterpolacaoPolinomial_ordem3'] = df['Vazao2_CA_15d'].fillna(df['Vazao2_CA_15d'].interpolate(method='polynomial', order=3))

# Imputações KNN

In [20]:
df['timestamp_numeric'] = (df['timestamp'] - df['timestamp'].min()) / pd.Timedelta(days=1)

In [21]:
imputer = KNNImputer(n_neighbors=1) 
df['KNN_1k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [22]:
imputer = KNNImputer(n_neighbors=2) 
df['KNN_2k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [23]:
imputer = KNNImputer(n_neighbors=3) 
df['KNN_3k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [24]:
imputer = KNNImputer(n_neighbors=4) 
df['KNN_4k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [25]:
imputer = KNNImputer(n_neighbors=5) 
df['KNN_5k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [26]:
imputer = KNNImputer(n_neighbors=6) 
df['KNN_6k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [27]:
imputer = KNNImputer(n_neighbors=7) 
df['KNN_7k']  = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [28]:
imputer = KNNImputer(n_neighbors=8) 
df['KNN_8k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [29]:
imputer = KNNImputer(n_neighbors=9) 
df['KNN_9k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [30]:
imputer = KNNImputer(n_neighbors=10) 
df['KNN_10k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [31]:
imputer = KNNImputer(n_neighbors=15) 
df['KNN_15k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [32]:
imputer = KNNImputer(n_neighbors=30) 
df['KNN_30k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [33]:
imputer = KNNImputer(n_neighbors=90) 
df['KNN_90k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [34]:
imputer = KNNImputer(n_neighbors=180) 
df['KNN_180k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [35]:
imputer = KNNImputer(n_neighbors=365) 
df['KNN_365k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [36]:
imputer = KNNImputer(n_neighbors=1095) 
df['KNN_1095k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

In [37]:
imputer = KNNImputer(n_neighbors=1825) 
df['KNN_1825k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_15d', 'timestamp_numeric']])))

# Métricas


In [38]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp', 'Media', 'Moda',
       'Mediana', 'LOCF', 'BOCF', 'Media_Movel_15dias', 'InterpolacaoLinear',
       'InterpolacaoSpline_ordem2', 'InterpolacaoSpline_ordem3',
       'InterpolacaoPolinomial_ordem2', 'InterpolacaoPolinomial_ordem3',
       'timestamp_numeric', 'KNN_1k', 'KNN_2k', 'KNN_3k', 'KNN_4k', 'KNN_5k',
       'KNN_6k', 'KNN_7k', 'KNN_8k', 'KNN_9k', 'KNN_10k', 'KNN_15k', 'KNN_30k',
       'KNN_90k', 'KNN_180k', 'KNN_365k', 'KNN_1095k', 'KNN_1825k'],
      dtype='object')

In [39]:
colunas = df.columns
nulos = df[df['Vazao2_CA_15d'].isnull()].index

In [40]:
nulos

Int64Index([ 127,  128,  129,  130,  131,  132,  133,  134,  135,  136,  137,
             138,  139,  140,  141,  865,  866,  867,  868,  869,  870,  871,
             872,  873,  874,  875,  876,  877,  878,  879, 1579, 1580, 1581,
            1582, 1583, 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1592,
            1593, 3682, 3683, 3684, 3685, 3686, 3687, 3688, 3689, 3690, 3691,
            3692, 3693, 3694, 3695, 3696, 5826, 5827, 5828, 5829, 5830, 5831,
            5832, 5833, 5834, 5835, 5836, 5837, 5838, 5839, 5840, 8255, 8256,
            8257, 8258, 8259, 8260, 8261, 8262, 8263, 8264, 8265, 8266, 8267,
            8268, 8269],
           dtype='int64')

In [41]:
mae = []
rmse = []
corr = []
mape = []
for col in colunas[12:]:
    print('------------------')
    print('Método: ', col)
    
    dict_metrics = calcula_metricas(df['Vazao_CA'][nulos], df[col][nulos])  
    mae.append(dict_metrics['mae'])
    rmse.append(dict_metrics['rmse'])
    corr.append(dict_metrics['corr'])
    mape.append(dict_metrics['mape'])

------------------
Método:  Media
Mean Absolute Error (MAE): 129.4789
Root Mean Squared Error (RMSE): 256.6013
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 94.9800
------------------
Método:  Moda
Mean Absolute Error (MAE): 142.8444
Root Mean Squared Error (RMSE): 293.6671
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 60.6368
------------------
Método:  Mediana
Mean Absolute Error (MAE): 108.6667
Root Mean Squared Error (RMSE): 271.5312
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 42.8077
------------------
Método:  LOCF
Mean Absolute Error (MAE): 146.1111
Root Mean Squared Error (RMSE): 233.0103
Pearson Correlation: 0.4521
Mean Absolute Percentual Error (MAPE): 107.5601
------------------
Método:  BOCF
Mean Absolute Error (MAE): 116.6889
Root Mean Squared Error (RMSE): 263.8587
Pearson Correlation: 0.1344
Mean Absolute Percentual Error (MAPE): 59.1008
------------------
Método:  Media_Movel_15dias
Mean Absolute Error (MAE): 160.6



In [42]:
metodos = colunas[12:].to_list()

In [43]:
mae.append(109.0216)
rmse.append(223.1329)
corr.append(0.5275505)
mape.append(61.09876)
metodos.append('kalman_struct')

In [44]:
mae.append(134.5402)
rmse.append(280.6771)
corr.append(0.2406752)
mape.append( 61.94842)
metodos.append('kalman_arima')

In [45]:
metricas = pd.DataFrame({'mae' : mae, 'rmse' : rmse, 'corr' : corr, 'mape' : mape, 'metodos' : metodos})


In [46]:
metricas.sort_values(by='mae').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
14,108.214815,199.912611,0.632327,62.078692,KNN_3k
2,108.666667,271.531173,,42.807686,Mediana
29,109.0216,223.1329,0.527551,61.09876,kalman_struct
6,109.025,223.17834,0.52732,61.099102,InterpolacaoLinear
21,109.295556,200.106322,0.637238,66.827596,KNN_10k
12,110.355556,211.675275,0.581493,63.781879,KNN_1k
20,111.667901,204.712955,0.606421,65.435657,KNN_9k
13,112.455556,208.029178,0.595852,63.324628,KNN_2k
19,112.709722,203.518731,0.609062,65.396879,KNN_8k
15,113.147222,205.41128,0.60261,63.369573,KNN_4k


In [56]:
metricas.sort_values(by='rmse').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
14,108.214815,199.912611,0.632327,62.078692,KNN_3k
21,109.295556,200.106322,0.637238,66.827596,KNN_10k
19,112.709722,203.518731,0.609062,65.396879,KNN_8k
16,116.202222,204.182849,0.608441,66.396427,KNN_5k
20,111.667901,204.712955,0.606421,65.435657,KNN_9k
15,113.147222,205.41128,0.60261,63.369573,KNN_4k
13,112.455556,208.029178,0.595852,63.324628,KNN_2k
12,110.355556,211.675275,0.581493,63.781879,KNN_1k
18,119.452381,218.484926,0.538656,69.045277,KNN_7k
22,117.820741,219.207574,0.534747,84.827475,KNN_15k


In [54]:
metricas.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
21,109.295556,200.106322,0.637238,66.827596,KNN_10k
14,108.214815,199.912611,0.632327,62.078692,KNN_3k
19,112.709722,203.518731,0.609062,65.396879,KNN_8k
16,116.202222,204.182849,0.608441,66.396427,KNN_5k
20,111.667901,204.712955,0.606421,65.435657,KNN_9k
15,113.147222,205.41128,0.60261,63.369573,KNN_4k
13,112.455556,208.029178,0.595852,63.324628,KNN_2k
12,110.355556,211.675275,0.581493,63.781879,KNN_1k
18,119.452381,218.484926,0.538656,69.045277,KNN_7k
22,117.820741,219.207574,0.534747,84.827475,KNN_15k


In [49]:
metricas.sort_values(by='mape').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
2,108.666667,271.531173,,42.807686,Mediana
4,116.688889,263.858716,0.134411,59.100819,BOCF
1,142.844444,293.667121,,60.636776,Moda
29,109.0216,223.1329,0.527551,61.09876,kalman_struct
6,109.025,223.17834,0.52732,61.099102,InterpolacaoLinear
30,134.5402,280.6771,0.240675,61.94842,kalman_arima
14,108.214815,199.912611,0.632327,62.078692,KNN_3k
13,112.455556,208.029178,0.595852,63.324628,KNN_2k
15,113.147222,205.41128,0.60261,63.369573,KNN_4k
12,110.355556,211.675275,0.581493,63.781879,KNN_1k
