In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, rankdata
from sklearn.impute import KNNImputer
import seaborn as sns


In [3]:
def percentual_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [4]:
def calcula_metricas(values_true, values_predicted):
    # Calcular métricas
    mae = mean_absolute_error(values_true, values_predicted)
    rmse = np.sqrt(mean_squared_error(values_true, values_predicted))
    correlation, _ = pearsonr(values_true, values_predicted)
    mape = percentual_error(values_true, values_predicted)

    # Exibir resultados
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"Mean Absolute Percentual Error (MAPE): {mape:.4f}")

    return {'mae': mae, 'rmse': rmse, 'corr' : correlation, 'mape' : mape}

In [5]:
df = pd.read_csv('data/vazoes_CA_20_23.csv')

In [6]:
df['timestamp'] = pd.to_datetime(df['Data'])

In [7]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp'],
      dtype='object')

# Imputação Estática de Dados

In [8]:
df['Media'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].mean())

In [9]:
df['Moda'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].mode().iloc[0])

In [10]:
df['Mediana'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].median())

In [11]:
df['LOCF'] = df['Vazao1_CA_7d'].fillna(method='ffill')

In [12]:
df['BOCF'] = df['Vazao1_CA_7d'].fillna(method='bfill')

# Imputação de Dados com Médias Móveis e Interpolação

In [13]:
df['Vazao1_CA_7d'].head(15)

0     335.0
1     329.0
2     278.0
3     250.0
4     183.0
5     362.0
6     714.0
7     402.0
8     259.0
9     183.0
10    146.0
11    124.0
12    113.0
13    107.0
14     94.0
Name: Vazao1_CA_7d, dtype: float64

In [14]:
#df['Media_Movel_3dias'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].rolling(window=3, min_periods=1).mean().shift(1))


In [15]:
df['Media_Movel_7dias'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].rolling(window=7, min_periods=1).mean().shift(1))


In [16]:
df['Media_Movel_15dias'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].rolling(window=15, min_periods=1).mean().shift(1))


In [17]:
df['InterpolacaoLinear'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].interpolate(method='linear'))


In [18]:
df['InterpolacaoSpline_ordem2'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].interpolate(method='spline', order=2))

In [19]:
df['InterpolacaoSpline_ordem3'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].interpolate(method='spline', order=3))

In [20]:
df['InterpolacaoPolinomial_ordem2'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].interpolate(method='polynomial', order=2))

In [21]:
df['InterpolacaoPolinomial_ordem3'] = df['Vazao1_CA_7d'].fillna(df['Vazao1_CA_7d'].interpolate(method='polynomial', order=3))

# Imputações KNN

In [22]:
df['timestamp_numeric'] = (df['timestamp'] - df['timestamp'].min()) / pd.Timedelta(days=1)

In [23]:
imputer = KNNImputer(n_neighbors=1) 
df['KNN_1k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [24]:
imputer = KNNImputer(n_neighbors=2) 
df['KNN_2k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [25]:
imputer = KNNImputer(n_neighbors=3) 
df['KNN_3k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [26]:
imputer = KNNImputer(n_neighbors=4) 
df['KNN_4k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [27]:
imputer = KNNImputer(n_neighbors=5) 
df['KNN_5k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [28]:
imputer = KNNImputer(n_neighbors=6) 
df['KNN_6k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [29]:
imputer = KNNImputer(n_neighbors=7) 
df['KNN_7k']  = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [30]:
imputer = KNNImputer(n_neighbors=8) 
df['KNN_8k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [31]:
imputer = KNNImputer(n_neighbors=9) 
df['KNN_9k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [32]:
imputer = KNNImputer(n_neighbors=10) 
df['KNN_10k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [33]:
imputer = KNNImputer(n_neighbors=15) 
df['KNN_15k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [34]:
imputer = KNNImputer(n_neighbors=30) 
df['KNN_30k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [35]:
imputer = KNNImputer(n_neighbors=90) 
df['KNN_90k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [36]:
imputer = KNNImputer(n_neighbors=180) 
df['KNN_180k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [37]:
imputer = KNNImputer(n_neighbors=365) 
df['KNN_365k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [38]:
imputer = KNNImputer(n_neighbors=1095) 
df['KNN_1095k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

In [39]:
imputer = KNNImputer(n_neighbors=1825) 
df['KNN_1825k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao1_CA_7d', 'timestamp_numeric']])))

# Métricas


In [40]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp', 'Media', 'Moda',
       'Mediana', 'LOCF', 'BOCF', 'Media_Movel_7dias', 'Media_Movel_15dias',
       'InterpolacaoLinear', 'InterpolacaoSpline_ordem2',
       'InterpolacaoSpline_ordem3', 'InterpolacaoPolinomial_ordem2',
       'InterpolacaoPolinomial_ordem3', 'timestamp_numeric', 'KNN_1k',
       'KNN_2k', 'KNN_3k', 'KNN_4k', 'KNN_5k', 'KNN_6k', 'KNN_7k', 'KNN_8k',
       'KNN_9k', 'KNN_10k', 'KNN_15k', 'KNN_30k', 'KNN_90k', 'KNN_180k',
       'KNN_365k', 'KNN_1095k', 'KNN_1825k'],
      dtype='object')

In [41]:
colunas = df.columns
nulos = df[df['Vazao1_CA_7d'].isnull()].index

In [42]:
nulos

Int64Index([1420, 1421, 1422, 1423, 1424, 1425, 1426, 3098, 3099, 3100, 3101,
            3102, 3103, 3104, 3152, 3153, 3154, 3155, 3156, 3157, 3158, 4128,
            4129, 4130, 4131, 4132, 4133, 4134, 4711, 4712, 4713, 4714, 4715,
            4716, 4717, 4791, 4792, 4793, 4794, 4795, 4796, 4797, 5192, 5193,
            5194, 5195, 5196, 5197, 5198, 5846, 5847, 5848, 5849, 5850, 5851,
            5852, 6139, 6140, 6141, 6142, 6143, 6144, 6145, 6640, 6641, 6642,
            6643, 6644, 6645, 6646, 7355, 7356, 7357, 7358, 7359, 7360, 7361,
            7630, 7631, 7632, 7633, 7634, 7635, 7636, 7692, 7693, 7694, 7695,
            7696, 7697, 7698, 7981, 7982, 7983, 7984, 7985, 7986, 7987],
           dtype='int64')

In [43]:
mae = []
rmse = []
corr = []
mape = []
for col in colunas[12:]:
    print('------------------')
    print('Método: ', col)
    
    dict_metrics = calcula_metricas(df['Vazao_CA'][nulos], df[col][nulos])  
    mae.append(dict_metrics['mae'])
    rmse.append(dict_metrics['rmse'])
    corr.append(dict_metrics['corr'])
    mape.append(dict_metrics['mape'])

------------------
Método:  Media
Mean Absolute Error (MAE): 136.3428
Root Mean Squared Error (RMSE): 183.0950
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 399.1054
------------------
Método:  Moda
Mean Absolute Error (MAE): 97.7143
Root Mean Squared Error (RMSE): 196.6352
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 78.2805
------------------
Método:  Mediana
Mean Absolute Error (MAE): 94.2449
Root Mean Squared Error (RMSE): 176.6283
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 188.4608
------------------
Método:  LOCF
Mean Absolute Error (MAE): 103.3673
Root Mean Squared Error (RMSE): 206.2930
Pearson Correlation: 0.2290
Mean Absolute Percentual Error (MAPE): 66.5972
------------------
Método:  BOCF
Mean Absolute Error (MAE): 67.1633
Root Mean Squared Error (RMSE): 132.2655
Pearson Correlation: 0.7790
Mean Absolute Percentual Error (MAPE): 61.8178
------------------
Método:  Media_Movel_7dias
Mean Absolute Error (MAE): 93.7511




In [44]:
metodos = colunas[12:].to_list()

In [45]:
mae.append(55.14676)
rmse.append(103.4991)
corr.append(0.8086608)
mape.append(36.69077)
metodos.append('kalman_struct')

In [46]:
mae.append(58.73604)
rmse.append(124.0231)
corr.append(0.7053417)
mape.append(39.84651)
metodos.append('kalman_arima')

In [47]:
metricas = pd.DataFrame({'mae' : mae, 'rmse' : rmse, 'corr' : corr, 'mape' : mape, 'metodos' : metodos})


In [48]:
metricas.sort_values(by='mae').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
7,55.102041,103.369242,0.809181,36.657828,InterpolacaoLinear
30,55.14676,103.4991,0.808661,36.69077,kalman_struct
31,58.73604,124.0231,0.705342,39.84651,kalman_arima
13,64.244898,128.572381,0.737108,39.638057,KNN_1k
4,67.163265,132.265503,0.77898,61.817791,BOCF
14,68.30102,136.959559,0.682185,44.609547,KNN_2k
8,73.33129,145.277136,0.815047,51.645637,InterpolacaoSpline_ordem2
10,74.291412,146.959836,0.812791,52.232128,InterpolacaoPolinomial_ordem2
25,75.048299,129.05641,0.694525,107.965149,KNN_90k
6,79.046203,174.179331,0.274275,55.260141,Media_Movel_15dias


In [49]:
metricas.sort_values(by='rmse').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
7,55.102041,103.369242,0.809181,36.657828,InterpolacaoLinear
30,55.14676,103.4991,0.808661,36.69077,kalman_struct
31,58.73604,124.0231,0.705342,39.84651,kalman_arima
13,64.244898,128.572381,0.737108,39.638057,KNN_1k
25,75.048299,129.05641,0.694525,107.965149,KNN_90k
4,67.163265,132.265503,0.77898,61.817791,BOCF
14,68.30102,136.959559,0.682185,44.609547,KNN_2k
26,98.736054,144.37497,0.636432,307.934715,KNN_180k
8,73.33129,145.277136,0.815047,51.645637,InterpolacaoSpline_ordem2
10,74.291412,146.959836,0.812791,52.232128,InterpolacaoPolinomial_ordem2


In [50]:
metricas.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
8,73.33129,145.277136,0.815047,51.645637,InterpolacaoSpline_ordem2
10,74.291412,146.959836,0.812791,52.232128,InterpolacaoPolinomial_ordem2
7,55.102041,103.369242,0.809181,36.657828,InterpolacaoLinear
30,55.14676,103.4991,0.808661,36.69077,kalman_struct
4,67.163265,132.265503,0.77898,61.817791,BOCF
13,64.244898,128.572381,0.737108,39.638057,KNN_1k
11,97.698379,201.167103,0.733996,68.733532,InterpolacaoPolinomial_ordem3
9,97.959593,201.231923,0.733262,68.145319,InterpolacaoSpline_ordem3
31,58.73604,124.0231,0.705342,39.84651,kalman_arima
25,75.048299,129.05641,0.694525,107.965149,KNN_90k


In [51]:
metricas.sort_values(by='mape').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
7,55.102041,103.369242,0.809181,36.657828,InterpolacaoLinear
30,55.14676,103.4991,0.808661,36.69077,kalman_struct
13,64.244898,128.572381,0.737108,39.638057,KNN_1k
31,58.73604,124.0231,0.705342,39.84651,kalman_arima
14,68.30102,136.959559,0.682185,44.609547,KNN_2k
8,73.33129,145.277136,0.815047,51.645637,InterpolacaoSpline_ordem2
10,74.291412,146.959836,0.812791,52.232128,InterpolacaoPolinomial_ordem2
6,79.046203,174.179331,0.274275,55.260141,Media_Movel_15dias
4,67.163265,132.265503,0.77898,61.817791,BOCF
15,92.72449,254.047885,0.389766,63.646006,KNN_3k
