In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, rankdata
from sklearn.impute import KNNImputer
import seaborn as sns


In [2]:
def percentual_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def calcula_metricas(values_true, values_predicted):
    # Calcular métricas
    mae = mean_absolute_error(values_true, values_predicted)
    rmse = np.sqrt(mean_squared_error(values_true, values_predicted))
    correlation, _ = pearsonr(values_true, values_predicted)
    mape = percentual_error(values_true, values_predicted)

    # Exibir resultados
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"Mean Absolute Percentual Error (MAPE): {mape:.4f}")

    return {'mae': mae, 'rmse': rmse, 'corr' : correlation, 'mape' : mape}

In [4]:
df = pd.read_csv('data/vazoes_CA_20_23.csv')

In [5]:
df['timestamp'] = pd.to_datetime(df['Data'])

In [6]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp'],
      dtype='object')

# Imputação Estática de Dados

In [7]:
df['Media'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].mean())

In [8]:
df['Moda'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].mode().iloc[0])

In [9]:
df['Mediana'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].median())

In [10]:
df['LOCF'] = df['Vazao2_CA_30d'].fillna(method='ffill')

In [11]:
df['BOCF'] = df['Vazao2_CA_30d'].fillna(method='bfill')

# Imputação de Dados com Médias Móveis e Interpolação

In [12]:
#df['Media_Movel_3dias'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].rolling(window=3, min_periods=1).mean().shift(1))


In [13]:
#df['Media_Movel_7dias'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].rolling(window=7, min_periods=1).mean().shift(1))


In [14]:
df['Media_Movel_15dias'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].rolling(window=15, min_periods=1).mean().shift(1))


In [15]:
df['InterpolacaoLinear'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].interpolate(method='linear'))


In [16]:
df['InterpolacaoSpline_ordem2'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].interpolate(method='spline', order=2))

In [17]:
df['InterpolacaoSpline_ordem3'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].interpolate(method='spline', order=3))

In [18]:
df['InterpolacaoPolinomial_ordem2'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].interpolate(method='polynomial', order=2))

In [19]:
df['InterpolacaoPolinomial_ordem3'] = df['Vazao2_CA_30d'].fillna(df['Vazao2_CA_30d'].interpolate(method='polynomial', order=3))

# Imputações KNN

In [20]:
df['timestamp_numeric'] = (df['timestamp'] - df['timestamp'].min()) / pd.Timedelta(days=1)

In [21]:
imputer = KNNImputer(n_neighbors=1) 
df['KNN_1k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [22]:
imputer = KNNImputer(n_neighbors=2) 
df['KNN_2k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [23]:
imputer = KNNImputer(n_neighbors=3) 
df['KNN_3k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [24]:
imputer = KNNImputer(n_neighbors=4) 
df['KNN_4k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [25]:
imputer = KNNImputer(n_neighbors=5) 
df['KNN_5k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [26]:
imputer = KNNImputer(n_neighbors=6) 
df['KNN_6k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [27]:
imputer = KNNImputer(n_neighbors=7) 
df['KNN_7k']  = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [28]:
imputer = KNNImputer(n_neighbors=8) 
df['KNN_8k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [29]:
imputer = KNNImputer(n_neighbors=9) 
df['KNN_9k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [30]:
imputer = KNNImputer(n_neighbors=10) 
df['KNN_10k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [31]:
imputer = KNNImputer(n_neighbors=15) 
df['KNN_15k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [32]:
imputer = KNNImputer(n_neighbors=30) 
df['KNN_30k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [33]:
imputer = KNNImputer(n_neighbors=90) 
df['KNN_90k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [34]:
imputer = KNNImputer(n_neighbors=180) 
df['KNN_180k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [35]:
imputer = KNNImputer(n_neighbors=365) 
df['KNN_365k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [36]:
imputer = KNNImputer(n_neighbors=1095) 
df['KNN_1095k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

In [37]:
imputer = KNNImputer(n_neighbors=1825) 
df['KNN_1825k'] = list(map(lambda x: x[0], imputer.fit_transform(df[['Vazao2_CA_30d', 'timestamp_numeric']])))

# Métricas


In [38]:
df.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d', 'timestamp', 'Media', 'Moda',
       'Mediana', 'LOCF', 'BOCF', 'Media_Movel_15dias', 'InterpolacaoLinear',
       'InterpolacaoSpline_ordem2', 'InterpolacaoSpline_ordem3',
       'InterpolacaoPolinomial_ordem2', 'InterpolacaoPolinomial_ordem3',
       'timestamp_numeric', 'KNN_1k', 'KNN_2k', 'KNN_3k', 'KNN_4k', 'KNN_5k',
       'KNN_6k', 'KNN_7k', 'KNN_8k', 'KNN_9k', 'KNN_10k', 'KNN_15k', 'KNN_30k',
       'KNN_90k', 'KNN_180k', 'KNN_365k', 'KNN_1095k', 'KNN_1825k'],
      dtype='object')

In [39]:
colunas = df.columns
nulos = df[df['Vazao2_CA_30d'].isnull()].index

In [40]:
nulos

Int64Index([3757, 3758, 3759, 3760, 3761, 3762, 3763, 3764, 3765, 3766, 3767,
            3768, 3769, 3770, 3771, 3772, 3773, 3774, 3775, 3776, 3777, 3778,
            3779, 3780, 3781, 3782, 3783, 3784, 3785, 3786, 4394, 4395, 4396,
            4397, 4398, 4399, 4400, 4401, 4402, 4403, 4404, 4405, 4406, 4407,
            4408, 4409, 4410, 4411, 4412, 4413, 4414, 4415, 4416, 4417, 4418,
            4419, 4420, 4421, 4422, 4423, 4780, 4781, 4782, 4783, 4784, 4785,
            4786, 4787, 4788, 4789, 4790, 4791, 4792, 4793, 4794, 4795, 4796,
            4797, 4798, 4799, 4800, 4801, 4802, 4803, 4804, 4805, 4806, 4807,
            4808, 4809],
           dtype='int64')

In [41]:
mae = []
rmse = []
corr = []
mape = []
for col in colunas[12:]:
    print('------------------')
    print('Método: ', col)
    
    dict_metrics = calcula_metricas(df['Vazao_CA'][nulos], df[col][nulos])  
    mae.append(dict_metrics['mae'])
    rmse.append(dict_metrics['rmse'])
    corr.append(dict_metrics['corr'])
    mape.append(dict_metrics['mape'])

------------------
Método:  Media
Mean Absolute Error (MAE): 152.7753
Root Mean Squared Error (RMSE): 235.0344
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 384.4330
------------------
Método:  Moda
Mean Absolute Error (MAE): 95.2111
Root Mean Squared Error (RMSE): 241.3570
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 73.1879
------------------
Método:  Mediana
Mean Absolute Error (MAE): 98.8111
Root Mean Squared Error (RMSE): 228.8383
Pearson Correlation: nan
Mean Absolute Percentual Error (MAPE): 173.9346
------------------
Método:  LOCF
Mean Absolute Error (MAE): 75.7222
Root Mean Squared Error (RMSE): 224.8607
Pearson Correlation: 0.3882
Mean Absolute Percentual Error (MAPE): 54.0282
------------------
Método:  BOCF
Mean Absolute Error (MAE): 95.5000
Root Mean Squared Error (RMSE): 203.1769
Pearson Correlation: 0.4567
Mean Absolute Percentual Error (MAPE): 68.3088
------------------
Método:  Media_Movel_15dias




ValueError: Input contains NaN.

In [None]:
metricas = pd.DataFrame({'mae' : mae, 'rmse' : rmse, 'corr' : corr, 'mape' : mape, 'metodos' : colunas[12:]})


In [None]:
metricas.sort_values(by='mae').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
4,82.111111,140.589947,0.653778,55.833444,BOCF
6,88.436111,165.736649,0.457038,58.537324,InterpolacaoLinear
12,99.4,183.197222,0.328398,62.113644,KNN_1k
2,104.833333,199.619221,,60.104539,Mediana
3,116.066667,203.916377,0.118761,74.299964,LOCF
28,118.224584,183.847951,0.176625,119.411801,KNN_1825k
27,118.820812,180.085093,0.250312,123.517069,KNN_1095k
13,119.3,227.47332,0.277709,91.444809,KNN_2k
26,120.983135,181.571199,0.211436,117.29432,KNN_365k
0,125.66717,186.012796,,131.102577,Media


In [None]:
metricas.sort_values(by='rmse').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
4,82.111111,140.589947,0.653778,55.833444,BOCF
6,88.436111,165.736649,0.457038,58.537324,InterpolacaoLinear
27,118.820812,180.085093,0.250312,123.517069,KNN_1095k
26,120.983135,181.571199,0.211436,117.29432,KNN_365k
12,99.4,183.197222,0.328398,62.113644,KNN_1k
28,118.224584,183.847951,0.176625,119.411801,KNN_1825k
0,125.66717,186.012796,,131.102577,Media
24,127.453951,186.79957,0.238174,101.983613,KNN_90k
25,143.24679,192.758512,0.137011,133.781879,KNN_180k
2,104.833333,199.619221,,60.104539,Mediana


In [None]:
metricas.sort_values(by='corr', ascending=False).head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
4,82.111111,140.589947,0.653778,55.833444,BOCF
6,88.436111,165.736649,0.457038,58.537324,InterpolacaoLinear
12,99.4,183.197222,0.328398,62.113644,KNN_1k
13,119.3,227.47332,0.277709,91.444809,KNN_2k
14,127.644444,254.866453,0.255291,97.628247,KNN_3k
27,118.820812,180.085093,0.250312,123.517069,KNN_1095k
24,127.453951,186.79957,0.238174,101.983613,KNN_90k
18,127.403175,225.698852,0.237984,102.260491,KNN_7k
11,3553.666667,4615.037108,0.237215,3070.873773,timestamp_numeric
15,126.569444,246.588297,0.234378,99.944531,KNN_4k


In [None]:
metricas.sort_values(by='mape').head(10)

Unnamed: 0,mae,rmse,corr,mape,metodos
4,82.111111,140.589947,0.653778,55.833444,BOCF
1,129.611111,225.743832,,58.048726,Moda
6,88.436111,165.736649,0.457038,58.537324,InterpolacaoLinear
2,104.833333,199.619221,,60.104539,Mediana
12,99.4,183.197222,0.328398,62.113644,KNN_1k
3,116.066667,203.916377,0.118761,74.299964,LOCF
13,119.3,227.47332,0.277709,91.444809,KNN_2k
14,127.644444,254.866453,0.255291,97.628247,KNN_3k
5,135.541025,205.560823,0.174129,98.297527,Media_Movel_15dias
15,126.569444,246.588297,0.234378,99.944531,KNN_4k
