In [182]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from skopt import dummy_minimize, gp_minimize
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, rankdata

In [183]:
df = pd.read_csv('data/vazoes1_CA_20_23.csv').iloc[:8400, ]

In [184]:
df2 = pd.read_excel('data/vazoes_20_23.xlsx')

In [185]:
df2.columns

Index(['Data', 'Vazao_MC', 'Vazao_CA', 'Vazao_Jul'], dtype='object')

In [186]:
df2 = df2.dropna()

In [187]:
df

Unnamed: 0.1,Unnamed: 0,Data,Vazao_CA,Vazao1_CA_1d,Vazao1_CA_7d,Vazao1_CA_15d,Vazao1_CA_30d
0,1,2000-03-01,335,335.0,335.0,335.0,335.0
1,2,2000-03-02,329,329.0,329.0,329.0,329.0
2,3,2000-03-03,278,278.0,278.0,278.0,278.0
3,4,2000-03-04,250,250.0,250.0,250.0,250.0
4,5,2000-03-05,183,183.0,183.0,183.0,183.0
...,...,...,...,...,...,...,...
8395,8396,2023-02-24,183,183.0,,183.0,183.0
8396,8397,2023-02-25,165,165.0,,165.0,165.0
8397,8398,2023-02-26,133,133.0,,133.0,133.0
8398,8399,2023-02-27,106,106.0,,106.0,106.0


In [188]:
df['Data'] = pd.to_datetime(df['Data'])


In [189]:
df

Unnamed: 0.1,Unnamed: 0,Data,Vazao_CA,Vazao1_CA_1d,Vazao1_CA_7d,Vazao1_CA_15d,Vazao1_CA_30d
0,1,2000-03-01,335,335.0,335.0,335.0,335.0
1,2,2000-03-02,329,329.0,329.0,329.0,329.0
2,3,2000-03-03,278,278.0,278.0,278.0,278.0
3,4,2000-03-04,250,250.0,250.0,250.0,250.0
4,5,2000-03-05,183,183.0,183.0,183.0,183.0
...,...,...,...,...,...,...,...
8395,8396,2023-02-24,183,183.0,,183.0,183.0
8396,8397,2023-02-25,165,165.0,,165.0,165.0
8397,8398,2023-02-26,133,133.0,,133.0,133.0
8398,8399,2023-02-27,106,106.0,,106.0,106.0


In [190]:
df['Dia'] = df['Data'].dt.day
df['DiaDaSemana'] = df['Data'].dt.day_name()
df['Mês'] = df['Data'].dt.month
df['Ano'] = df['Data'].dt.year
df['Semana'] = df['Data'].dt.week

  df['Semana'] = df['Data'].dt.week


In [191]:
df

Unnamed: 0.1,Unnamed: 0,Data,Vazao_CA,Vazao1_CA_1d,Vazao1_CA_7d,Vazao1_CA_15d,Vazao1_CA_30d,Dia,DiaDaSemana,Mês,Ano,Semana
0,1,2000-03-01,335,335.0,335.0,335.0,335.0,1,Wednesday,3,2000,9
1,2,2000-03-02,329,329.0,329.0,329.0,329.0,2,Thursday,3,2000,9
2,3,2000-03-03,278,278.0,278.0,278.0,278.0,3,Friday,3,2000,9
3,4,2000-03-04,250,250.0,250.0,250.0,250.0,4,Saturday,3,2000,9
4,5,2000-03-05,183,183.0,183.0,183.0,183.0,5,Sunday,3,2000,9
...,...,...,...,...,...,...,...,...,...,...,...,...
8395,8396,2023-02-24,183,183.0,,183.0,183.0,24,Friday,2,2023,8
8396,8397,2023-02-25,165,165.0,,165.0,165.0,25,Saturday,2,2023,8
8397,8398,2023-02-26,133,133.0,,133.0,133.0,26,Sunday,2,2023,8
8398,8399,2023-02-27,106,106.0,,106.0,106.0,27,Monday,2,2023,9


In [192]:
df2.columns

Index(['Data', 'Vazao_MC', 'Vazao_CA', 'Vazao_Jul'], dtype='object')

In [193]:
df = df.assign(**pd.get_dummies(df[['DiaDaSemana']], prefix='Day'))

In [194]:
df['MC'] = df2['Vazao_MC']

In [195]:
for i in range(1, 8):
    df[f'Dia_Anterior_{i}'] = df['Vazao1_CA_1d'].shift(i)

In [211]:
df['InterpolacaoLinear'] = df['Vazao1_CA_1d'].fillna(df['Vazao1_CA_1d'].interpolate(method='linear'))
df['InterpolacaoSpline_ordem2'] = df['Vazao1_CA_1d'].fillna(df['Vazao1_CA_1d'].interpolate(method='spline', order=2))
df['InterpolacaoSpline_ordem3'] = df['Vazao1_CA_1d'].fillna(df['Vazao1_CA_1d'].interpolate(method='spline', order=3))
df['InterpolacaoPolinomial_ordem2'] = df['Vazao1_CA_1d'].fillna(df['Vazao1_CA_1d'].interpolate(method='polynomial', order=2))
df['InterpolacaoPolinomial_ordem3'] = df['Vazao1_CA_1d'].fillna(df['Vazao1_CA_1d'].interpolate(method='polynomial', order=3))

In [214]:
df_imputed = df.copy()

In [221]:
df_missing = df_imputed[df_imputed['Vazao1_CA_1d'].isnull()]
df_not_missing = df_imputed.dropna(subset=['Vazao1_CA_1d'])

In [222]:
X_train = df_not_missing[['Dia','Mês', 'Ano', 'Semana', 'Day_Friday', 
                             'Dia_Anterior_1', 'Dia_Anterior_2', 
                            'Dia_Anterior_3', 'Dia_Anterior_4', 'Dia_Anterior_5', 'Dia_Anterior_6', 
                            'Dia_Anterior_7', 'InterpolacaoLinear',
                            'InterpolacaoSpline_ordem2', 'InterpolacaoSpline_ordem3', 'InterpolacaoPolinomial_ordem2',
                            'InterpolacaoPolinomial_ordem3']].iloc[7:,]
y_train = df_not_missing['Vazao1_CA_1d'][7:,]
X_test = df_missing[['Dia','Mês', 'Ano', 'Semana', 'Day_Friday', 
                             'Dia_Anterior_1', 'Dia_Anterior_2', 
                            'Dia_Anterior_3', 'Dia_Anterior_4', 'Dia_Anterior_5', 'Dia_Anterior_6', 
                            'Dia_Anterior_7', 'InterpolacaoLinear',
                            'InterpolacaoSpline_ordem2', 'InterpolacaoSpline_ordem3', 'InterpolacaoPolinomial_ordem2',
                            'InterpolacaoPolinomial_ordem3']]
y_test = df_missing['Vazao_CA']
                                    

In [223]:
X_train.fillna(method='ffill', inplace=True)
X_test.fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.fillna(method='ffill', inplace=True)


In [224]:
len(X_test)

404

In [225]:
df_not_missing.isna().sum()

Unnamed: 0                         0
Data                               0
Vazao_CA                           0
Vazao1_CA_1d                       0
Vazao1_CA_7d                     390
Vazao1_CA_15d                    381
Vazao1_CA_30d                    399
Dia                                0
DiaDaSemana                        0
Mês                                0
Ano                                0
Semana                             0
Day_Friday                         0
Day_Monday                         0
Day_Saturday                       0
Day_Sunday                         0
Day_Thursday                       0
Day_Tuesday                        0
Day_Wednesday                      0
MC                                 0
Dia_Anterior_1                   383
Dia_Anterior_2                   389
Dia_Anterior_3                   392
Dia_Anterior_4                   394
Dia_Anterior_5                   392
Dia_Anterior_6                   387
Dia_Anterior_7                   394
I

In [226]:
def training_model_rf(params):
    min_samples_leaf = params[0]
    max_depth = params[1]
    max_features = params[2]
    max_leaf_nodes = params[3]
    display(clear=True)
    model = RandomForestRegressor(min_samples_leaf = min_samples_leaf, max_depth=max_depth, max_features=max_features,
                        max_leaf_nodes=max_leaf_nodes, random_state = 0)
    display(clear=True)
    model.fit(X_train, y_train)
    display(clear=True)
    mse = np.mean(cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1, scoring="neg_mean_absolute_error"))
    
    return mse

def model_trained_rf(params):
    min_samples_leaf = params[0]
    max_depth = params[1]
    max_features = params[2]
    max_leaf_nodes = params[3]
    display(clear=True)
    model = RandomForestRegressor(min_samples_leaf = min_samples_leaf, max_depth=max_depth, max_features=max_features,
                        max_leaf_nodes=max_leaf_nodes, random_state = 0)
    display(clear=True)
    model.fit(X_train, y_train)
    display(clear=True)
    return model
space = [(1, 50), #min_samples_leaf
            (1, 30), #max_depth
            ("sqrt", "log2", None), #max_features
            (2, 100), #max_leaf_nodes

            ]
resultado = dummy_minimize(training_model_rf, space, random_state=1, verbose=0, n_calls=30)
resultado_gp = gp_minimize(training_model_rf, space, random_state=1, verbose=0, n_calls=30)





In [227]:
rf = model_trained_rf(resultado_gp.x)

In [228]:
def percentual_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [229]:
def calcula_metricas(values_true, values_predicted):
    # Calcular métricas
    mae = mean_absolute_error(values_true, values_predicted)
    rmse = np.sqrt(mean_squared_error(values_true, values_predicted))
    correlation, _ = pearsonr(values_true, values_predicted)
    mape = percentual_error(values_true, values_predicted)

    # Exibir resultados
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"Mean Absolute Percentual Error (MAPE): {mape:.4f}")

    return {'mae': mae, 'rmse': rmse, 'corr' : correlation, 'mape' : mape}

In [230]:
predicao = rf.predict(X_test)

In [231]:
calcula_metricas(y_test, predicao)

Mean Absolute Error (MAE): 108.1155
Root Mean Squared Error (RMSE): 179.8163
Pearson Correlation: 0.7366
Mean Absolute Percentual Error (MAPE): 141.2412


{'mae': 108.11548255023895,
 'rmse': 179.81632971384613,
 'corr': 0.736645725690364,
 'mape': 141.24117593337644}

In [232]:
rf.feature_importances_

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.21, 0.21, 0.19, 0.19, 0.2 ])

In [233]:
rf.feature_names_in_

array(['Dia', 'Mês', 'Ano', 'Semana', 'Day_Friday', 'Dia_Anterior_1',
       'Dia_Anterior_2', 'Dia_Anterior_3', 'Dia_Anterior_4',
       'Dia_Anterior_5', 'Dia_Anterior_6', 'Dia_Anterior_7',
       'InterpolacaoLinear', 'InterpolacaoSpline_ordem2',
       'InterpolacaoSpline_ordem3', 'InterpolacaoPolinomial_ordem2',
       'InterpolacaoPolinomial_ordem3'], dtype=object)

In [234]:
predicao = rf.predict(X_train)

In [235]:
calcula_metricas(y_train, predicao)

Mean Absolute Error (MAE): 105.9954
Root Mean Squared Error (RMSE): 179.3868
Pearson Correlation: 0.8114
Mean Absolute Percentual Error (MAPE): 156.9198


{'mae': 105.99537601550925,
 'rmse': 179.38677244808383,
 'corr': 0.8113914599328751,
 'mape': 156.91981989440367}

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Obtaining dependency information for ucimlrepo from https://files.pythonhosted.org/packages/3e/4a/ecc3456479d687202b34ee42317c3a63e09793c9409a720052d38356431a/ucimlrepo-0.0.3-py3-none-any.whl.metadata
  Downloading ucimlrepo-0.0.3-py3-none-any.whl.metadata (5.2 kB)
Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3
Note: you may need to restart the kernel to use updated packages.
