In [1]:
############ LOADING ###############

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import csv
import pycatch22

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore", category=RuntimeWarning)
simplefilter(action='ignore', category=FutureWarning)

# Evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

%matplotlib inline

# variavel que evita NaN nos resultados
epslon = 0.00005

def pbe(y_true, y_pred):
  if np.sum(y_true)!=0:
    return 100*(np.sum(y_pred - y_true)/np.sum(y_true))
  else:
   return 100*(np.sum(y_pred - y_true)/(np.sum(y_true)+ epslon))  

def pocid(y_true, y_pred):
    n = len(y_true)
    D = [1 if (y_pred[i] - y_pred[i-1]) * (y_true[i] - y_true[i-1]) > 0 else 0 for i in range(1, n)]
    POCID = 100 * np.sum(D) / n
    return POCID

#função para normalização
def znorm(x):
  if np.std(x)!=0:
    x_znorm = (x - np.mean(x)) / np.std(x)
  else:
    x_znorm = (x - np.mean(x)) / (np.std(x) + epslon) 
   
  return x_znorm

#função para desnormatização
def znorm_reverse(x, mean_x, std_x):
  x_denormalized = (np.array(x) * std_x) + mean_x
  return x_denormalized

def get_stats_norm(series, horizon, window):
  last_subsequence = series[-(horizon+window):-horizon].values
  last_mean = np.mean(last_subsequence)
  last_std = np.std(last_subsequence)
  return last_mean, last_std

# Em geral, considera-se um tamanho de janela capaz de capturar um ciclo dos dados
# Por exemplo, 12 observações no caso dos dados com frequência mensal
def rolling_window(series, window):
  data = []
  for i in range(len(series)-window):
    example = znorm(np.array(series[i:i+window+1]))
    data.append(example)
  df = pd.DataFrame(data)
  return df

# Para predição de vendas por UF (mensal), será considerado horizon = 12
# Para predição de vendas por município (anual), será considerado horizon = 1
def train_test_split(data, horizon):
  X = data.iloc[:,:-1] # features
  y = data.iloc[:,-1] # target

  X_train = X[:-horizon] # features train
  X_test =  X[-horizon:] # features test

  y_train = y[:-horizon] # target train
  y_test = y[-horizon:] # target test
  return X_train, X_test, y_train, y_test

def recursive_multistep_forecasting(X_test, model, horizon):
  # example é composto pelas últimas observações vistas
  # na prática, é o pbeprimeiro exemplo do conjunto de teste
  example = X_test.iloc[0].values.reshape(1,-1)

  preds = []
  for i in range(horizon):
    pred = model.predict(example)[0]
    preds.append(pred)

    # Descartar o valor da primeira posição do vetor de características
    example = example[:,1:]

    # Adicionar o valor predito na última posição do vetor de características
    example = np.append(example, pred)
    example = example.reshape(1,-1)
  return preds


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:
#VERIFICAÇÃO DOS ESTADOS 

def extract_estado(file_name):
    # Split the file name by underscores
    parts = file_name.split('_')
    # Extract the name between underscores
    estado = parts[1]
    return estado

def read_csv_files(folder_path):
    estados = []
    # List all files in the folder
    files = os.listdir(folder_path)
    # Iterate through each file
    for file_name in files:
        # Check if it's a CSV file
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            # Open the CSV file and read the data
            with open(file_path, 'r', newline='') as csvfile:
                reader = csv.reader(csvfile)
                # Assuming the first row contains headers
                headers = next(reader)
                # Extract estado from file name and append to estados list
                estado = extract_estado(file_name)
                estados.append(estado)
                estados.sort()
    return estados



In [98]:
############## AUTO ##############

horizon = 12
window = 12


products = sorted([name for name in os.listdir('./') if os.path.isdir(os.path.join('./', name))])
 
for product in products:
    folder_path = f'./{product}/'
    # Read the CSV files and extract estado names
    estados = read_csv_files(folder_path)
    
    for estado in estados:


        ########################################
        products = sorted([name for name in os.listdir('./uf/') if os.path.isdir(os.path.join('./uf/', name))])

        folder_path = f'./uf/{product}/'
        # Read the CSV files and extract estado names
        estados = read_csv_files(folder_path)

        # Carregamento do arquivo csv com todas as features
        df_c = pd.read_csv(f'./concatenate/{product}_{window}_{estado}_concatenated.csv')

        df_c_ts = df_c.iloc[:, 0]

        df_features = df_c.drop(df_c.columns[0], axis=1)

        df_t = pd.read_csv(f'./{product}/mensal_{estado}_{product}.csv', header=0, sep=";")

        # Exclude the first 24 rows and get the 'm3' column
        series = df_t['m3']
        target = series.tail(386).reset_index(drop=True)

        df_c_znorm = df_features.apply(znorm, axis=0)

        # # Replace NaN values with 0, if necessary
        df_c_znorm.fillna(0, inplace=True)

        # Criação do dataframe DATAIN para treino dos regressores
        df_c_znorm['y'] = target


        ########################################


        ##################################################################################
        # Divisão dos dados em treino/teste considerando o horizonte de predição de 12 meses
        X_train, X_test, y_train, y_test = train_test_split(df_c_znorm, horizon)

        #######################  lgb  #######################
        num_round = 100

        # Convert data to LightGBM dataset format
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test)

        ############## Regressores ##################
        # regr1 = LinearRegression()
        regr2 = KNeighborsRegressor(n_neighbors = 3)
        regr3 = XGBRegressor()
        regr4 = SVR(kernel='rbf')
        regr5 = RandomForestRegressor()
        # regr6 = MLPRegressor(random_state=1, activation='relu', max_iter=500)
        regr7 = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse', 
                                num_leaves=31, learning_rate=0.05, feature_fraction=0.9, 
                                bagging_fraction=0.8, bagging_freq=5, verbose=-1)

        # Treinamento dos modelos
        # regr1.fit(X_train, y_train)
        # predictions1a = recursive_multistep_forecasting(X_test, regr1, horizon)
        regr2.fit(X_train, y_train)
        predictions2a = recursive_multistep_forecasting(X_test, regr2, horizon)
        regr3.fit(X_train, y_train)
        predictions3a = recursive_multistep_forecasting(X_test, regr3, horizon)
        regr4.fit(X_train, y_train)
        predictions4a = recursive_multistep_forecasting(X_test, regr4, horizon)
        regr5.fit(X_train, y_train)
        predictions5a = recursive_multistep_forecasting(X_test, regr5, horizon)
        # regr6.fit(X_train, y_train)
        # predictions6a = recursive_multistep_forecasting(X_test, regr6, horizon)
        regr7.fit(X_train, y_train)
        predictions7a = recursive_multistep_forecasting(X_test, regr7, horizon)

        ##########################

        valores_reais = series.tail(12)
        valores_reais.reset_index(inplace=True, drop=True)


        # mape_result1 = mape(valores_reais, predictions1a)
        mape_result2 = mape(valores_reais, predictions2a)
        mape_result3 = mape(valores_reais, predictions3a)
        mape_result4 = mape(valores_reais, predictions4a)
        mape_result5 = mape(valores_reais, predictions5a)
        # mape_result6 = mape(valores_reais, predictions6a)
        mape_result7 = mape(valores_reais, predictions7a)

        # pbe_result1 = pbe(valores_reais, predictions1a)
        pbe_result2 = pbe(valores_reais, predictions2a)
        pbe_result3 = pbe(valores_reais, predictions3a)
        pbe_result4 = pbe(valores_reais, predictions4a)
        pbe_result5 = pbe(valores_reais, predictions5a)
        # pbe_result6 = pbe(valores_reais, predictions6a)
        pbe_result7 = pbe(valores_reais, predictions7a)

        # pocid_result1 = pocid(valores_reais, predictions1a)
        pocid_result2 = pocid(valores_reais, predictions2a)
        pocid_result3 = pocid(valores_reais, predictions3a)
        pocid_result4 = pocid(valores_reais, predictions4a)
        pocid_result5 = pocid(valores_reais, predictions5a)
        # pocid_result6 = pocid(valores_reais, predictions6a)
        pocid_result7 = pocid(valores_reais, predictions7a)

        # Define the folder path
        folder_path = f"../00-MODELS_UF_MENSAL/"

        # Create the directory if it doesn't exist
        os.makedirs(folder_path, exist_ok=True)

        # p1 = ', '.join(map(str, predictions1a))
        p2 = ', '.join(map(str, predictions2a))
        p3 = ', '.join(map(str, predictions3a))
        p4 = ', '.join(map(str, predictions4a))
        p5 = ', '.join(map(str, predictions5a))
        # p6 = ', '.join(map(str, predictions6a))
        p7 = ', '.join(map(str, predictions7a))

        
        # with open(os.path.join(folder_path, f'{product}_{estado}_KNeighborsRegressor_{features}_{window}_model.pkl'), 'wb') as fd2:
        #     pickle.dump(regr2, fd2)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_XGBRegressor_{features}_{window}_model.pkl'), 'wb') as fd3:
        #     pickle.dump(regr3, fd3)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_SVR_{features}_{window}_model.pkl'), 'wb') as fd4:
        #     pickle.dump(regr4, fd4)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_RandomForestRegressor_{features}_{window}_model.pkl'), 'wb') as fd5:
        #     pickle.dump(regr5, fd5)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_LGBMRegressor_{features}_{window}_model.pkl'), 'wb') as fd7:
        #     pickle.dump(regr7, fd7)
        
        rows_data = [
            # [product,estado,'LR',fd1,mape_result1,pocid_result1,pbe_result1,p1],
            [product,estado,'kNN',mape_result2,pocid_result2,pbe_result2,p2],
            [product,estado,'XGB',mape_result3,pocid_result3,pbe_result3,p3],
            [product,estado,'SVR',mape_result4,pocid_result4,pbe_result4,p4],
            [product,estado,'RF',mape_result5,pocid_result5,pbe_result5,p5],
            # [product,estado,'MLP',fd6,mape_result6,pocid_result6,pbe_result6,p6],
            [product,estado,'LGB',mape_result7,pocid_result7,pbe_result7,p7]             
        ]
        

        
        # CSV Output VALORES REAIS
        with open(f'Metrics_ALL_{product}_{window}_output.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            for row_data in rows_data:
                writer.writerow(row_data)



In [7]:
df_c_ts = df_c.iloc[:, 0]

df_features = df_c.drop(df_c.columns[0], axis=1)

# Apply znorm function to each row
df_c_znorm = df_features.apply(znorm, axis=1)

test = znorm(df_features)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [8]:
df_features.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '773', '774', '775', '776', '777', '778', '779', '780', '781', '782'],
      dtype='object', length=1118)

In [54]:

horizon = 12
window = 12

product = "gasolinac"

products = sorted([name for name in os.listdir('./') if os.path.isdir(os.path.join('./', name))])

folder_path = f'./{product}/'
# Read the CSV files and extract estado names
estados = read_csv_files(folder_path)


# Carregamento do arquivo csv com todas as features
df_c = pd.read_csv(f'../concatenate/{product}_{window}_{estado}_concatenated.csv')

df_c_ts = df_c.iloc[:, 0]

df_features = df_c.drop(df_c.columns[0], axis=1)

##### NORMALIZAÇÃO #### FALTA NORMALIZA E TBM O TARGET

# df_c_znorm = df_c.apply(znorm, axis=1)

# # Replace NaN values with 0, if necessary
# df_c_znorm.fillna(0, inplace=True)

# Carregamento do arquivo csv com os valores de target (y em m3), atenção nas datas 
df_t = pd.read_csv(f'./{product}/mensal_{estado}_{product}.csv', header=0, sep=";")

# Exclude the first 24 rows and get the 'm3' column
series = df_t['m3']
target = series.tail(386).reset_index(drop=True)

df_c_znorm = df_features.apply(znorm)

# # Replace NaN values with 0, if necessary
df_c_znorm.fillna(0, inplace=True)

df_c_znorm = df_features

# Criação do dataframe DATAIN para treino dos regressores
df_features['y'] = target

# target.name = "y"

# df_features = pd.concat([df_features, target], axis=0) 

#### Tratamento dos dados nana inf etc
# df_features.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop columns with NaN values (which were inf)
# df_features.dropna(axis=1, how='any', inplace=True)

# Fill remaining NaN values with 0
# df_features = df_features.fillna(0)


In [55]:
df_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,y
0,-0.343408,-0.517257,0.946354,4,0.448078,-0.691791,1.000000,3.0,0.074074,0,...,1.747868,1.945910,1.504788,2.197225,2.079442,1.94591,1.791759,0.0,2728.563286,2455.831
1,-0.389269,-0.213282,0.864640,5,0.223144,-0.650864,1.000000,3.0,0.074074,0,...,1.747868,1.747868,1.557113,2.197225,2.079442,1.94591,1.791759,0.0,2752.061857,2110.210
2,-0.763858,-0.763858,0.748839,2,0.673012,-0.873115,1.000000,3.0,0.040000,0,...,1.549826,1.747868,1.748067,2.197225,2.079442,1.94591,1.791759,0.0,2752.061857,2245.507
3,-0.643096,-0.802415,0.740697,4,0.561440,-0.997688,1.000000,3.0,0.074074,2,...,1.747868,1.945910,1.695743,2.197225,2.079442,1.94591,1.791759,0.0,2752.061857,2448.490
4,-0.699752,-0.537048,0.776969,2,0.586707,-1.166294,1.000000,3.0,0.166667,0,...,1.747868,1.945910,1.695743,2.197225,2.079442,1.94591,1.791759,0.0,2773.309857,2261.490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,-0.975266,-1.128533,0.518049,1,0.811641,0.842387,1.000000,4.0,0.013774,0,...,1.747868,1.945910,1.695743,2.197225,2.079442,1.94591,1.791759,0.0,37268.753143,35301.000
382,-0.979418,-1.132763,0.512402,1,0.706991,0.904426,0.909091,5.0,0.005510,0,...,1.549826,1.747868,1.748067,2.197225,2.079442,1.94591,1.791759,0.0,37285.053143,34727.055
383,-0.444519,-0.283814,0.592580,1,0.620687,-1.272459,0.909091,5.0,0.005510,0,...,1.747868,1.945910,1.695743,2.043192,2.079442,1.94591,1.791759,0.0,37285.053143,39848.000
384,0.140660,-0.094939,0.683444,1,0.759316,2.191161,0.909091,5.0,0.111111,0,...,1.747868,1.945910,1.504788,2.043192,2.079442,1.94591,1.791759,0.0,37233.216286,35655.300


In [51]:
product

'queroseneiluminante'

In [46]:
df_features

df_c_znorm = df_features.apply(znorm)

# # Replace NaN values with 0, if necessary
df_c_znorm.fillna(0, inplace=True)

In [47]:
df_c_znorm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,y
0,0.140796,0.208109,-0.318455,-0.303993,-0.310465,-0.056660,-0.345213,-0.337808,-0.312445,-0.114557,...,-0.357314,-0.367580,-0.349894,-0.348103,-0.353962,-0.357200,-0.359525,0.0,-0.308502,1.976037
1,0.558121,-0.449690,1.164291,6.564775,-0.310465,5.442955,0.087424,0.415913,5.668733,-0.114557,...,0.944837,1.592605,-0.349894,-0.348103,-0.353962,-0.357200,-0.359525,0.0,-0.023141,0.861559
2,-0.440647,-1.166621,1.939113,5.419980,-0.310465,3.799580,0.520062,0.415913,4.560142,-0.114557,...,2.422187,2.938483,0.436467,0.291446,0.296853,0.352874,0.459653,0.0,0.119539,-0.252920
3,-0.440647,-1.166621,1.754822,0.840801,-0.156897,3.248689,0.952699,1.169634,4.560142,-0.114557,...,3.878015,2.938483,1.195895,0.905464,0.916717,1.021533,1.217816,0.0,0.119539,0.861559
4,-1.338881,-1.832660,1.527048,0.840801,1.073716,3.294825,1.385337,1.923355,1.980537,8.729261,...,3.878015,2.938483,1.925007,1.490280,1.500426,1.640387,1.899414,0.0,0.262220,0.861559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,0.140796,0.208109,-0.318455,-0.303993,-0.310465,-0.056660,-0.345213,-0.337808,-0.312445,-0.114557,...,-0.357314,-0.367580,-0.349894,-0.348103,-0.353962,-0.357200,-0.359525,0.0,-0.308502,-0.252920
382,0.140796,0.208109,-0.318455,-0.303993,-0.310465,-0.056660,-0.345213,-0.337808,-0.312445,-0.114557,...,-0.357314,-0.367580,-0.349894,-0.348103,-0.353962,-0.357200,-0.359525,0.0,-0.308502,-0.252920
383,0.140796,0.208109,-0.318455,-0.303993,-0.310465,-0.056660,-0.345213,-0.337808,-0.312445,-0.114557,...,-0.357314,-0.367580,-0.349894,-0.348103,-0.353962,-0.357200,-0.359525,0.0,-0.308502,-0.252920
384,0.140796,0.208109,-0.318455,-0.303993,-0.310465,-0.056660,-0.345213,-0.337808,-0.312445,-0.114557,...,-0.357314,-0.367580,-0.349894,-0.348103,-0.353962,-0.357200,-0.359525,0.0,-0.308502,-0.252920


In [95]:

horizon = 12
window = 12

product = "gasolinac"

products = sorted([name for name in os.listdir('./') if os.path.isdir(os.path.join('./', name))])

folder_path = f'./{product}/'
# Read the CSV files and extract estado names
estados = read_csv_files(folder_path)

# Carregamento do arquivo csv com todas as features
df_c = pd.read_csv(f'../concatenate/{product}_{window}_{estado}_concatenated.csv')

df_c_ts = df_c.iloc[:, 0]

df_features = df_c.drop(df_c.columns[0], axis=1)

# Carregamento do arquivo csv com os valores de target (y em m3), atenção nas datas 
df_t = pd.read_csv(f'./{product}/mensal_{estado}_{product}.csv', header=0, sep=";")

# Exclude the first 24 rows and get the 'm3' column
series = df_t['m3']
target = series.tail(386).reset_index(drop=True)

# normalização somente de df_features

df_c_znorm = df_features.apply(znorm, axis=0)

# # Replace NaN values with 0, if necessary
df_c_znorm.fillna(0, inplace=True)

# Criação do dataframe DATAIN para treino dos regressores
df_c_znorm['y'] = target


In [96]:
df_c_znorm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,y
0,-0.246839,-0.589506,0.048393,0.549425,-1.281594,-0.820101,0.467134,-0.611513,-0.225369,-0.445823,...,1.033322,0.585361,-0.197712,1.061748,0.410004,0.153707,0.102329,0.0,-1.364229,2455.831
1,-0.339209,-0.019474,-0.129771,1.096018,-2.403133,-0.783326,0.467134,-0.611513,-0.225369,-0.445823,...,1.033322,-1.135965,0.089040,1.061748,0.410004,0.153707,0.102329,0.0,-1.362098,2110.210
2,-1.093684,-1.051946,-0.382254,-0.543761,-0.160056,-0.983032,0.467134,-0.611513,-0.755352,-0.445823,...,0.319119,-1.135965,1.135513,1.061748,0.410004,0.153707,0.102329,0.0,-1.362098,2245.507
3,-0.850453,-1.124251,-0.400006,0.549425,-0.716361,-1.094968,0.467134,-0.611513,-0.225369,2.243045,...,1.033322,0.585361,0.848761,1.061748,0.410004,0.153707,0.102329,0.0,-1.362098,2448.490
4,-0.964566,-0.626619,-0.320923,-0.543761,-0.590377,-1.246471,0.467134,-0.611513,1.214804,-0.445823,...,1.033322,0.585361,0.848761,1.061748,0.410004,0.153707,0.102329,0.0,-1.360171,2261.490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,-1.519490,-1.735806,-0.885453,-1.090354,0.531161,0.558451,0.467134,0.205248,-1.163266,-0.445823,...,1.033322,0.585361,0.848761,1.061748,0.410004,0.153707,0.102329,0.0,1.767920,35301.000
382,-1.527852,-1.743738,-0.897765,-1.090354,0.009370,0.614196,-1.581887,1.022010,-1.291810,-0.445823,...,0.319119,-1.135965,1.135513,1.061748,0.410004,0.153707,0.102329,0.0,1.769398,34727.055
383,-0.450490,-0.151739,-0.722950,-1.090354,-0.420951,-1.341866,-1.581887,1.022010,-1.291810,-0.445823,...,1.033322,0.585361,0.848761,0.152499,0.410004,0.153707,0.102329,0.0,1.769398,39848.000
384,0.728141,0.202450,-0.524836,-1.090354,0.270266,1.770406,-1.581887,1.022010,0.350700,-0.445823,...,1.033322,0.585361,-0.197712,0.152499,0.410004,0.153707,0.102329,0.0,1.764698,35655.300
