In [1]:
############# Loading ##############

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb

# Evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

epsoln = 1e-9  # Small constant added to a zero denominator (see pbe and znorm) to avoid division by zero

def rmse(y_true, y_pred):
  """Root mean squared error: the square root of the MSE of the predictions."""
  squared_error = mse(y_true, y_pred)
  return np.sqrt(squared_error)

def pbe(y_true, y_pred):
  """Percentage bias error: 100 * sum(y_pred - y_true) / sum(y_true).

  When the observed values sum to exactly zero, epsoln is added to the
  denominator so the division is still defined.
  """
  observed_total = np.sum(y_true)
  bias_total = np.sum(y_pred - y_true)
  if observed_total == 0:
    return 100*(bias_total/(observed_total+epsoln))
  return 100*(bias_total/observed_total)

def pocid(y_true, y_pred):
  """Percentage of correct direction changes (POCID).

  For every pair of consecutive time steps, the prediction scores a hit when
  its change has the same sign as the change in the observed values (a zero
  change on either side is never a hit).

  Parameters:
    y_true: observed values (list, NumPy array or pandas Series).
    y_pred: predicted values; only the first len(y_true) entries are used.

  Returns:
    The share of hits as a percentage in [0, 100], or NaN when fewer than
    two observations are given (no direction change can be measured).

  Raises:
    ValueError: if y_pred has fewer entries than y_true.
  """
  # Work on positional arrays: a pandas Series sliced from a longer one keeps
  # its original labels, so y[i] would be a label lookup and not the i-th item.
  observed = np.asarray(y_true, dtype=float).ravel()
  predicted = np.asarray(y_pred, dtype=float).ravel()

  n = len(observed)
  if len(predicted) < n:
    raise ValueError("y_pred must have at least as many values as y_true")
  if n < 2:
    return float('nan')

  same_direction = (np.diff(predicted[:n]) * np.diff(observed)) > 0
  return 100 * np.sum(same_direction) / (n - 1)

def mcpm(rmse_result, mape_result, pocid_result):
  """Combine RMSE, MAPE and the POCID error (100 - POCID) into one score.

  Each pair of the three measures contributes the term a * b * sin(2*pi/3) / 2,
  and the score is the sum of the three terms.
  """
  direction_error = 100 - pocid_result
  sine_term = np.sin((2*np.pi)/3)

  measure_pairs = (
    (rmse_result, mape_result),
    (mape_result, direction_error),
    (direction_error, rmse_result),
  )
  return sum((first * second * sine_term)/2 for first, second in measure_pairs)

def znorm(x):
  """Z-normalise x: subtract its mean and divide by its standard deviation.

  A standard deviation of exactly zero is replaced by epsoln so the division
  is defined.
  """
  centre = np.mean(x)
  spread = np.std(x)
  # In the fallback case spread is 0, so the divisor is epsoln alone
  divisor = spread if spread != 0 else spread + epsoln
  return (x - centre) / divisor

def znorm_reverse(x, mean_x, std_x):
  """Undo a z-normalisation: return x * std_x + mean_x as a NumPy array."""
  normalized = np.array(x)
  return (normalized * std_x) + mean_x

def get_stats_norm(series, horizon, window):
  """Mean and standard deviation of the `window` values preceding the last `horizon` values.

  Returns a (mean, std) tuple computed with np.mean and np.std over
  series[-(horizon+window):-horizon].
  """
  start = -(horizon+window)
  stop = -horizon
  reference = series[start:stop].values
  return np.mean(reference), np.std(reference)

# The window size is usually chosen to cover one full cycle of the data,
# for example 12 observations for data with monthly frequency.
def rolling_window(series, window):
  """Build a table of z-normalised sliding subsequences of length window + 1.

  Row i holds znorm(series[i:i+window+1]); there are len(series) - window rows.
  """
  n_rows = len(series)-window
  rows = [znorm(np.array(series[start:start+window+1])) for start in range(n_rows)]
  return pd.DataFrame(rows)

def rolling_window_12(series, window):
  """Build a table of z-normalised sliding subsequences of length window + 12.

  Row i holds znorm(series[i:i+window+12]). Only start positions whose
  subsequence fits entirely inside the series are used, so every row has
  exactly window + 12 values and the result equals
  rolling_window(series, window + 11).

  Parameters:
    series: sequence of observations (list, array or pandas Series).
    window: number of leading values per row; 12 more values follow them.

  Returns:
    A DataFrame with len(series) - window - 11 rows (empty if the series is
    shorter than window + 12).
  """
  length = window + 12
  data = []
  # Stop at the last start position that still yields a full subsequence;
  # going further would produce truncated rows normalised over fewer points.
  for i in range(len(series) - length + 1):
    example = znorm(np.array(series[i:i+length]))
    data.append(example)
  df = pd.DataFrame(data)
  return df


# For sales forecasting by state (monthly data), horizon = 12 is used.
# For sales forecasting by municipality (yearly data), horizon = 1 is used.
def train_test_split(data, horizon):
  """Split an attribute-value table into train/test features and targets.

  The last column is the target and all other columns are features; the last
  `horizon` rows form the test set and the rows before them the training set.

  Returns X_train, X_test, y_train, y_test.
  """
  features, target = data.iloc[:,:-1], data.iloc[:,-1]

  train_rows = slice(None, -horizon)
  test_rows = slice(-horizon, None)

  return features[train_rows], features[test_rows], target[train_rows], target[test_rows]

def recursive_multistep_forecasting(X_test, model, horizon):
  """Forecast `horizon` steps ahead by feeding each prediction back as a feature.

  The first row of X_test (the latest observations seen) seeds the feature
  vector. After every prediction the oldest feature is dropped and the
  prediction is appended at the end. Returns the list of predictions.
  """
  features = X_test.iloc[0].values.reshape(1,-1)

  forecasts = []
  for _ in range(horizon):
    next_value = model.predict(features)[0]
    forecasts.append(next_value)

    # Slide the window: discard the first feature, put the prediction last
    features = np.append(features[:,1:], next_value).reshape(1,-1)
  return forecasts

def targeted_forecasting(X_test, model):
  """Predict a single value from the first test example.

  Parameters:
    X_test: DataFrame of test features; only its first row (the latest
      observations seen) is used.
    model: fitted estimator exposing predict() on a (1, n_features) array.

  Returns:
    A one-element list holding the model's prediction for that row.
  """
  # Only one prediction is made, so there is no feature window to roll forward.
  example = X_test.iloc[0].values.reshape(1,-1)
  return [model.predict(example)[0]]

def baseline_mean(series, horizon):
  """Mean baseline: repeat the mean of the z-normalised history `horizon` times.

  The history is every value except the last `horizon`; since it is
  z-normalised, the forecast is a flat line close to zero.
  """
  history = znorm(series[:-horizon])
  return np.repeat(np.mean(history), horizon)

def baseline_persistent(series, horizon):
  """Persistence baseline: repeat the last value of the z-normalised
  subsequence series[-2*horizon:-horizon] `horizon` times."""
  previous_block = znorm(series[-2*horizon:-horizon])
  last_value = previous_block.values[-1]
  return np.repeat(last_value, horizon)

def baseline_persistent_window(series, horizon):
  """Window persistence baseline: return the z-normalised values of
  series[-horizon*2:-horizon] as the forecast."""
  return znorm(series[-horizon*2:-horizon]).values


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [7]:
######### Lista Estados ##############

import os
import csv

def extract_estado(file_name):
    """Return the second underscore-separated field of a file name.

    For names shaped like 'mensal_<estado>_<product>.csv' this is the state.
    """
    _, estado, *_ = file_name.split('_')
    return estado

def read_csv_files(folder_path):
    """List the state names encoded in the CSV file names of a folder.

    Only the file names are inspected; the files themselves are not opened,
    so empty or unreadable CSV files do not affect the result.

    Parameters:
        folder_path: directory to scan (not recursive).

    Returns:
        A sorted list with one entry per '*.csv' file, each obtained by
        passing the file name to extract_estado.
    """
    estados = [
        extract_estado(file_name)
        for file_name in os.listdir(folder_path)
        if file_name.endswith('.csv')
    ]
    # Sort once at the end instead of after every append
    estados.sort()
    return estados

# Specify the folder path where CSV files are located


In [8]:
######### Regressors #######################

# Every sub-folder of ./uf/ is treated as one product; they are processed in sorted order.
products = sorted([name for name in os.listdir('./uf/') if os.path.isdir(os.path.join('./uf/', name))])

# Parameters
# - horizon: forecast horizon (number of steps predicted per series), default 12
# - window: number of lagged observations used as features, default 12
horizon = 12
window = 12

for product in products:
    folder_path = f'./uf/{product}/'
    # State names come from the CSV file names found in the product folder
    estados = read_csv_files(folder_path)

    for estado in estados:

        # Load the series for this state/product pair
        df = pd.read_csv(f"./uf/{product}/mensal_{estado}_{product}.csv", header=0, sep=";")

        series = df["m3"]

        # Attribute-value table: each row is a z-normalised subsequence of
        # window + horizon consecutive observations.
        data = rolling_window(series, window + horizon - 1)

        # One dataset per forecast step: the first `window` columns are the
        # features and the column `step` positions after them is the target.
        datasets = [
            data.iloc[:, list(range(window)) + [window + step]]
            for step in range(horizon)
        ]

        # Mean and standard deviation used to bring predictions back to the
        # original scale. They depend only on the series, so they are computed
        # once per series rather than once per forecast step.
        mean_norm, std_norm = get_stats_norm(series, horizon, window + horizon)

        # Space-separated predictions per model, one entry per forecast step.
        # Insertion order is the order of the rows written to the CSV.
        # (LinearRegression and MLPRegressor are imported but not used in this run.)
        outputs = {'kNN': "", 'XGB': "", 'SVR': "", 'RF': "", 'LGB': ""}

        for u in range(horizon):

            # Train/test split; the last `horizon` rows are held out
            X_train, X_test, y_train, y_test = train_test_split(datasets[u], horizon)

            # Fresh, unfitted models for this forecast step
            regressors = {
                'kNN': KNeighborsRegressor(n_neighbors = 3),
                'XGB': XGBRegressor(),
                'SVR': SVR(kernel='rbf'),
                'RF': RandomForestRegressor(),
                'LGB': lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse',
                                         num_leaves=31, learning_rate=0.05, feature_fraction=0.9,
                                         bagging_fraction=0.8, bagging_freq=5, verbose=-1),
            }

            for label, regr in regressors.items():
                # Fit, predict the target of the first test row, then rescale
                regr.fit(X_train, y_train)
                prediction_norm = targeted_forecasting(X_test, regr)
                prediction = znorm_reverse(prediction_norm, mean_norm, std_norm)
                outputs[label] += str(prediction) + " "

        rows_data = [[product, estado, label, text] for label, text in outputs.items()]

        # CSV output with the rescaled predictions; rows are appended so the
        # file accumulates results across states and products.
        with open(f'NEWStrategy_RawData_{window}_output.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerows(rows_data)



KeyboardInterrupt: 