In [2]:
############ LIBRARIES (updated) ###############

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import csv

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore", category=RuntimeWarning)
simplefilter(action='ignore', category=FutureWarning)

# Evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

%matplotlib inline

# Small constant added to a zero denominator (see pbe and znorm) so results do not become NaN
epslon = 0.00005


def pbe(y_true, y_pred):
  """Return the summed prediction error as a percentage of the summed true values.

  Computes 100 * sum(y_pred - y_true) / sum(y_true). When the sum of
  y_true is exactly zero, the module-level constant `epslon` is added to the
  denominator.

  Parameters:
    y_true: observed values (must support `y_pred - y_true` and np.sum).
    y_pred: predicted values.

  Returns:
    The percentage as a float.
  """
  total_true = np.sum(y_true)
  total_error = np.sum(y_pred - y_true)
  # Only a zero denominator is shifted; any other total is used unchanged.
  denominator = total_true if total_true != 0 else total_true + epslon
  return 100*(total_error/denominator)

def pocid(y_true, y_pred):
  """Return the percentage of consecutive steps whose direction was predicted correctly.

  For every pair of consecutive positions (i-1, i) the change in y_pred is
  multiplied by the change in y_true; the step counts as a hit when that
  product is strictly positive. The result is 100 * hits / (n - 1), where n is
  len(y_true).

  Parameters:
    y_true: observed values (list, numpy array or pandas Series).
    y_pred: predicted values, at least as long as y_true; only the first
      len(y_true) entries are used.

  Returns:
    The percentage as a float, or NaN when fewer than two observations are
    given (no consecutive pair exists).

  Raises:
    ValueError: if y_pred is shorter than y_true.
  """
  # Work on positional arrays: indexing a pandas Series with y[i] is label-based,
  # so a Series whose index does not start at 0 (e.g. a test split) would raise
  # KeyError or compare the wrong rows.
  true_values = np.asarray(y_true, dtype=float)
  pred_values = np.asarray(y_pred, dtype=float)

  n = len(true_values)
  if len(pred_values) < n:
    raise ValueError("y_pred must contain at least as many values as y_true")
  if n < 2:
    # Direction of change is undefined without at least one consecutive pair.
    return np.nan

  same_direction = np.diff(pred_values[:n]) * np.diff(true_values) > 0
  return 100 * np.sum(same_direction) / (n-1)

# Normalisation helper
def znorm(x):
  """Z-normalise x: subtract its mean and divide by its standard deviation.

  When the standard deviation is exactly zero, the module-level constant
  `epslon` is added to the divisor.

  Parameters:
    x: array-like supporting `x - scalar` (e.g. a numpy array).

  Returns:
    The normalised values, same shape as x.
  """
  centre = np.mean(x)
  spread = np.std(x)
  if spread == 0:
    # Constant input: shift the divisor away from zero.
    spread = spread + epslon
  return (x - centre) / spread

# Denormalisation helper
def znorm_reverse(x, mean_x, std_x):
  """Map values back to the original scale: x * std_x + mean_x.

  Parameters:
    x: array-like of values; converted with np.array.
    mean_x: value added after scaling.
    std_x: factor the values are multiplied by.

  Returns:
    A numpy array with the rescaled values.
  """
  rescaled = np.array(x) * std_x
  return rescaled + mean_x

def get_stats_norm(series, horizon, window):
  """Return mean and standard deviation of the `window` values preceding the last `horizon` values.

  Parameters:
    series: object supporting slicing and exposing `.values` (e.g. a pandas Series).
    horizon: number of trailing observations excluded from the statistics.
    window: number of observations, immediately before the horizon, that are used.

  Returns:
    Tuple (mean, std) of the selected sub-sequence.
  """
  start, stop = -(horizon + window), -horizon
  reference = series[start:stop].values
  return np.mean(reference), np.std(reference)

# As a rule of thumb, choose a window long enough to capture one full cycle of the data,
# e.g. 12 observations for monthly data.
def rolling_window(series, window):
  """Build a table of z-normalised sliding windows over `series`.

  Each row holds `window + 1` consecutive observations (starting at positions
  0, 1, ..., len(series) - window - 1), normalised independently with `znorm`.

  Parameters:
    series: sliceable sequence of observations.
    window: number of observations per window, not counting the extra last one.

  Returns:
    pandas DataFrame with one normalised window per row.
  """
  n_examples = len(series) - window
  rows = [
    znorm(np.array(series[start:start+window+1]))
    for start in range(n_examples)
  ]
  return pd.DataFrame(rows)

# For sales forecasting by state (monthly data), horizon = 12 is used.
# For sales forecasting by municipality (yearly data), horizon = 1 is used.
def train_test_split(data, horizon):
  """Split a windowed table into train/test features and targets.

  The last column of `data` is the target and all other columns are features.
  The final `horizon` rows form the test set; everything before them is the
  training set.

  Parameters:
    data: pandas DataFrame with the target in its last column.
    horizon: number of trailing rows reserved for testing.

  Returns:
    Tuple (X_train, X_test, y_train, y_test).
  """
  feature_cols, target_col = data.iloc[:,:-1], data.iloc[:,-1]

  train_rows = slice(None, -horizon)
  test_rows = slice(-horizon, None)

  return (
    feature_cols[train_rows],
    feature_cols[test_rows],
    target_col[train_rows],
    target_col[test_rows],
  )

def recursive_multistep_forecasting(X_test, model, horizon):
  """Forecast `horizon` steps ahead by feeding each prediction back as an input.

  The first row of `X_test` (the most recent observations seen) seeds the
  feature vector. After every prediction the oldest feature is dropped and the
  prediction is appended at the end.

  Parameters:
    X_test: pandas DataFrame; only its first row is used.
    model: object with a `predict` method that accepts a (1, n_features) array
      and returns an indexable result whose first element is the prediction.
    horizon: number of steps to forecast.

  Returns:
    List with one prediction per step.
  """
  # Seed with the first test example, shaped as a single-row 2-D array.
  current = X_test.iloc[0].values.reshape(1,-1)

  forecasts = []
  for _ in range(horizon):
    step_pred = model.predict(current)[0]
    forecasts.append(step_pred)

    # Slide the window: discard the first feature, place the prediction last.
    slid = np.append(current[:,1:], step_pred)
    current = slid.reshape(1,-1)
  return forecasts


#### TsFeature #########
import pickle
import os
import pandas as pd
import tsfeatures

def rolling_window_real(series, window):
  """Build a table of raw (non-normalised) sliding windows over `series`.

  Each row holds `window + 1` consecutive observations, starting at positions
  0, 1, ..., len(series) - window - 1. No normalisation is applied.

  Parameters:
    series: sliceable sequence of observations.
    window: number of observations per window, not counting the extra last one.

  Returns:
    pandas DataFrame with one window per row.
  """
  n_examples = len(series) - window
  rows = [np.array(series[start:start+window+1]) for start in range(n_examples)]
  return pd.DataFrame(rows)

def rolling_window_TsFeatures(out_window, window, product, estado, freqF):
  """Compute tsfeatures on consecutive blocks of `window` rows of `out_window`.

  For each start position 0 .. len(out_window) - window - 1, the rows
  [start, start + window) are taken, a constant 'unique_id' column ('D1') is
  inserted as the first column, and `tsfeatures.tsfeatures` is called with
  freq=freqF. The 'unique_id' column is dropped from the result before its
  values are stored.

  Parameters:
    out_window: pandas DataFrame of windows (row-sliceable).
    window: number of rows per block.
    product: accepted but not used by this function.
    estado: accepted but not used by this function.
    freqF: value forwarded as `freq` to tsfeatures.

  Returns:
    List with one array of feature values per block.
  """
  n_blocks = len(out_window) - window
  feature_rows = []
  for start in range(n_blocks):
    block = out_window[start:window+start]
    block.insert(0, 'unique_id', 'D1')
    block_features = tsfeatures.tsfeatures(block, freq=freqF)
    feature_rows.append(block_features.drop(columns=['unique_id']).values)
  return feature_rows




Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
#### LEITURA DOS DADOS ####

def extract_estado(file_name):
    """Return the second underscore-separated field of `file_name`.

    Example: 'mensal_sp_gasolina.csv' -> 'sp'.

    Raises:
        IndexError: if `file_name` contains no underscore.
    """
    fields = file_name.split('_')
    return fields[1]

def read_csv_files(folder_path):
    """Return the sorted estado names encoded in the CSV file names of a folder.

    Only the file names are inspected: every entry of `folder_path` ending in
    '.csv' is passed to `extract_estado`. The files themselves are not opened,
    because their contents were never used here; reading the header of each
    file only added I/O and made an empty CSV abort the whole listing.

    Parameters:
        folder_path: directory to scan.

    Returns:
        Sorted list of estado names, one per CSV file found.
    """
    estados = [
        extract_estado(file_name)
        for file_name in os.listdir(folder_path)
        if file_name.endswith('.csv')
    ]
    # Sort once after collecting instead of re-sorting on every iteration.
    estados.sort()
    return estados



In [6]:
############## TsFeatures extraction ##############
# For every product folder and every estado CSV inside it, compute TsFeatures over
# sliding windows of the 'm3' series and write them to
# ../<product>/FEAT_TsFeatures_<product>_<window>_<estado>.csv

import pickle

horizon = 12
window = 12
features = 'TsFeatures'

# Every sub-directory of the working directory is treated as one product.
products = sorted([name for name in os.listdir('./') if os.path.isdir(os.path.join('./', name))])

for product in products:
    folder_path = f'./{product}/'
    # Estado names are taken from the CSV file names of this product folder.
    estados = read_csv_files(folder_path)

    for estado in estados:

        # Load the series for this product/estado pair.
        df = pd.read_csv(f"./{product}/mensal_{estado}_{product}.csv", header=0, sep=";")
        series = df['m3']

        #### TsFeatures ####
        # Raw sliding windows; the last column of each window is renamed to 'y'.
        out_window = rolling_window_real(df['m3'], window)
        out_window.rename(columns={window: 'y'}, inplace=True)

        freqF = 12
        test = rolling_window_TsFeatures(out_window, window, product, estado, freqF)

        if len(test) == 0:
            # Series too short to produce a single feature row: nothing to save.
            print(f'Skipping {product}/{estado}: series too short for window={window}')
            continue

        df_TsFeatures = pd.DataFrame(np.concatenate(test, axis=0), columns=[f'F_{i}' for i in range(len(test[0][0]))])
        df_TsFeatures.fillna(0, inplace=True)

        # Align timestamps with the feature rows using the actual number of rows
        # instead of a hard-coded count, so series of any length stay aligned.
        TimeStamp = df['timestamp'].tail(len(df_TsFeatures)).reset_index(drop=True)
        df_TsFeatures.insert(0, 'timestamp', TimeStamp)

        folder_name = f'../{product}/'
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(folder_name, exist_ok=True)

        df_TsFeatures.to_csv(os.path.join(folder_name, f'FEAT_TsFeatures_{product}_{window}_{estado}.csv'), index=False)


In [5]:
df_TsFeatures

Unnamed: 0,timestamp,F_0,F_1,F_2,F_3,F_4,F_5,F_6,F_7,F_8,...,F_32,F_33,F_34,F_35,F_36,F_37,F_38,F_39,F_40,F_41
0,199101,0.604026,12.0,-12.508192,0.159232,0.0,0.0,0.0,0.0,1.0,...,0.716126,8.0,0.0,-0.155531,0.346747,-0.443867,0.323561,-0.523748,0.356350,0.0
1,199102,0.697257,12.0,-11.759069,0.095911,0.0,0.0,0.0,0.0,1.0,...,0.843514,7.0,0.0,-0.138626,0.210951,-0.528934,0.412537,-0.729817,0.710743,0.0
2,199103,0.610302,12.0,-11.928295,0.092473,0.0,0.0,0.0,0.0,1.0,...,0.849353,8.0,0.0,-0.166882,0.226713,-0.505227,0.371309,-0.669924,0.562486,0.0
3,199104,0.546503,12.0,-11.706256,0.113413,0.0,0.0,0.0,0.0,1.0,...,0.847315,7.0,0.0,-0.152788,0.217411,-0.494595,0.354822,-0.673635,0.579243,0.0
4,199105,0.501640,12.0,-11.598055,0.105073,0.0,0.0,0.0,0.0,1.0,...,0.845618,7.0,0.0,-0.149892,0.211750,-0.496325,0.332862,-0.689265,0.612640,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381,202210,0.798389,12.0,4.695097,0.402899,0.0,0.0,0.0,0.0,1.0,...,0.707931,3.0,0.0,0.495978,0.446219,0.239048,0.244217,-0.491632,0.653921,0.0
382,202211,0.849572,12.0,2.647516,0.419850,0.0,0.0,0.0,0.0,1.0,...,0.664692,3.0,0.0,0.632926,0.742461,0.379064,0.332714,-0.302451,0.481907,0.0
383,202212,0.862291,12.0,1.060130,0.429285,0.0,0.0,0.0,0.0,1.0,...,0.513927,3.0,0.0,0.750082,1.151471,0.368103,0.307241,-0.025157,0.342010,0.0
384,202301,0.931176,12.0,4.374991,0.473306,0.0,0.0,0.0,0.0,1.0,...,0.703866,1.0,0.0,0.562505,0.907381,0.047750,0.158611,-0.141795,0.161131,0.0
