In [1]:
############ LIBRARIES (updated) ###############

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import csv
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features
from tsfresh import select_features


# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore", category=RuntimeWarning)
simplefilter(action='ignore', category=FutureWarning)

# Evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

%matplotlib inline

# Small offset added to a zero denominator in pbe() and znorm() to avoid division by zero.
epslon = 0.00005

def pbe(y_true, y_pred):
  """Percentage bias error: 100 * sum(y_pred - y_true) / sum(y_true).

  Args:
    y_true: observed values (list, numpy array or pandas Series).
    y_pred: predicted values, same length as ``y_true``.

  Returns:
    The bias as a percentage of the total observed value. When the observed
    values sum to zero, ``epslon`` is added to the denominator so the
    division is still defined.
  """
  # Coerce to float arrays: plain lists do not support `-`, and two pandas
  # Series would be aligned by index label instead of being paired by position.
  actual = np.asarray(y_true, dtype=float)
  predicted = np.asarray(y_pred, dtype=float)

  total_actual = np.sum(actual)
  bias = np.sum(predicted - actual)

  if total_actual != 0:
    return 100 * (bias / total_actual)
  return 100 * (bias / (total_actual + epslon))

def pocid(y_true, y_pred):
  """Prediction Of Change In Direction, as a percentage.

  Counts the consecutive steps where the predicted change and the observed
  change have the same sign (their product is strictly positive) and divides
  by the number of steps, ``len(y_true) - 1``.

  Args:
    y_true: observed values (list, numpy array or pandas Series).
    y_pred: predicted values; only the first ``len(y_true)`` are used.

  Returns:
    The percentage of steps whose direction was predicted correctly, or NaN
    when fewer than two observations are given (no step to evaluate).
  """
  # Work on positional arrays: indexing a pandas Series with y_true[i] is a
  # label lookup and fails when the index does not start at 0.
  actual = np.asarray(y_true, dtype=float)
  n = len(actual)
  if n < 2:
    return np.nan
  predicted = np.asarray(y_pred, dtype=float)[:n]

  same_direction = np.diff(predicted) * np.diff(actual) > 0
  return 100 * np.sum(same_direction) / (n - 1)

def znorm(x):
  """Z-normalise ``x``: subtract its mean and divide by its standard deviation.

  When the standard deviation is exactly zero, ``epslon`` is added to the
  denominator so the division is still defined.
  """
  mu = np.mean(x)
  sigma = np.std(x)
  denominator = sigma if sigma != 0 else sigma + epslon
  return (x - mu) / denominator

def znorm_reverse(x, mean_x, std_x):
  """Undo a z-normalisation: return ``x * std_x + mean_x`` as a numpy array."""
  values = np.array(x)
  return values * std_x + mean_x

def get_stats_norm(series, horizon, window):
  """Mean and standard deviation of the ``window`` values that precede the
  last ``horizon`` values of ``series``.

  ``series`` must expose ``.values`` on its slices (e.g. a pandas Series).

  Returns:
    A ``(mean, std)`` tuple.
  """
  start = -(horizon + window)
  reference = series[start:-horizon].values
  return np.mean(reference), np.std(reference)

# In general, the window size is chosen so that it captures one cycle of the data,
# for example 12 observations for monthly data.
def rolling_window(series, window):
  """Build a table of z-normalised sliding windows over ``series``.

  Each row holds ``window + 1`` consecutive values, normalised on their own
  with ``znorm``; there are ``len(series) - window`` rows.

  Returns:
    A pandas DataFrame with one row per window.
  """
  rows = [
      znorm(np.array(series[start:start + window + 1]))
      for start in range(len(series) - window)
  ]
  return pd.DataFrame(rows)

def rolling_window_real(series, window):
  """Build a table of sliding windows over ``series`` with the raw values.

  Each row holds ``window + 1`` consecutive values, with no normalisation
  applied; there are ``len(series) - window`` rows.

  Returns:
    A pandas DataFrame with one row per window.
  """
  rows = [
      np.array(series[start:start + window + 1])
      for start in range(len(series) - window)
  ]
  return pd.DataFrame(rows)

# For sales forecasting by state (monthly), horizon = 12 is used.
# For sales forecasting by municipality (yearly), horizon = 1 is used.
def train_test_split(data, horizon):
  """Split a windowed table into train/test features and targets.

  The last column of ``data`` is the target and all other columns are the
  features. The final ``horizon`` rows form the test set; the rows before
  them form the training set.

  Returns:
    ``(X_train, X_test, y_train, y_test)``.
  """
  features, target = data.iloc[:, :-1], data.iloc[:, -1]
  cut = -horizon
  return features[:cut], features[cut:], target[:cut], target[cut:]

def recursive_multistep_forecasting(X_test, model, horizon):
  """Forecast ``horizon`` steps ahead by feeding predictions back as inputs.

  Starts from the first row of ``X_test`` (the last observations seen) and,
  at every step, predicts one value with ``model.predict``, drops the oldest
  value of the feature vector and appends the prediction at the end.

  Returns:
    A list with the ``horizon`` predicted values, in order.
  """
  current = X_test.iloc[0].values.reshape(1, -1)

  forecasts = []
  for _ in range(horizon):
    step_pred = model.predict(current)[0]
    forecasts.append(step_pred)
    # Slide the window: discard the first feature, append the new prediction
    current = np.append(current[:, 1:], step_pred).reshape(1, -1)
  return forecasts


def rolling_window_TsFresh(series, window):
    """Extract tsfresh features for every sliding window of ``series``.

    For each start position ``i`` in ``range(len(series) - window)`` the slice
    ``series[i:i+window]`` is turned into a DataFrame, tagged with the constant
    id ``'D1'`` and passed to ``extract_features``.

    Note that the window starting at ``len(series) - window`` is not produced,
    presumably so that every row can be paired with the observation that
    follows its window (same row count as ``rolling_window``) - confirm.

    Args:
        series: the time series (e.g. a pandas Series).
        window: number of consecutive observations per window.

    Returns:
        A numpy array with one row of feature values per window, or ``None``
        when ``series`` has no more than ``window`` observations.
    """
    feature_rows = []
    for start in range(len(series) - window):
        window_frame = pd.DataFrame(series[start:start + window])
        # tsfresh needs an id column; every window is a single series 'D1'
        window_frame.insert(0, 'id', 'D1')

        extracted = extract_features(window_frame, column_id="id")
        feature_rows.append(extracted.values)

    if not feature_rows:
        return None
    # Concatenate once at the end instead of re-copying the growing array
    # on every iteration.
    return np.concatenate(feature_rows, axis=0)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
#### LEITURA DOS DADOS ####

def extract_estado(file_name):
    """Return the second underscore-separated token of ``file_name``.

    For a name such as ``mensal_<estado>_<product>.csv`` this is ``<estado>``.
    """
    return file_name.split('_')[1]

def read_csv_files(folder_path):
    """List the estado codes of the CSV files found in ``folder_path``.

    Only file names are inspected: every entry ending in ``.csv`` contributes
    the token returned by ``extract_estado``. The files themselves are not
    opened, so an empty CSV no longer aborts the listing.

    Args:
        folder_path: directory to scan (not recursive).

    Returns:
        The estado codes, sorted in ascending order.
    """
    estados = [
        extract_estado(file_name)
        for file_name in os.listdir(folder_path)
        if file_name.endswith('.csv')
    ]
    # Sort once, after all names have been collected
    estados.sort()
    return estados



In [3]:
############## TsFresh AUTO ##############
# For every product folder and every state CSV inside it, extract tsfresh
# features over sliding windows of the 'm3' series and save them to
# ../SALVAS/<product>/FEAT_TsFresh_<product>_<window>_<estado>.csv

import pickle

horizon = 12
window = 12
features = 'TsFresh'

# Every sub-directory of the working directory is treated as one product
products = sorted([name for name in os.listdir('./') if os.path.isdir(os.path.join('./', name))])

for product in products:
    folder_path = f'./{product}/'
    # State codes come from the CSV file names found in the product folder
    estados = read_csv_files(folder_path)

    for estado in estados:

        # Load the monthly file for this product/state
        df = pd.read_csv(f"./{product}/mensal_{estado}_{product}.csv", header=0, sep=";")
        series = df['m3']

        # Feature row i is built from series[i:i+window], so it is paired with
        # the timestamp at position i + window. Slicing from `window` yields
        # exactly len(series) - window timestamps for any series length; the
        # previous hard-coded tail(398) was only right for 398 + window rows.
        TimeStamp = df['timestamp'].iloc[window:].reset_index(drop=True)

        df_feat = rolling_window_TsFresh(series, window)
        df_feat2 = pd.DataFrame(df_feat)
        # Features that tsfresh could not compute come back as NaN; store 0 instead
        df_feat2 = df_feat2.fillna(0)

        df_feat2.insert(0, 'timestamp', TimeStamp)

        folder_name = f'../SALVAS/{product}/'
        os.makedirs(folder_name, exist_ok=True)

        df_feat2.to_csv(f'{folder_name}/FEAT_TsFresh_{product}_{window}_{estado}.csv', index=False)

        # ##################################################################################
        # # Divisão dos dados em treino/teste considerando o horizonte de predição de 12 meses
        # X_train, X_test, y_train, y_test = train_test_split(Catch22_filled, horizon)

        # #######################  lgb  #######################
        # num_round = 100

        # # Convert data to LightGBM dataset format
        # train_data = lgb.Dataset(X_train, label=y_train)
        # test_data = lgb.Dataset(X_test, label=y_test)

        # ############## Regressores ##################
        # # regr1 = LinearRegression()
        # regr2 = KNeighborsRegressor(n_neighbors = 3)
        # regr3 = XGBRegressor()
        # regr4 = SVR(kernel='rbf')
        # regr5 = RandomForestRegressor()
        # # regr6 = MLPRegressor(random_state=1, activation='relu', max_iter=500)
        # regr7 = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse', 
        #                         num_leaves=31, learning_rate=0.05, feature_fraction=0.9, 
        #                         bagging_fraction=0.8, bagging_freq=5, verbose=-1)

        # # Treinamento dos modelos
        # # regr1.fit(X_train, y_train)
        # # predictions1a = recursive_multistep_forecasting(X_test, regr1, horizon)
        # regr2.fit(X_train, y_train)
        # predictions2a = recursive_multistep_forecasting(X_test, regr2, horizon)
        # regr3.fit(X_train, y_train)
        # predictions3a = recursive_multistep_forecasting(X_test, regr3, horizon)
        # regr4.fit(X_train, y_train)
        # predictions4a = recursive_multistep_forecasting(X_test, regr4, horizon)
        # regr5.fit(X_train, y_train)
        # predictions5a = recursive_multistep_forecasting(X_test, regr5, horizon)
        # # regr6.fit(X_train, y_train)
        # # predictions6a = recursive_multistep_forecasting(X_test, regr6, horizon)
        # regr7.fit(X_train, y_train)
        # predictions7a = recursive_multistep_forecasting(X_test, regr7, horizon)

        # ##########################

        # valores_reais = series.tail(12)
        # valores_reais.reset_index(inplace=True, drop=True)


        # # mape_result1 = mape(valores_reais, predictions1a)
        # mape_result2 = mape(valores_reais, predictions2a)
        # mape_result3 = mape(valores_reais, predictions3a)
        # mape_result4 = mape(valores_reais, predictions4a)
        # mape_result5 = mape(valores_reais, predictions5a)
        # # mape_result6 = mape(valores_reais, predictions6a)
        # mape_result7 = mape(valores_reais, predictions7a)

        # # pbe_result1 = pbe(valores_reais, predictions1a)
        # pbe_result2 = pbe(valores_reais, predictions2a)
        # pbe_result3 = pbe(valores_reais, predictions3a)
        # pbe_result4 = pbe(valores_reais, predictions4a)
        # pbe_result5 = pbe(valores_reais, predictions5a)
        # # pbe_result6 = pbe(valores_reais, predictions6a)
        # pbe_result7 = pbe(valores_reais, predictions7a)

        # # pocid_result1 = pocid(valores_reais, predictions1a)
        # pocid_result2 = pocid(valores_reais, predictions2a)
        # pocid_result3 = pocid(valores_reais, predictions3a)
        # pocid_result4 = pocid(valores_reais, predictions4a)
        # pocid_result5 = pocid(valores_reais, predictions5a)
        # # pocid_result6 = pocid(valores_reais, predictions6a)
        # pocid_result7 = pocid(valores_reais, predictions7a)


        # # Define the folder path
        # folder_path = f"../00-MODELS_UF_MENSAL/"

        # # Create the directory if it doesn't exist
        # os.makedirs(folder_path, exist_ok=True)

        # # p1 = ', '.join(map(str, predictions1a))
        # p2 = ', '.join(map(str, predictions2a))
        # p3 = ', '.join(map(str, predictions3a))
        # p4 = ', '.join(map(str, predictions4a))
        # p5 = ', '.join(map(str, predictions5a))
        # # p6 = ', '.join(map(str, predictions6a))
        # p7 = ', '.join(map(str, predictions7a))

        
        # with open(os.path.join(folder_path, f'{product}_{estado}_KNeighborsRegressor_{features}_{window}_model.pkl'), 'wb') as fd2:
        #     pickle.dump(regr2, fd2)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_XGBRegressor_{features}_{window}_model.pkl'), 'wb') as fd3:
        #     pickle.dump(regr3, fd3)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_SVR_{features}_{window}_model.pkl'), 'wb') as fd4:
        #     pickle.dump(regr4, fd4)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_RandomForestRegressor_{features}_{window}_model.pkl'), 'wb') as fd5:
        #     pickle.dump(regr5, fd5)
        
        # with open(os.path.join(folder_path, f'{product}_{estado}_LGBMRegressor_{features}_{window}_model.pkl'), 'wb') as fd7:
        #     pickle.dump(regr7, fd7)
        
        #     rows_data = [
        #         # [product,estado,'LR',fd1,mape_result1,pocid_result1,pbe_result1,p1],
        #         [product,estado,'kNN',fd2,mape_result2,pocid_result2,pbe_result2,p2],
        #         [product,estado,'XGB',fd3,mape_result3,pocid_result3,pbe_result3,p3],
        #         [product,estado,'SVR',fd4,mape_result4,pocid_result4,pbe_result4,p4],
        #         [product,estado,'RF',fd5,mape_result5,pocid_result5,pbe_result5,p5],
        #         # [product,estado,'MLP',fd6,mape_result6,pocid_result6,pbe_result6,p6],
        #         [product,estado,'LGB',fd7,mape_result7,pocid_result7,pbe_result7,p7]             
        #     ]
        
        
        # # CSV Output VALORES REAIS
        # with open(f'Metrics_{features}_{window}_output.csv', 'a', newline='') as file:
        #     writer = csv.writer(file)
        #     for row_data in rows_data:
        #         writer.writerow(row_data)

Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.53it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 28.60it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 29.99it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 28.47it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 28.88it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 27.77it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.70it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.49it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.70it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 29.44it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 29.50it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.47it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.56it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.36it/s]
Feature Extraction: 100%|██████████| 1/1 [00:00<00:00, 30.71it/s]
Feature Ex