In [1]:
####### LOADING ##########

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import csv
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features
from tsfresh import select_features


# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import lightgbm as lgb

import warnings
from warnings import simplefilter
warnings.filterwarnings("ignore", category=RuntimeWarning)
simplefilter(action='ignore', category=FutureWarning)

# Evaluation metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error as mape

%matplotlib inline

# Small constant added to the denominator in pbe() when the true values sum to zero.
epslon = 0.00005


def pbe(y_true, y_pred):
  """Percentage bias error of ``y_pred`` relative to ``y_true``.

  Computes ``100 * sum(y_pred - y_true) / sum(y_true)``. When the true values
  sum to exactly zero, the module-level ``epslon`` is added to the
  denominator so the division does not hit zero.
  """
  total_true = np.sum(y_true)
  # Only fall back to the epsilon-shifted denominator for an exactly-zero sum.
  denominator = total_true if total_true != 0 else total_true + epslon
  return 100*(np.sum(y_pred - y_true)/denominator)

def pocid(y_true, y_pred):
    """Prediction Of Change In Direction (POCID), as a percentage.

    Counts the steps where the predicted series and the true series move in
    the same direction (the product of consecutive differences is strictly
    positive) and divides that count by ``n = len(y_true)``.

    Inputs are converted to plain NumPy arrays first so that the comparison is
    always positional. Indexing a pandas Series with ``series[i]`` is
    label-based, which fails (or silently picks the wrong rows) for slices
    whose index does not start at 0.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Only the first ``len(y_true)`` values of ``y_pred`` are used.

    Returns
    -------
    float
        Percentage in [0, 100]; ``nan`` (with a NumPy warning) for empty input.
    """
    true_vals = np.asarray(y_true, dtype=float).ravel()
    n = len(true_vals)
    # Mirror the original loop bound: ignore any extra trailing predictions.
    pred_vals = np.asarray(y_pred, dtype=float).ravel()[:n]

    same_direction = (np.diff(pred_vals) * np.diff(true_vals)) > 0
    return 100 * np.sum(same_direction) / n

def znorm(x):
    """Z-normalise ``x``: subtract its mean and divide by its standard deviation.

    No guard is applied for a zero standard deviation; in that case the
    division by zero is left to NumPy.
    """
    centered = x - np.mean(x)
    return centered / np.std(x)


def get_stats_norm(series, horizon, window):
  """Mean and standard deviation of the last ``window`` values before the horizon.

  The slice taken is ``series[-(horizon + window):-horizon]``; ``series`` must
  expose ``.values`` on that slice. Returns ``(mean, std)``.

  NOTE(review): an identical function with the same name is defined again
  further down in this cell; that later definition is the one in effect.
  """
  tail = series[slice(-(horizon + window), -horizon)].values
  return tail.mean(), tail.std()

# For sales prediction per state (monthly), horizon = 12 is used
# For sales prediction per municipality (yearly), horizon = 1 is used
def train_test_split(data, horizon):
  """Split a windowed frame into train/test parts, holding out the last rows.

  Every column except the last is a feature; the last column is the target.
  The final ``horizon`` rows form the test set and all earlier rows form the
  training set.

  Returns ``(X_train, X_test, y_train, y_test)``.
  """
  features, target = data.iloc[:, :-1], data.iloc[:, -1]

  train_part = slice(None, -horizon)
  test_part = slice(-horizon, None)

  return (
    features[train_part],
    features[test_part],
    target[train_part],
    target[test_part],
  )

def recursive_multistep_forecasting(X_test, model, horizon):
  """Forecast ``horizon`` steps by feeding each prediction back as an input.

  The starting feature vector is the first row of ``X_test`` (i.e. the last
  observations seen). After every prediction the oldest value is dropped
  from the front of the vector and the new prediction is appended at the end.

  Returns the list of ``horizon`` predictions, in order.
  """
  window_row = X_test.iloc[0].values.reshape(1, -1)

  forecasts = []
  for _ in range(horizon):
    next_value = model.predict(window_row)[0]
    forecasts.append(next_value)
    # Slide the window: drop the first feature, append the new prediction.
    window_row = np.append(window_row[:, 1:], next_value).reshape(1, -1)
  return forecasts



def rolling_window_real(series, window):
  """Build a frame of overlapping sub-sequences of length ``window + 1``.

  Row ``i`` holds ``series[i:i + window + 1]`` with the values left on their
  original scale (no normalisation). There are ``len(series) - window`` rows.
  """
  n_rows = len(series) - window
  rows = [np.array(series[start:start + window + 1]) for start in range(n_rows)]
  return pd.DataFrame(rows)


def rolling_window(series, window):
  """Build a frame of z-normalised overlapping sub-sequences.

  Row ``i`` holds ``znorm(series[i:i + window + 1])``, so every row is
  normalised on its own. There are ``len(series) - window`` rows.
  """
  n_rows = len(series) - window
  rows = [
    znorm(np.array(series[start:start + window + 1]))
    for start in range(n_rows)
  ]
  return pd.DataFrame(rows)


# De-normalisation helper
def znorm_reverse(x, mean_x, std_x):
  """Undo a z-normalisation: return ``x * std_x + mean_x`` as a NumPy array."""
  scaled = np.array(x) * std_x
  return scaled + mean_x

def get_stats_norm(series, horizon, window):
  """Return ``(mean, std)`` of the ``window`` values that precede the last ``horizon``.

  The values come from ``series[-(horizon + window):-horizon].values``.

  NOTE(review): an identical function with the same name is defined earlier
  in this cell; this later definition shadows it.
  """
  start = -(horizon + window)
  stop = -horizon
  last_values = series[start:stop].values
  stats = (np.mean(last_values), np.std(last_values))
  return stats




def slope(x):
    """Relative change from the first to the last element: ``(x[-1] - x[0]) / x[0]``.

    Returns 0 when the first element is falsy (e.g. zero).
    """
    first = x[0]
    if not first:
        return 0
    return (x[-1] - first) / first
def abs_diff_mean(x):
    """Mean absolute difference between consecutive elements; 0 if fewer than two."""
    if len(x) <= 1:
        return 0
    steps = x[1:] - x[:-1]
    return np.mean(np.abs(steps))
def diff_std(x):
    """Standard deviation of the consecutive differences; 0 if fewer than two elements."""
    if len(x) <= 1:
        return 0
    steps = x[1:] - x[:-1]
    return np.std(steps)



def targeted_forecasting(X_test, model):
  """Predict a single step from the first row of ``X_test``.

  The first row of ``X_test`` (the last observations seen) is reshaped into a
  one-row 2-D array and passed to ``model.predict``.

  The previous implementation wrapped this in a one-iteration loop and then
  rebuilt the feature vector after the only prediction; that vector was never
  used again, so both were removed.

  Returns
  -------
  list
      A one-element list holding the prediction.
  """
  example = X_test.iloc[0].values.reshape(1, -1)
  return [model.predict(example)[0]]


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
######### Lista Estados ##############

import os
import csv

def extract_estado(file_name):
    """Return the second underscore-separated token of ``file_name``.

    For a name shaped like ``mensal_<estado>_<product>.csv`` this is the
    ``<estado>`` part.
    """
    return file_name.split('_')[1]

def read_csv_files(folder_path):
    """List the state names encoded in the CSV file names of ``folder_path``.

    Every entry of the folder whose name ends in ``.csv`` contributes one
    name, obtained with ``extract_estado``. The result is sorted once at the
    end instead of after every file.

    The files themselves are no longer opened: the previous version read each
    header row and discarded it, which cost one file open per CSV and raised
    ``StopIteration`` on an empty file without affecting the result.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        Sorted state names, one per CSV file (duplicates are kept).
    """
    return sorted(
        extract_estado(file_name)
        for file_name in os.listdir(folder_path)
        if file_name.endswith('.csv')
    )

# Specify the folder path where CSV files are located


In [3]:
######### TsFlex AUTO #################

import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import scipy.stats as ss
import numpy as np
import tsflex
from tsflex.features import FeatureDescriptor, FeatureCollection, FuncWrapper


# Each sub-folder of ./uf/ is treated as one product.
products = sorted([name for name in os.listdir('./uf/') if os.path.isdir(os.path.join('./uf/', name))])

horizon = 12  # not used in this cell
window = 12   # only used to tag the output file name below

# Rolling window and stride handed to tsflex for every feature.
# For reference: 12 months = 365.25 days, 36 months = 1095.75 days.
feature_window = "365.25D"
feature_stride = "31D"

# (function, output name) pairs, in the order they are registered.
# Output names end up in the CSV column names, so they are kept verbatim.
feature_specs = [
    (ss.skew, "skew"),
    (np.max, "max"),
    (np.median, "mediam"),
    (np.min, "min"),
    # (np.var, "var"),          # defined previously but left out of the collection
    (np.std, "std"),
    # (ss.entropy, "entropy"),  # defined previously but left out of the collection
    (ss.kurtosis, "kurtosis"),
    (np.mean, "mean"),
    (ss.median_abs_deviation, "median_abs_deviation"),
    (sum, "sum"),
    (slope, "slope"),
    (abs_diff_mean, "abs_diff_mean"),
    (diff_std, "diff_std"),
]

# The descriptors do not depend on the file being processed, so the collection
# is built once here and reused for every (product, estado) pair.
fc = FeatureCollection(feature_descriptors=[
    FeatureDescriptor(
        function=FuncWrapper(func=feature_func, output_names=feature_name),
        series_name="TMP",
        window=feature_window, stride=feature_stride
    )
    for feature_func, feature_name in feature_specs
])

# Output folder for the per-series feature files.
folder_name = 'TsFlex'
os.makedirs(folder_name, exist_ok=True)

for product in products:
    folder_path = f'./uf/{product}/'
    # State names are parsed from the CSV file names in the product folder.
    estados = read_csv_files(folder_path)

    for estado in estados:

        df = pd.read_csv(f"./uf/{product}/mensal_{estado}_{product}.csv", header=0, sep=";")

        # Alias of the raw frame; not used in this cell.
        # NOTE(review): kept in case a later cell relies on it — confirm.
        series = df

        # Timestamps are parsed with the year-month format '%Y%m'.
        try:
            df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m')
        except Exception as e:
            print("Error parsing 'timestamp' column:", e)
            raise

        if 'm3' not in df.columns:
            raise ValueError("'m3' column not found in the CSV file. Columns available: {}".format(df.columns))

        # tsflex looks the series up by name ("TMP") on a time-indexed frame.
        df = df.set_index('timestamp').rename(columns={'m3': 'TMP'})

        result_df = fc.calculate(data=df, return_df=True).reset_index()

        # NOTE(review): the prefix says "TsFresh" although the features come
        # from tsflex; left unchanged because the file names may be consumed
        # elsewhere.
        result_df.to_csv(f'{folder_name}/FEAT_TsFresh_{product}_{window}_{estado}.csv', index=False)

KeyboardInterrupt: 

In [20]:
# NOTE(review): the saved output below is a DataFrame preview, which does not
# match this expression (a folder-name string); the cell was presumably edited
# after it last ran. Re-run to refresh the output.
folder_name

Unnamed: 0,timestamp,TMP__abs_diff_mean__w=365D_6h,TMP__diff_std__w=365D_6h,TMP__kurtosis__w=365D_6h,TMP__max__w=365D_6h,TMP__mean__w=365D_6h,TMP__mediam__w=365D_6h,TMP__median_abs_deviation__w=365D_6h,TMP__min__w=365D_6h,TMP__skew__w=365D_6h,TMP__slope__w=365D_6h,TMP__std__w=365D_6h,TMP__sum__w=365D_6h
0,1991-01-01 06:00:00,9399.940750,11808.136943,-0.790762,53118.128,40081.533077,38482.031,3383.7560,31312.518,0.764399,-0.225947,7122.536806,521059.930
1,1991-02-01 06:00:00,8602.424750,10754.982993,-0.034181,51628.862,38334.150769,36979.919,3638.4440,30402.158,0.963640,-0.074061,6466.130146,498343.960
2,1991-03-04 06:00:00,7804.250818,9939.924005,-0.352924,51628.862,38490.288250,37730.975,3673.1195,30402.158,0.808255,-0.386346,6767.683203,461883.459
3,1991-04-04 06:00:00,6397.041000,8633.212301,1.089433,51628.862,37008.814333,36148.407,3910.3670,30402.158,1.165370,-0.007759,5659.676971,444105.772
4,1991-05-05 06:00:00,6629.772182,8713.032027,0.558855,51628.862,36506.148250,36148.407,4573.3690,27738.011,0.873813,-0.249917,6169.880771,438073.779
...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,2023-09-05 06:00:00,2358.806455,3278.199991,-1.080464,44037.299,36334.896333,34143.190,2213.5250,30775.000,0.475870,0.108604,3960.881452,436018.756
386,2023-10-06 06:00:00,3038.410636,3869.035825,0.554644,51613.955,37325.792583,34143.190,2213.5250,30775.000,1.170172,0.296045,5762.302129,447909.511
387,2023-11-06 06:00:00,3155.593727,4000.986809,-0.602654,51613.955,38100.894667,34143.190,2213.5250,30775.000,0.891627,0.197489,6609.523323,457210.736
388,2023-12-07 06:00:00,3201.145273,3657.462520,-0.819874,56015.838,39350.238167,34143.190,2213.5250,30775.000,0.847875,0.617348,8255.814025,472202.858
