In [49]:
import numpy as np
import pandas as pd

# Show floats with four decimal places in every pandas display below.
pd.set_option("display.float_format", "{:.4f}".format)

# Model folder and sub-folder whose result files are inspected.
model = "FT_catboost"
part = "/normal"

# One result CSV per benchmark dataset, all under the same model folder.
dataset_names = (
    "ANP_MONTHLY",
    "AUSTRALIAN_ELECTRICITY_DEMAND_DATASET",
    "M4_HOURLY_DATASET",
    "M4_WEEKLY_DATASET",
    "NN5_DAILY_DATASET_WITHOUT_MISSING_VALUES",
    "NN5_WEEKLY_DATASET",
    "PEDESTRIAN_COUNTS_DATASET",
    "US_BIRTHS_DATASET",
)
csv_files = [
    f"./timeseries/mestrado/resultados/{model}{part}/{name}.csv"
    for name in dataset_names
]

# Report how many rows each file holds; an unreadable file is reported and
# skipped so the remaining files are still checked.
for csv_file in csv_files:
    try:
        df = pd.read_csv(csv_file, sep=";")
        file_name = csv_file.rsplit("/", 1)[-1]
        print(f"{file_name}: {len(df)} entries")
    except Exception as e:
        print(f"Erro ao ler {csv_file}: {e}")

ANP_MONTHLY.csv: 5642 entries
AUSTRALIAN_ELECTRICITY_DEMAND_DATASET.csv: 20 entries
M4_HOURLY_DATASET.csv: 1656 entries
M4_WEEKLY_DATASET.csv: 1436 entries
NN5_DAILY_DATASET_WITHOUT_MISSING_VALUES.csv: 444 entries
NN5_WEEKLY_DATASET.csv: 444 entries
PEDESTRIAN_COUNTS_DATASET.csv: 264 entries
US_BIRTHS_DATASET.csv: 4 entries


In [10]:
import numpy as np
import pandas as pd

# Show floats with four decimal places in every pandas display below.
pd.set_option("display.float_format", "{:.4f}".format)

# Load the results of the BEST_CATEGORY_EACH_SERIE experiment.
df_agent = pd.read_csv(
    "./timeseries/mestrado/resultados/BEST_CATEGORY_EACH_SERIE/ANP_MONTHLY.csv", sep=";"
)
# Keep only the rows whose final_test is 2024-11-30.
df_agent = df_agent.loc[df_agent["final_test"] == "2024-11-30"]

print(len(df_agent))

# Average every metric column over the remaining rows. Cells that cannot be
# parsed as numbers are coerced to NaN first.
metric_columns = ["mape", "pocid", "smape", "rmse", "msmape", "mae"]
metric_means = df_agent[metric_columns].apply(pd.to_numeric, errors="coerce").mean()
print(metric_means)

182
mape     1826463356111233.7500
pocid                  66.5834
smape                   0.1981
rmse                 5444.5329
msmape                  0.1980
mae                  4645.5972
dtype: float64


In [22]:
# Series to inspect, identified by its dataset_index value.
ind = 0
# Rows of the currently loaded df_agent that belong to that series.
df_viewer = df_agent.loc[df_agent["dataset_index"] == ind]
# Display the description stored in the first matching row.
df_viewer["description"].iloc[0]

'Models combined: statistical: ARIMA, catboost: DWT_catboost, rf: FT_rf, svr: FT_svr, naive: NaiveMovingAverage'

In [21]:
# Load the results of the simple_selective2_agent_qwen3=14b experiment.
df_a = pd.read_csv(
    "./timeseries/mestrado/resultados/simple_selective2_agent_qwen3=14b/ANP_MONTHLY.csv", sep=";"
)
# Keep the rows for final_test 2024-11-30 that belong to the series `ind`
# (`ind` is defined in another cell).
df_a = df_a[(df_a["final_test"] == "2024-11-30") & (df_a["dataset_index"] == ind)]
# Display the description stored in the first matching row.
df_a["description"].iloc[0]

'Combined predictions from the top-performing models selected by RMSE: ARIMA (Statistical), NaiveMovingAverage (Naive), ONLY_FT_svr (SVR), ONLY_CWT_rf (RF), and DWT_catboost (Catboost). These models were chosen for their lowest RMSE within their respective categories and representations.'

In [6]:
# Load the results of the BEST_MEAN_EACH experiment.
df_agent = pd.read_csv(
    "./timeseries/mestrado/resultados/BEST_MEAN_EACH/ANP_MONTHLY.csv", sep=";"
)
# Keep only the rows whose final_test is 2024-11-30.
df_agent = df_agent.loc[df_agent["final_test"] == "2024-11-30"]

print(len(df_agent))

# Average every metric column over the remaining rows. Cells that cannot be
# parsed as numbers are coerced to NaN first.
metric_columns = ["mape", "pocid", "smape", "rmse", "msmape", "mae"]
metric_means = df_agent[metric_columns].apply(pd.to_numeric, errors="coerce").mean()
print(metric_means)

182
mape     4922596738537183.0000
pocid                  70.3297
smape                   0.1725
rmse                 5086.3435
msmape                  0.1724
mae                  4209.5350
dtype: float64


In [None]:
# Load the results of the simple_selective_agent_qwen3=14b experiment.
df_agent = pd.read_csv(
    "./timeseries/mestrado/resultados/simple_selective_agent_qwen3=14b/ANP_MONTHLY.csv", sep=";"
)
# Keep only the rows whose final_test is 2024-11-30.
df_agent = df_agent.loc[df_agent["final_test"] == "2024-11-30"]

print(len(df_agent))

# Average every metric column over the remaining rows. Cells that cannot be
# parsed as numbers are coerced to NaN first.
metric_columns = ["mape", "pocid", "smape", "rmse", "msmape", "mae"]
metric_means = df_agent[metric_columns].apply(pd.to_numeric, errors="coerce").mean()
print(metric_means)

180
mape     2010481933651979.0000
pocid                  62.7273
smape                   0.2310
rmse                 6471.4706
msmape                  0.2308
mae                  5652.2558
dtype: float64


In [25]:
import re
import pandas as pd
import numpy as np


def extract_values(list_str):
    """
    Parse every numeric literal found in a string into a list of floats.

    Accepts signed integers and decimals, with or without an exponent,
    including mantissas that end in a bare dot (``"3."``, ``"1.e+03"``) or
    start with one (``".5"``). The previous pattern required a digit after
    the dot, so ``"1.e+03"`` was split into ``1.0`` and ``3.0``.

    Args:
        list_str: Text containing numbers, e.g. the string form of a list.

    Returns:
        The numbers in order of appearance as floats. An empty list is
        returned when ``list_str`` is not a string or holds no number.
        Tokens such as ``nan`` or ``inf`` are not recognised and are skipped.
    """
    if not isinstance(list_str, str):
        return []
    # mantissa: "12", "12.", "12.5" or ".5"; optional exponent: "e-3", "E+03"
    number_pattern = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
    return [float(token) for token in re.findall(number_pattern, list_str)]


def get_best_model_by_rmse(
    models, dataset_index, final_test, base_path, metric="rmse", dataset="ANP_MONTHLY"
):
    """
    Pick, among ``models``, the one with the lowest error metric for one series.

    For each model the file ``{base_path}/{model}/normal/{dataset}.csv``
    (``;``-separated) is read and filtered to the rows whose ``final_test``
    and ``dataset_index`` match; the metric of the first matching row is
    compared. A model whose file cannot be read or processed is reported on
    stdout and skipped.

    Args:
        models: Model names (folder names under ``base_path``).
        dataset_index: Value matched against the ``dataset_index`` column.
        final_test: Value matched against the ``final_test`` column.
        base_path: Directory that holds one folder per model.
        metric: Column used for ranking, lower is better. Defaults to ``"rmse"``.
        dataset: Base name of the CSV file. Defaults to ``"ANP_MONTHLY"``.

    Returns:
        Tuple ``(best_score, best_model, best_predictions, test)``. When no
        model provides a usable row the result is ``(inf, "", [], [])``.
    """
    best_rmse = float("inf")
    best_model = ""
    best_predictions = []
    test = []
    for model in models:
        try:
            df = pd.read_csv(f"{base_path}/{model}/normal/{dataset}.csv", sep=";")
            df = df[
                (df["final_test"] == final_test)
                & (df["dataset_index"] == dataset_index)
            ]
            if df.empty:
                continue

            row = df.iloc[0]
            # Non-numeric cells become NaN; NaN < x is False, so such rows
            # never win instead of raising on the comparison.
            score = pd.to_numeric(row[metric], errors="coerce")
            if score < best_rmse:
                # Parse both series before touching the current best so a
                # failure here cannot leave score/model and predictions
                # belonging to different models.
                candidate_predictions = extract_values(row["predictions"])
                candidate_test = extract_values(row["test"])
                best_rmse = score
                best_model = model
                best_predictions = candidate_predictions
                test = candidate_test
        except Exception as e:
            # Deliberate best-effort: report the model and keep evaluating the rest.
            print(f"Erro ao processar modelo {model}: {e}")

    return best_rmse, best_model, best_predictions, test


# Settings: root folder of the per-model results and the final_test value to select.
base_path = "/home/lucas/Documents/mestrado/Statistics_and_Seq2Seq/timeseries/mestrado/resultados"
final_test = "2024-11-30"

# Model families. Each machine-learning regressor appears plain and with every
# feature-variant prefix; the statistical and naive families are listed as-is.
_variant_prefixes = ("", "CWT_", "DWT_", "FT_", "ONLY_CWT_", "ONLY_DWT_", "ONLY_FT_")
model_groups = {"statistical": ["ARIMA", "ETS", "THETA"]}
model_groups.update(
    {
        regressor: [f"{prefix}{regressor}" for prefix in _variant_prefixes]
        for regressor in ("catboost", "rf", "svr")
    }
)
model_groups["naive"] = ["NaiveSeasonal", "NaiveMovingAverage"]

# Column order of the per-series results CSV.
cols_serie = (
    "dataset_index horizon regressor "
    "mape pocid smape rmse msmape mae "
    "test predictions start_test final_test description"
).split()

# Accumulator for the combined results of all datasets.
combined_results = []

import os
from all_functions import *
from sklearn.metrics import mean_absolute_percentage_error as mape

# Combine, for each series, the best model of every group (ranked by RMSE) by
# averaging their predictions, then score the averaged forecast.
# Nothing is written to disk in this cell; the paths are only defined.
dataset = "ANP_MONTHLY"
exp_name = "SMAPE_BEST_CATEGORY_EACH_SERIE"
path_experiments = f"./timeseries/mestrado/resultados/{exp_name}/"
path_csv = f"{path_experiments}/{dataset}.csv"
for i in range(0, 1):
    best_results = {}
    all_predictions = []
    # Actual values of the series. The last group that provides them wins
    # (as before), but an empty result from a later group no longer erases
    # the values obtained from an earlier one.
    test = []

    # Best model of each group.
    for group_name, models in model_groups.items():
        rrmse, model, preds, group_test = get_best_model_by_rmse(
            models, i, final_test, base_path
        )
        best_results[group_name] = {
            "rmse": rrmse,
            "model": model,
            "predictions": preds,
        }

        # Only groups that produced predictions take part in the average.
        if preds:
            all_predictions.append(preds)
        if group_test:
            test = group_test

    # Element-wise mean of the selected predictions.
    if all_predictions:
        predictions_array = np.array(all_predictions)
        combined_prediction = np.mean(predictions_array, axis=0).tolist()
    else:
        combined_prediction = []

    # Group whose best model has the lowest RMSE overall.
    overall_best_group = min(best_results.items(), key=lambda x: x[1]["rmse"])

    print(f"\nDataset {i}:")
    for group_name, result in best_results.items():
        if result["model"]:
            print(
                f"  Melhor {group_name}: {result['model']} (RMSE: {result['rmse']:.4f})"
            )

    # Metrics need predictions and actual values of the same length; skip the
    # series instead of failing inside the metric functions.
    if not combined_prediction or len(combined_prediction) != len(test):
        print(
            f"  Métricas não calculadas para o dataset {i}: "
            "predições ou valores de teste ausentes/incompatíveis."
        )
        continue

    preds_real = combined_prediction

    test = np.array(test)
    preds_real_array = np.array(preds_real)
    # The calculate_* helpers are called with arrays reshaped to a single row.
    preds_real_reshaped = preds_real_array.reshape(1, -1)
    test_reshaped = test.reshape(1, -1)
    smape_result = calculate_smape(preds_real_reshaped, test_reshaped)
    rmse_result = calculate_rmse(preds_real_reshaped, test_reshaped)
    msmape_result = calculate_msmape(preds_real_reshaped, test_reshaped)
    # mase_result = calculate_mase(preds_real_reshaped, test_reshaped, training_set, seasonality)
    mae_result = calculate_mae(preds_real_reshaped, test_reshaped)
    mape_result = mape(test, preds_real_array)
    pocid_result = pocid(test, preds_real_array)



Dataset 0:
  Melhor statistical: ARIMA (RMSE: 1746.3691)
  Melhor catboost: DWT_catboost (RMSE: 1488.4275)
  Melhor rf: FT_rf (RMSE: 1324.7528)
  Melhor svr: FT_svr (RMSE: 2551.7920)
  Melhor naive: NaiveMovingAverage (RMSE: 2632.6862)


In [None]:
import re
import pandas as pd
import numpy as np

def extract_values(list_str):
    """
    Parse every numeric literal found in a string into a list of floats.

    Accepts signed integers and decimals, with or without an exponent,
    including mantissas that end in a bare dot (``"3."``, ``"1.e+03"``) or
    start with one (``".5"``). The previous pattern required a digit after
    the dot, so ``"1.e+03"`` was split into ``1.0`` and ``3.0``.

    Args:
        list_str: Text containing numbers, e.g. the string form of a list.

    Returns:
        The numbers in order of appearance as floats. An empty list is
        returned when ``list_str`` is not a string or holds no number.
        Tokens such as ``nan`` or ``inf`` are not recognised and are skipped.
    """
    if not isinstance(list_str, str):
        return []
    # mantissa: "12", "12.", "12.5" or ".5"; optional exponent: "e-3", "E+03"
    number_pattern = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
    return [float(token) for token in re.findall(number_pattern, list_str)]

def get_best_model_by_rmse(
    models, dataset_index, final_test, base_path, metric="smape", dataset="ANP_MONTHLY"
):
    """
    Pick, among ``models``, the one with the lowest error metric for one series.

    Despite its name, this definition ranks by SMAPE by default (the name is
    kept so existing calls keep working). For each model the file
    ``{base_path}/{model}/normal/{dataset}.csv`` (``;``-separated) is read and
    filtered to the rows whose ``final_test`` and ``dataset_index`` match; the
    metric of the first matching row is compared. A model whose file cannot be
    read or processed is reported on stdout and skipped.

    Args:
        models: Model names (folder names under ``base_path``).
        dataset_index: Value matched against the ``dataset_index`` column.
        final_test: Value matched against the ``final_test`` column.
        base_path: Directory that holds one folder per model.
        metric: Column used for ranking, lower is better. Defaults to ``"smape"``.
        dataset: Base name of the CSV file. Defaults to ``"ANP_MONTHLY"``.

    Returns:
        Tuple ``(best_score, best_model, best_predictions, test)``. When no
        model provides a usable row the result is ``(inf, "", [], [])``.
    """
    best_smape = float('inf')
    best_model = ""
    best_predictions = []
    test = []
    for model in models:
        try:
            df = pd.read_csv(f"{base_path}/{model}/normal/{dataset}.csv", sep=";")
            df = df[(df["final_test"] == final_test) & (df['dataset_index'] == dataset_index)]
            if df.empty:
                continue

            row = df.iloc[0]
            # Non-numeric cells become NaN; NaN < x is False, so such rows
            # never win instead of raising on the comparison.
            score = pd.to_numeric(row[metric], errors="coerce")
            if score < best_smape:
                # Parse both series before touching the current best so a
                # failure here cannot leave score/model and predictions
                # belonging to different models.
                candidate_predictions = extract_values(row['predictions'])
                candidate_test = extract_values(row['test'])
                best_smape = score
                best_model = model
                best_predictions = candidate_predictions
                test = candidate_test
        except Exception as e:
            # Deliberate best-effort: report the model and keep evaluating the rest.
            print(f"Erro ao processar modelo {model}: {e}")

    return best_smape, best_model, best_predictions, test

# Settings: root folder of the per-model results and the final_test value to select.
base_path = "/home/lucas/Documents/mestrado/Statistics_and_Seq2Seq/timeseries/mestrado/resultados"
final_test = "2024-11-30"

# Model families. Each machine-learning regressor is listed plain, followed by
# one entry per feature-variant tag; statistical and naive models are listed as-is.
model_groups = {
    "statistical": ["ARIMA", "ETS", "THETA"],
    **{
        family: [family]
        + [
            f"{tag}_{family}"
            for tag in ("CWT", "DWT", "FT", "ONLY_CWT", "ONLY_DWT", "ONLY_FT")
        ]
        for family in ("catboost", "rf", "svr")
    },
    "naive": ["NaiveSeasonal", "NaiveMovingAverage"],
}

# Column order of the per-series results CSV.
cols_serie = list(
    (
        "dataset_index", "horizon", "regressor",
        "mape", "pocid", "smape", "rmse", "msmape", "mae",
        "test", "predictions",
        "start_test", "final_test", "description",
    )
)

# Accumulator for the combined results of all datasets.
combined_results = []

import os
from all_functions import *
from sklearn.metrics import mean_absolute_percentage_error as mape

# Combine, for each series, the best model of every group (ranked by SMAPE) by
# averaging their predictions, score the averaged forecast and build one
# result row per series.
dataset = "ANP_MONTHLY"
exp_name = "SMAPE_BEST_CATEGORY_EACH_SERIE"
path_experiments = (
        f"./timeseries/mestrado/resultados/{exp_name}/"
    )
path_csv = f"{path_experiments}/{dataset}.csv"
os.makedirs(path_experiments, exist_ok=True)

# Writing is off by default (the append used to be commented out). Set to True
# to append one row per series to path_csv; re-running then appends the rows again.
save_results = False

for i in range(0, 182):
    best_results = {}
    all_predictions = []
    # Actual values of the series. The last group that provides them wins
    # (as before), but an empty result from a later group no longer erases
    # the values obtained from an earlier one.
    test = []

    # Best model of each group.
    for group_name, models in model_groups.items():
        smape, model, preds, group_test = get_best_model_by_rmse(models, i, final_test, base_path)
        best_results[group_name] = {
            'smape': smape,
            'model': model,
            'predictions': preds
        }

        # Only groups that produced predictions take part in the average.
        if preds:
            all_predictions.append(preds)
        if group_test:
            test = group_test

    # Element-wise mean of the selected predictions.
    if all_predictions:
        predictions_array = np.array(all_predictions)
        combined_prediction = np.mean(predictions_array, axis=0).tolist()
    else:
        combined_prediction = []

    # Group whose best model has the lowest SMAPE overall.
    overall_best_group = min(best_results.items(), key=lambda x: x[1]['smape'])

    print(f"\nDataset {i}:")
    print(combined_prediction)
    # Best model of each group.
    for group_name, result in best_results.items():
        if result['model']:
            print(f"  Melhor {group_name}: {result['model']} (SMAPE: {result['smape']:.4f})")

    # Metrics need predictions and actual values of the same length; skip the
    # series instead of failing inside the metric functions and aborting the loop.
    if not combined_prediction or len(combined_prediction) != len(test):
        print(
            f"  Métricas não calculadas para o dataset {i}: "
            "predições ou valores de teste ausentes/incompatíveis."
        )
        continue

    preds_real = combined_prediction

    test = np.array(test)
    preds_real_array = np.array(preds_real)
    # The calculate_* helpers are called with arrays reshaped to a single row.
    preds_real_reshaped = preds_real_array.reshape(1, -1)
    test_reshaped = test.reshape(1, -1)
    smape_result = calculate_smape(preds_real_reshaped, test_reshaped)
    rmse_result = calculate_rmse(preds_real_reshaped, test_reshaped)
    msmape_result = calculate_msmape(preds_real_reshaped, test_reshaped)
    # mase_result = calculate_mase(preds_real_reshaped, test_reshaped, training_set, seasonality)
    mae_result = calculate_mae(preds_real_reshaped, test_reshaped)
    mape_result = mape(test, preds_real_array)
    pocid_result = pocid(test, preds_real_array)

    description = "Models combined: " + ", ".join(
        [
            f"{group}: {best_results[group]['model']}"
            for group in best_results
            if best_results[group]['model']
        ]
    )
    # "test" and "predictions" are wrapped in a one-element list so the whole
    # series is stored in a single cell of a one-row DataFrame.
    data_serie = {
        "dataset_index": f"{i}",
        "horizon": "12",
        "regressor": "BEST_CATEGORY_EACH_SERIE",
        "mape": mape_result,
        "pocid": pocid_result,
        "smape": smape_result,
        "rmse": rmse_result,
        "msmape": msmape_result,
        "mae": mae_result,
        "test": [test.tolist()],
        "predictions": [preds_real],
        "start_test": "INICIO",
        "final_test": final_test,
        "description": description,
    }

    # A header-only file is created on first use, as before.
    if not os.path.exists(path_csv):
        pd.DataFrame(columns=cols_serie).to_csv(path_csv, sep=";", index=False)

    df_new = pd.DataFrame(data_serie)
    if save_results:
        # The "saving" message is printed only when a row is really appended.
        print("Salvando resultados...\n")
        df_new.to_csv(path_csv, sep=";", mode="a", header=False, index=False)


Dataset 0:
[4043.123230259999, 4105.3386932739995, 4155.973103779999, 4238.927066902, 4377.449479594, 4485.4910051159995, 4574.867199220001, 4653.610762594, 4628.684474088, 4610.036991566, 4615.905459962, 4698.41472363]
  Melhor statistical: ARIMA (SMAPE: 0.2793)
  Melhor catboost: DWT_catboost (SMAPE: 0.1894)
  Melhor rf: CWT_rf (SMAPE: 0.1737)
  Melhor svr: FT_svr (SMAPE: 0.4714)
  Melhor naive: NaiveMovingAverage (SMAPE: 0.4962)
Salvando resultados...


Dataset 1:
[16633.580937906, 17485.562965226, 16191.931258802, 16282.738433745999, 15973.894368984, 16251.145912725999, 16673.514985230002, 17518.744037676, 16939.988781304, 16692.076594772, 16527.489446789998, 16428.867254329998]
  Melhor statistical: ARIMA (SMAPE: 0.1098)
  Melhor catboost: catboost (SMAPE: 0.0702)
  Melhor rf: CWT_rf (SMAPE: 0.0688)
  Melhor svr: ONLY_DWT_svr (SMAPE: 0.0700)
  Melhor naive: NaiveMovingAverage (SMAPE: 0.1227)
Salvando resultados...


Dataset 1:
[16633.580937906, 17485.562965226, 16191.931258802, 1