# Train and test forecast models

#### Explicación de las Métricas:
- MAE (Error Absoluto Medio): mide el error promedio en las predicciones, sin tener en cuenta la dirección del error.
- RMSE (Raíz del Error Cuadrático Medio): pondera los errores grandes más que los pequeños, siendo más sensible a valores atípicos.
- MAPE (Error Absoluto Medio Porcentual): mide el error en porcentaje, lo que lo hace independiente de la escala de los datos.

### HoltWinters

In [1]:
import pandas as pd
import numpy as np
import glob
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import warnings
import logging

# Desactiva los warnings y ajusta el logging
warnings.filterwarnings("ignore")
logging.getLogger("NP").setLevel(logging.ERROR)

Cargamos los datos:

In [2]:
# Get a list of all CSV files in the directory
csv_files = glob.glob('predictions_dia_avg/predictions_sub_0_9.csv/*.csv')

# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    df_temp = pd.read_csv(file)
    dfs.append(df_temp)

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Optional: Reset the index of the final DataFrame
df.reset_index(drop=True, inplace=True)

df.head()

df = df.rename(columns={'day': 'ds', 'avg_all_users': 'y'})

# Aseguramos que 'ds' es de tipo datetime
df['ds'] = pd.to_datetime(df['ds'])

# Eliminamos la información de zona horaria
df['ds'] = df['ds'].dt.tz_localize(None)

# Verifica el tipo de datos de la columna 'ds'
print(df['ds'].dtype)

df.head()

datetime64[ns]


Unnamed: 0,ds,y
0,2019-09-30,0.030181
1,2019-10-01,0.034021
2,2019-10-02,0.036998
3,2019-10-03,0.038045
4,2019-10-04,0.040502


In [5]:
df['ds'] = pd.to_datetime(df['ds'])

# Configuración de la ventana deslizante
train_size = 365   # Tamaño de ventana de entrenamiento 
test_size = 30     # Tamaño de ventana de prueba 

In [None]:
# # Lista para almacenar los resultados de cada ventana
# results = []

# # Crear ventana deslizante
# for start in range(0, len(df) - train_size - test_size + 1):
#     # Separar datos de entrenamiento y prueba
#     train = df.iloc[start:start + train_size]
#     test = df.iloc[start + train_size:start + train_size + test_size]
    
#     # Entrenar el modelo Holt-Winters en los datos de entrenamiento
#     model = ExponentialSmoothing(
#         train["y"],
#         trend="add",
#         seasonal="add",
#         seasonal_periods=12  # Ajusta según la estacionalidad de tus datos
#     ).fit()
    
#     # Hacer predicciones sobre el conjunto de prueba
#     predictions = model.forecast(test_size)
    
#     # Preparar datos de prueba y predicciones para las métricas
#     forecast_test = pd.DataFrame({
#         "y_true": test["y"],
#         "yhat1": predictions
#     })
    
#     # Calcular MAE, RMSE y MAPE para esta ventana y almacenar los resultados
#     mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat1"])
#     rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat1"]))
#     mape = mean_absolute_percentage_error(forecast_test["y_true"], forecast_test["yhat1"]) * 100
    
#     # Almacena los resultados en la lista
#     results.append({
#         "start_date": train.index[0], 
#         "end_date": test.index[-1], 
#         "MAE": mae, 
#         "RMSE": rmse,
#         "MAPE": mape
#     })

In [None]:
# # Convertir resultados en un DataFrame
# results_df = pd.DataFrame(results)

# # Calcula el promedio de cada métrica
# mean_mae = results_df["MAE"].mean()
# mean_rmse = results_df["RMSE"].mean()
# mean_mape = results_df["MAPE"].mean()

# mean_metrics = pd.DataFrame({
#     "MAE": [mean_mae],
#     "RMSE": [mean_rmse],
#     "MAPE (%)": [mean_mape]
# })

# # print(results_df)
# print("\nMétricas promedio:")
# mean_metrics


Métricas promedio:


Unnamed: 0,MAE,RMSE,MAPE (%)
0,0.000367,0.000438,8.474062


### Moving Average
El modelo de Promedio Móvil o Moving Average (MA) es más sencillo, ya que solo calcula la media de un número fijo de observaciones anteriores para hacer una predicción.

In [None]:
# # Lista para almacenar los resultados de cada ventana
# results = []

# # Crear ventana deslizante
# for start in range(0, len(df) - train_size - test_size + 1):
#     # Separar datos de entrenamiento y prueba
#     train = df.iloc[start:start + train_size]
#     test = df.iloc[start + train_size:start + train_size + test_size]
    
#     # Calcular el promedio móvil en el conjunto de entrenamiento
#     # Usamos un promedio móvil simple en los datos de entrenamiento para hacer predicciones
#     window_size = 7  # Tamaño de la ventana del promedio móvil
#     moving_avg = train["y"].rolling(window=window_size).mean().iloc[-1]
    
#     # Usamos el último promedio calculado para hacer predicciones en el conjunto de prueba
#     predictions = [moving_avg] * test_size  # Repetimos el valor del promedio para cada punto de prueba
    
#     # Preparar datos de prueba y predicciones para las métricas
#     forecast_test = pd.DataFrame({
#         "y_true": test["y"],
#         "yhat1": predictions
#     })
    
#     # Calcular MAE, RMSE y MAPE para esta ventana y almacenar los resultados
#     mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat1"])
#     rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat1"]))
#     mape = mean_absolute_percentage_error(forecast_test["y_true"], forecast_test["yhat1"]) * 100
    
#     # Almacena los resultados en la lista
#     results.append({
#         "start_date": train.index[0], 
#         "end_date": test.index[-1], 
#         "MAE": mae, 
#         "RMSE": rmse,
#         "MAPE": mape
#     })

# # Convertir resultados en un DataFrame
# results_df = pd.DataFrame(results)

# # Calcula el promedio de cada métrica
# mean_mae = results_df["MAE"].mean()
# mean_rmse = results_df["RMSE"].mean()
# mean_mape = results_df["MAPE"].mean()

# mean_metrics = pd.DataFrame({
#     "MAE": [mean_mae],
#     "RMSE": [mean_rmse],
#     "MAPE (%)": [mean_mape]
# })

# print("\nMétricas promedio:")
# mean_metrics


Métricas promedio:


Unnamed: 0,MAE,RMSE,MAPE (%)
0,0.000304,0.000368,7.155808


### NeuralNetworkFF

In [None]:
# from sklearn.neural_network import MLPRegressor
# from sklearn.preprocessing import StandardScaler

# input_window_size = 7  # Número de datos pasados a utilizar como entrada para predecir el siguiente

In [None]:
# # Lista para almacenar los resultados de cada ventana
# results = []

# # Crear ventana deslizante
# for start in range(0, len(df) - train_size - test_size + 1):
#     # Separar datos de entrenamiento y prueba
#     train = df.iloc[start:start + train_size]
#     test = df.iloc[start + train_size:start + train_size + test_size]
    
#     # Preparar los datos de entrenamiento para la red neuronal
#     X_train, y_train = [], []
#     for i in range(len(train) - input_window_size):
#         X_train.append(train["y"].iloc[i:i + input_window_size].values)
#         y_train.append(train["y"].iloc[i + input_window_size])
    
#     # Convertir a arrays de numpy
#     X_train, y_train = np.array(X_train), np.array(y_train)
    
#     # Normalizar los datos de entrada
#     scaler = StandardScaler()
#     X_train = scaler.fit_transform(X_train)
    
#     # Configurar y entrenar la red neuronal
#     model = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=0)
#     model.fit(X_train, y_train)
    
#     # Generar predicciones para el conjunto de prueba
#     X_test = []
#     for i in range(len(test) - input_window_size):
#         X_test.append(test["y"].iloc[i:i + input_window_size].values)
    
#     # Convertir a array de numpy y normalizar
#     X_test = np.array(X_test)
#     X_test = scaler.transform(X_test)
    
#     # Predecir y ajustar el tamaño de las predicciones
#     predictions = model.predict(X_test)
    
#     # Preparar datos de prueba y predicciones para las métricas
#     forecast_test = pd.DataFrame({
#         "y_true": test["y"].iloc[input_window_size:].values,  # Ajustar para igualar el tamaño de las predicciones
#         "yhat1": predictions
#     })
    
#     # Calcular MAE, RMSE y MAPE para esta ventana y almacenar los resultados
#     mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat1"])
#     rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat1"]))
#     mape = mean_absolute_percentage_error(forecast_test["y_true"], forecast_test["yhat1"]) * 100
    
#     # Almacena los resultados en la lista
#     results.append({
#         "start_date": train.index[0], 
#         "end_date": test.index[-1], 
#         "MAE": mae, 
#         "RMSE": rmse,
#         "MAPE": mape
#     })

# # Convertir resultados en un DataFrame
# results_df = pd.DataFrame(results)

# # Calcula el promedio de cada métrica
# mean_mae = results_df["MAE"].mean()
# mean_rmse = results_df["RMSE"].mean()
# mean_mape = results_df["MAPE"].mean()

# mean_metrics = pd.DataFrame({
#     "MAE": [mean_mae],
#     "RMSE": [mean_rmse],
#     "MAPE (%)": [mean_mape]
# })

# print(results_df)
# print("\nMétricas promedio:")
# # print(mean_metrics)

      start_date  end_date       MAE      RMSE        MAPE
0              0       394  0.010203  0.012416  121.834901
1              1       395  0.011273  0.014777  135.185902
2              2       396  0.012507  0.015542  150.508536
3              3       397  0.011998  0.015319  144.236955
4              4       398  0.012261  0.015339  147.313643
...          ...       ...       ...       ...         ...
1582        1582      1976  0.036262  0.048691  517.187585
1583        1583      1977  0.038250  0.050889  546.200059
1584        1584      1978  0.040039  0.051802  571.588074
1585        1585      1979  0.038285  0.049766  546.703093
1586        1586      1980  0.037384  0.048354  534.160323

[1587 rows x 5 columns]

Métricas promedio:
        MAE      RMSE    MAPE (%)
0  0.030549  0.039383  674.642332


### Prophet

In [None]:
# from prophet import Prophet

# # Lista para almacenar los resultados de cada ventana
# results = []

# # Crear ventana deslizante
# for start in range(0, len(df) - train_size - test_size + 1, test_size):
#     # Separar datos de entrenamiento y prueba
#     train = df.iloc[start:start + train_size]
#     test = df.iloc[start + train_size:start + train_size + test_size]
    
#     # Inicializar y entrenar el modelo Prophet en los datos de entrenamiento
#     model = Prophet()
#     model.fit(train)
    
#     # Generar el dataframe de futuro para el período de prueba
#     future = model.make_future_dataframe(periods=test_size, freq='D')
#     forecast = model.predict(future)
    
#     # Extrae solo las predicciones para el período de prueba
#     forecast_test = forecast.iloc[-test_size:][["ds", "yhat"]]
#     forecast_test["y_true"] = test["y"].values  # Añade los valores reales de prueba

#     # Calcular MAE, RMSE y MAPE para esta ventana y almacenar los resultados
#     mae = mean_absolute_error(forecast_test["y_true"], forecast_test["yhat"])
#     rmse = np.sqrt(mean_squared_error(forecast_test["y_true"], forecast_test["yhat"]))
#     mape = mean_absolute_percentage_error(forecast_test["y_true"], forecast_test["yhat"]) * 100
    
#     # Almacena los resultados en la lista
#     results.append({
#         "start_date": train["ds"].iloc[0], 
#         "end_date": test["ds"].iloc[-1], 
#         "MAE": mae, 
#         "RMSE": rmse,
#         "MAPE": mape
#     })

09:57:20 - cmdstanpy - INFO - Chain [1] start processing
09:57:20 - cmdstanpy - INFO - Chain [1] done processing
09:57:20 - cmdstanpy - INFO - Chain [1] start processing


09:57:20 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] start processing
09:57:21 - cmdstanpy - INFO - Chain [1] done processing
09:57:21 - cmdstanpy - INFO - Chain [1] 

In [None]:
# # Convertir resultados en un DataFrame
# results_df = pd.DataFrame(results)

# # Calcular el promedio de cada métrica
# mean_mae = results_df["MAE"].mean()
# mean_rmse = results_df["RMSE"].mean()
# mean_mape = results_df["MAPE"].mean()

# mean_metrics = pd.DataFrame({
#     "MAE": [mean_mae],
#     "RMSE": [mean_rmse],
#     "MAPE (%)": [mean_mape]
# })

# # display(results_df)
# print("\nMétricas promedio:")
# mean_metrics


Métricas promedio:


Unnamed: 0,MAE,RMSE,MAPE (%)
0,0.000685,0.000745,15.985532


### NeuralNetworkLSTM

Necesitamos preparar los datos de una manera que permita a la red procesar secuencias temporales. El modelo LSTM es particularmente efectivo para series temporales, ya que puede aprender dependencias de largo plazo en los datos.

In [3]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler

2024-11-20 07:53:53.218907: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732085633.270573 2434435 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732085633.286031 2434435 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-20 07:53:53.408860: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Definir los tamaños de las ventanas de entrenamiento y prueba
train_size = 365  # Ejemplo: 1 año de datos para entrenamiento
test_size = 30    # Ejemplo: 1 mes de datos para prueba
input_window_size = 7  # Número de días pasados usados como entrada para predecir el siguiente

# Escalamos los datos
scaler = MinMaxScaler(feature_range=(0, 1))
df['y'] = scaler.fit_transform(df[['y']])

# Lista para almacenar los resultados de cada ventana
results = []

# Crear ventana deslizante
for start in range(0, len(df) - train_size - test_size + 1):
    # Separar datos de entrenamiento y prueba
    train = df.iloc[start:start + train_size]
    test = df.iloc[start + train_size:start + train_size + test_size]
    
    # Preparar los datos para la LSTM
    X_train, y_train = [], []
    for i in range(len(train) - input_window_size):
        X_train.append(train["y"].iloc[i:i + input_window_size].values)
        y_train.append(train["y"].iloc[i + input_window_size])
    
    X_train, y_train = np.array(X_train), np.array(y_train)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))  # reshape para LSTM

    # Crear y entrenar el modelo LSTM
    model = Sequential([
        LSTM(50, activation='relu', input_shape=(input_window_size, 1)),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=20, verbose=0)
    
    # Preparar los datos de prueba para predicciones
    X_test, y_test = [], []
    for i in range(len(test) - input_window_size):
        X_test.append(test["y"].iloc[i:i + input_window_size].values)
        y_test.append(test["y"].iloc[i + input_window_size])
    
    X_test = np.array(X_test)
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))  # reshape para LSTM
    y_test = np.array(y_test)
    
    # Hacer predicciones
    predictions = model.predict(X_test)
    predictions = scaler.inverse_transform(predictions).flatten()  # Desescalar las predicciones
    y_test = scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()  # Desescalar los valores reales
    
    # Calcular MAE, RMSE y MAPE para esta ventana y almacenar los resultados
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mape = mean_absolute_percentage_error(y_test, predictions) * 100
    
    # Almacena los resultados en la lista
    results.append({
        "start_date": train.index[0], 
        "end_date": test.index[-1], 
        "MAE": mae, 
        "RMSE": rmse,
        "MAPE": mape
    })

2024-11-20 07:53:55.131318: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83

In [5]:
# Convertir resultados en un DataFrame
results_df = pd.DataFrame(results)

# Calcular el promedio de cada métrica
mean_mae = results_df["MAE"].mean()
mean_rmse = results_df["RMSE"].mean()
mean_mape = results_df["MAPE"].mean()

mean_metrics = pd.DataFrame({
    "Model": ['LSTM'],
    "Cluster": ['0-9'],
    "MAE": [mean_mae],
    "RMSE": [mean_rmse],
    "MAPE (%)": [mean_mape]
})

print(results_df)
print("\nMétricas promedio:")
mean_metrics

      start_date  end_date       MAE      RMSE      MAPE
0              0       394  0.000550  0.000710  5.091506
1              1       395  0.000629  0.000820  5.727103
2              2       396  0.000750  0.000935  6.762461
3              3       397  0.000563  0.000752  5.151147
4              4       398  0.000574  0.000763  5.223930
...          ...       ...       ...       ...       ...
1261        1261      1655  0.000863  0.001060  2.609604
1262        1262      1656  0.000823  0.001020  2.511323
1263        1263      1657  0.000778  0.000969  2.405186
1264        1264      1658  0.000763  0.000973  2.355797
1265        1265      1659  0.000913  0.001072  2.899719

[1266 rows x 5 columns]

Métricas promedio:


Unnamed: 0,Model,Cluster,MAE,RMSE,MAPE (%)
0,LSTM,0-9,0.000834,0.000994,4.515943


In [6]:
mean_metrics.to_csv('results_neuralLSTM/NeuralNetworkLSTM_sub_0_9.csv', index=False)

In [7]:
# Get a list of all CSV files in the directory
csv_files = sorted(glob.glob('results_neuralLSTM/NeuralNetworkLSTM_sub_0_[0-9].csv'))

# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    df_temp = pd.read_csv(file)
    dfs.append(df_temp)

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Optional: Reset the index of the final DataFrame
df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Model,Cluster,MAE,RMSE,MAPE (%)
0,LSTM,0-0,0.00096,0.001148,3.919663
1,LSTM,0-1,0.000875,0.00104,4.638729
2,LSTM,0-2,0.001065,0.00126,3.989844
3,LSTM,0-3,0.000947,0.00113,4.283326
4,LSTM,0-4,0.002145,0.002675,15.656267
5,LSTM,0-5,0.000807,0.000977,4.05601
6,LSTM,0-6,0.000863,0.001018,4.190122
7,LSTM,0-7,0.000992,0.001189,4.715693
8,LSTM,0-8,0.000967,0.001157,3.942132
9,LSTM,0-9,0.000834,0.000994,4.515943


In [8]:
df.to_csv('results_neuralLSTM/NeuralNetworkLSTM_res_sub0.csv', index=False)

In [50]:
train

Unnamed: 0,ds,y
1586,2023-03-18,0.204968
1587,2023-03-19,0.211014
1588,2023-03-20,0.224457
1589,2023-03-21,0.222198
1590,2023-03-22,0.221395
...,...,...
1946,2024-03-12,0.464583
1947,2024-03-13,0.461759
1948,2024-03-14,0.459748
1949,2024-03-15,0.449475


In [53]:
test

Unnamed: 0,ds,y
1951,2024-03-17,0.43225
1952,2024-03-18,0.44426
1953,2024-03-19,0.434347
1954,2024-03-20,0.458445
1955,2024-03-21,0.459348
1956,2024-03-22,0.454035
1957,2024-03-23,0.448474
1958,2024-03-24,0.44531
1959,2024-03-25,0.446703
1960,2024-03-26,0.465699
