In [1]:
import pandas as pd
import numpy as np
import optuna

from datetime import datetime
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../Data/transformed/transformed_open_meteo.csv')

In [3]:
df.head(5)

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,precipitation_sum,windspeed_10m_max,provincia,sin_day_of_year,cos_day_of_year
0,2013-01-01,12.3,9.3,10.8,1.2,19.8,A Coruña,0.017166,0.999853
1,2013-01-02,12.4,6.0,9.1,0.0,20.4,A Coruña,0.034328,0.999411
2,2013-01-03,16.1,6.4,10.1,0.0,17.1,A Coruña,0.051479,0.998674
3,2013-01-04,15.2,5.0,9.0,0.0,11.3,A Coruña,0.068615,0.997643
4,2013-01-05,14.2,4.1,8.2,0.0,9.7,A Coruña,0.085731,0.996318


In [4]:
pred_variables = ['temperature_2m_max','temperature_2m_min','temperature_2m_mean', 'precipitation_sum', 'windspeed_10m_max']

In [5]:
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')
df.set_index('date', inplace=True)

In [6]:
if 'day_of_year' not in df.columns:
    df['day_of_year'] = df.index.dayofyear
df['sin_day_of_year'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
df['cos_day_of_year'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
X = df[['sin_day_of_year', 'cos_day_of_year']]

In [7]:
print("Primeros 3 registros de la serie:")
print(df.head(3))

Primeros 3 registros de la serie:
            temperature_2m_max  temperature_2m_min  temperature_2m_mean  \
date                                                                      
2013-01-01                12.3                 9.3                 10.8   
2013-01-01                10.0                 4.9                  7.2   
2013-01-01                18.9                14.9                 16.7   

            precipitation_sum  windspeed_10m_max   provincia  sin_day_of_year  \
date                                                                            
2013-01-01                1.2               19.8    A Coruña         0.017213   
2013-01-01                4.9               20.7    La Rioja         0.017213   
2013-01-01                0.0               22.6  Las Palmas         0.017213   

            cos_day_of_year  day_of_year  
date                                      
2013-01-01         0.999852            1  
2013-01-01         0.999852            1  
2013-01-01  

In [8]:
X_train, X_test, y_dummy_train, y_dummy_test = train_test_split(X, df[pred_variables[0]], test_size=0.2, shuffle=False)
print(f"Observaciones en entrenamiento: {len(X_train)}")
print(f"Observaciones en test: {len(X_test)}")

Observaciones en entrenamiento: 184636
Observaciones en test: 46160


In [9]:
results = {}

In [None]:
for var in pred_variables:
    print(f"\n=== Procesando la variable: {var} ===")
    
    # Extraer la serie objetivo
    y = df[var]
    
    # Dividir la serie en entrenamiento y test utilizando la misma partición definida anteriormente
    y_train = y.loc[X_train.index]
    y_test  = y.loc[X_test.index]
    
    # Adicionalmente, dividir el conjunto de entrenamiento en subentrenamiento y validación para la optimización
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)
    print(f"Subentrenamiento: {len(y_train_sub)}, Validación: {len(y_val)}")
    
    # 4.1. Definir la función objetivo para Optuna (para la variable actual)
    def objective(trial):
        # Espacio de búsqueda para los hiperparámetros de SARIMAX
        p = trial.suggest_int("p", 0, 3)
        d = trial.suggest_int("d", 0, 2)
        q = trial.suggest_int("q", 0, 3)
        # Parámetros para el componente estacional
        P = trial.suggest_int("P", 0, 1)
        D = trial.suggest_int("D", 0, 1)
        Q = trial.suggest_int("Q", 0, 1)
        m = 1  # Como usamos variables exógenas para capturar la estacionalidad, m se fija en 1

        try:
            # Entrenar modelo SARIMAX en el subentrenamiento
            model = SARIMAX(y_train_sub, exog=X_train_sub, order=(p, d, q), seasonal_order=(P, D, Q, m))
            model_fit = model.fit(disp=False)
            # Predecir en el conjunto de validación
            y_pred_val = model_fit.predict(start=y_val.index[0], end=y_val.index[-1], exog=X_val)
            rmse = mean_squared_error(y_val, y_pred_val, squared=False)
        except Exception as e:
            rmse = float('inf')
        return rmse

    # 4.2. Optimización de hiperparámetros con Optuna
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)
    
    best_params = study.best_params
    print("Mejores hiperparámetros encontrados:")
    print(best_params)
    print(f"Mejor RMSE en validación: {study.best_value:.2f}")
    
    # 4.3. Entrenar el modelo final con los mejores hiperparámetros en todo el conjunto de entrenamiento
    p, d, q = best_params["p"], best_params["d"], best_params["q"]
    P, D, Q = best_params["P"], best_params["D"], best_params["Q"]
    m = 1
    
    final_model = SARIMAX(y_train, exog=X_train, order=(p, d, q), seasonal_order=(P, D, Q, m))
    final_model_fit = final_model.fit(disp=False)
    
    # 4.4. Predecir en el conjunto de test
    y_test_pred = final_model_fit.predict(start=y_test.index[0], end=y_test.index[-1], exog=X_test)
    
    # 4.5. Evaluar el modelo final en test usando métricas de sklearn
    mae = mean_absolute_error(y_test, y_test_pred)
    rmse = mean_squared_error(y_test, y_test_pred, squared=False)
    print(f"Evaluación en test para {var}: MAE = {mae:.2f}, RMSE = {rmse:.2f}")
    
    # Almacenar resultados
    results[var] = {
        'model': final_model_fit,
        'best_params': best_params,
        'y_test': y_test,
        'y_pred': y_test_pred,
        'mae': mae,
        'rmse': rmse
    }

# Opcional: visualizar un resumen de resultados
print("\nResumen final de métricas:")
for var, res in results.items():
    print(f"{var} - MAE: {res['mae']:.2f}, RMSE: {res['rmse']:.2f}")


=== Procesando la variable: temperature_2m_max ===


ValueError: Found input variables with inconsistent numbers of samples: [184636, 9601072]