In [1]:
# Environment check: print the path of the Python interpreter running this kernel.
import sys
print(sys.executable)


d:\2025\Andes\venv\Scripts\python.exe


In [2]:
import xgboost as xgb

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import matplotlib.pyplot as plt

In [5]:
# 1. Load the data set.
# The first CSV column is used as the index and pandas is asked to parse it as dates.
DATA_PATH = 'd:/2025/Andes/df_final.csv'
df_final = pd.read_csv(DATA_PATH, index_col=0, parse_dates=True)

In [6]:
# 2. Min-max scale the data.
# `scaler` handles every column of the frame; `scaler_y` handles only the
# 'price actual' column so that values of that column can be un-scaled alone.
# NOTE(review): both scalers are fitted on the complete data set, i.e. before
# the train/validation/test split — confirm this is acceptable.
all_values = df_final.values
price_values = df_final[['price actual']].values

scaler = MinMaxScaler().fit(all_values)
dataset_norm = scaler.transform(all_values)

scaler_y = MinMaxScaler().fit(price_values)
y_norm = scaler_y.transform(price_values)

In [7]:
# 3. Split the scaled matrix into inputs and target.
# Every column except the last is a feature; the last column is the target
# (presumably 'price actual' — verify the column order of df_final).
features, target = dataset_norm[:, :-1], dataset_norm[:, -1]

In [8]:
# 4. Window parameters (counted in rows of the data set)
HOURS_PER_WEEK = 7 * 24

past_history = HOURS_PER_WEEK   # look-back window; use 720 for a monthly forecast
future_target = HOURS_PER_WEEK  # forecast horizon: one week (7 days * 24 hours)
step = 1                        # stride used when sampling rows inside the look-back window

In [9]:
# 5. Build multivariate sliding-window sequences
def multivariate_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    """Cut a series into (history window, future label) training samples.

    For every position ``i`` from ``start_index + history_size`` up to (but not
    including) ``end_index``, one sample is produced:

    * input: the rows of ``dataset`` at ``range(i - history_size, i, step)``;
    * label: ``target[i + target_size]`` when ``single_step`` is true,
      otherwise the slice ``target[i : i + target_size]``.

    Args:
        dataset: Indexable array of input rows (indexed with a ``range``).
        target: Indexable array of target values aligned with ``dataset``.
        start_index: First row that may be used as history.
        end_index: Exclusive upper bound for ``i``; ``None`` means
            ``len(dataset) - target_size``.
        history_size: Number of rows spanned by each history window.
        target_size: Label offset (single-step) or label length (multi-step).
        step: Stride used when sampling rows inside the history window.
        single_step: Select one future value instead of a slice of values.

    Returns:
        Tuple ``(inputs, labels)`` of NumPy arrays built from the samples.
    """
    first_pos = start_index + history_size
    last_pos = len(dataset) - target_size if end_index is None else end_index

    windows = []
    outputs = []
    for pos in range(first_pos, last_pos):
        # History rows that precede `pos`, sampled every `step` rows.
        windows.append(dataset[range(pos - history_size, pos, step)])

        if single_step:
            outputs.append(target[pos + target_size])
        else:
            outputs.append(target[pos:pos + target_size])

    return np.array(windows), np.array(outputs)

In [10]:
# 6. Build the windowed samples over the whole series.
# end_index=None lets the helper stop at len(features) - future_target;
# single_step=False makes each label a slice of future_target values.
window_kwargs = dict(start_index=0, end_index=None,
                     history_size=past_history, target_size=future_target,
                     step=step, single_step=False)
X_seq, y_seq = multivariate_data(features, target, **window_kwargs)

In [11]:
# 7. Split in order (no shuffling): first 70 % train, next 20 % validation, last 10 % test.
# NOTE(review): consecutive windows overlap, so samples next to a split
# boundary share rows with the neighbouring subset — confirm this is acceptable.
n_windows = len(X_seq)
split1 = int(n_windows * 0.7)
split2 = int(n_windows * 0.9)

X_train, X_val, X_test = X_seq[:split1], X_seq[split1:split2], X_seq[split2:]
y_train, y_val, y_test = y_seq[:split1], y_seq[split1:split2], y_seq[split2:]

In [12]:
# Flatten every (window_rows, n_features) sample into one row so the data is
# tabular: one column per (row-in-window, feature) pair.
X_train_xgb, X_val_xgb, X_test_xgb = (
    part.reshape(-1, part.shape[1] * part.shape[2])
    for part in (X_train, X_val, X_test)
)

In [None]:
# Hyper-parameters for the native XGBoost training API.
# 'reg:squarederror' is the current name of the squared-error objective; the
# old alias 'reg:linear' is deprecated, and the per-step models trained later
# in this notebook already use 'reg:squarederror'.
# NOTE(review): max_depth=100 is far deeper than the max_depth=8 used by the
# per-step models further down — confirm it is intentional.
param = {'eta': 0.02, 'max_depth': 100,
         'subsample': 1.0, 'colsample_bytree': 0.95,
         'alpha': 0.1, 'lambda': 0.15, 'gamma': 0.1,
         'objective': 'reg:squarederror', 'eval_metric': 'rmse',
         'verbosity': 0, 'min_child_weight': 0.1, 'n_jobs': -1}

# The labels are 2-D: one column per forecast step (future_target columns).
# NOTE(review): 2-D labels need an XGBoost release with multi-output support — confirm
# the installed version.
dtrain = xgb.DMatrix(X_train_xgb, label=y_train)
dval = xgb.DMatrix(X_val_xgb, label=y_val)
dtest = xgb.DMatrix(X_test_xgb, label=y_test)

# Early stopping monitors the LAST entry of `evals`, i.e. the validation set.
eval_list = [(dtrain, 'train'), (dval, 'eval')]
xgb_model = xgb.train(param, dtrain, num_boost_round=100, evals=eval_list, early_stopping_rounds=5)

In [None]:
# Scaler used to bring targets and forecasts back to their original units.
# `target` (and therefore y_train / y_test) is the LAST column of the min-max
# scaled data set, so the inverse scaler has to be fitted on that same column
# in its RAW units. Fitting it on y_train, which is already scaled, would make
# inverse_transform hand back scaled values instead of prices.
# (The last column is presumably 'price actual' — verify the column order.)
scaler_y = MinMaxScaler()
scaler_y.fit(df_final.iloc[:, [-1]].values)

In [None]:
# Send the test targets and the forecasts back through scaler_y.
# Everything is flattened to a single column because scaler_y is fitted on
# exactly one column.
y_test_inv = scaler_y.inverse_transform(y_test.reshape(-1, 1))
xgb_forecast = xgb_model.predict(dtest).reshape(-1, 1)
xgb_forecast_inv = scaler_y.inverse_transform(xgb_forecast)

# Evaluation metric: RMSE pooled over every test window and every forecast
# step, so the printed label reports the full horizon rather than "one hour ahead".
rmse_xgb = sqrt(mean_squared_error(y_test_inv, xgb_forecast_inv))
print(f'RMSE del forecast XGBoost ({future_target} horas adelante): {rmse_xgb:.3f}')

In [None]:
# Plot the first 100 flattened values: actual vs. XGBoost forecast.
index_range = range(100)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(index_range, y_test_inv[index_range],
        label='Valores Reales', marker='.', color='blue')
ax.plot(index_range, xgb_forecast_inv[index_range],
        label='Predicciones XGBoost', marker='x', color='red')
ax.set_title('Valores Reales vs Predicciones XGBoost (100 primeros)')
ax.set_xlabel('Tiempo')
ax.set_ylabel('Precio')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Forecast beyond the end of the data set (up to 100 hours).
# The forecast has to come from the most recent history window: slicing the
# test-set predictions would only relabel old back-test values with future dates.
# The window uses the same rows the training windows use: the last
# `past_history` rows, sampled every `step` rows.
last_window = features[len(features) - past_history::step]
d_future = xgb.DMatrix(last_window.reshape(1, -1))
future_norm = xgb_model.predict(d_future).reshape(-1, 1)
future_prices = scaler_y.inverse_transform(future_norm).ravel()

# Never ask for more steps than the model produced.
n_future = min(100, len(future_prices))

# The index holds the timestamps ('time' is no longer a column).
# Assumes one row per hour — TODO confirm against the data source.
last_date = df_final.index.max()
future_dates = [last_date + pd.Timedelta(hours=h) for h in range(1, n_future + 1)]

predictions_df = pd.DataFrame({
    'DATETIME': future_dates,
    'Predicted_Price': future_prices[:n_future]
})

plt.figure(figsize=(15, 6))
plt.plot(df_final.index, df_final['price actual'], label='Histórico Precio', color='blue')
plt.plot(predictions_df['DATETIME'], predictions_df['Predicted_Price'], label='Predicción (100 horas)', color='red', linestyle='--')
plt.xlabel('Fecha')
plt.ylabel('Precio')
plt.title('Precio Real vs Predicción XGBoost para las próximas 100 horas')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Flatten input for XGBoost (tabular format).
# Uses the X_train / X_val / X_test splits created earlier in the notebook; no
# *_seq variants of those names are defined anywhere in it. The row width is
# taken from the array itself (shape[1] * shape[2]) so it stays correct when
# `step` is not 1.
X_train_xgb = X_train.reshape(-1, X_train.shape[1] * X_train.shape[2])
X_val_xgb = X_val.reshape(-1, X_val.shape[1] * X_val.shape[2])
X_test_xgb = X_test.reshape(-1, X_test.shape[1] * X_test.shape[2])

# Targets stay 2-D: each row holds the future_target steps that follow the window.
y_train_xgb = y_train
y_val_xgb = y_val
y_test_xgb = y_test

In [None]:
# Train one model per forecast step (1 to future_target).
xgb_models = []
xgb_preds = []

# The hyper-parameters and the test matrix are the same for every step, so
# they are built once instead of once per iteration.
param = {
    'eta': 0.03, 'max_depth': 8,
    'subsample': 0.8, 'colsample_bytree': 0.8,
    'alpha': 0.1, 'lambda': 0.15, 'gamma': 0.1,
    'objective': 'reg:squarederror', 'eval_metric': 'rmse',
    'n_jobs': -1, 'verbosity': 0
}
dtest = xgb.DMatrix(X_test_xgb)

for i in range(future_target):
    # Column i of the label matrix is the target i steps into the forecast window.
    dtrain = xgb.DMatrix(X_train_xgb, label=y_train_xgb[:, i])
    dval = xgb.DMatrix(X_val_xgb, label=y_val_xgb[:, i])

    # Early stopping monitors the last entry of `evals` (the validation set).
    eval_list = [(dtrain, 'train'), (dval, 'eval')]

    model = xgb.train(param, dtrain, num_boost_round=100, evals=eval_list, early_stopping_rounds=5, verbose_eval=False)
    preds = model.predict(dtest)

    xgb_models.append(model)
    xgb_preds.append(preds)

In [None]:
# Turn the list of per-step predictions into a 2-D array (n_samples, future_target).
# Guarded so that re-running this cell does not stack an already stacked array.
if isinstance(xgb_preds, list):
    xgb_preds = np.stack(xgb_preds, axis=1)

# Undo the scaling. scaler_y is fitted on a single column, so the values are
# passed through as one column and then restored to their original shape
# instead of relying on broadcasting over future_target columns.
xgb_preds_inv = scaler_y.inverse_transform(xgb_preds.reshape(-1, 1)).reshape(xgb_preds.shape)
y_test_inv = scaler_y.inverse_transform(y_test_xgb.reshape(-1, 1)).reshape(y_test_xgb.shape)

In [None]:
# Metrics pooled over every test window and every forecast step
actual_flat = y_test_inv.ravel()
forecast_flat = xgb_preds_inv.ravel()

rmse = sqrt(mean_squared_error(actual_flat, forecast_flat))
mae = mean_absolute_error(actual_flat, forecast_flat)
print(f"XGBoost Multistep RMSE: {rmse:.4f}, MAE: {mae:.4f}")

In [None]:
# --- Full-horizon plot: actual vs. forecast for the first test window ---
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(y_test_inv[0], label='Real', color='lightblue', linewidth=2)
ax.plot(xgb_preds_inv[0], label='Predicción XGBoost (168 pasos)',
        color='black', linestyle='--', linewidth=2)
ax.set_title('Predicción XGBoost - Primer ejemplo multistep')
ax.set_xlabel('Horas Futuras')
ax.set_ylabel('Precio')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# --- Zoom: recent history followed by the forecast for the first test window ---
hist_len = 720  # 30 days

# y_test_inv[0] only holds the future_target values that FOLLOW the first test
# window, so it cannot provide 720 hours of history. The history is taken from
# the series itself: test sample 0 is window number `split2`, and window k
# starts forecasting at row `past_history + k` (windows are built from
# start_index=0).
forecast_start = split2 + past_history
hist_start = max(0, forecast_start - hist_len)  # fewer rows if the series is short
recent_norm = target[hist_start:forecast_start]
# Un-scale with the same scaler used for the predictions so both share units.
recent_hist = scaler_y.inverse_transform(recent_norm.reshape(-1, 1)).ravel()
n_hist = len(recent_hist)

plt.figure(figsize=(14, 6))
plt.plot(range(n_hist), recent_hist, label='Histórico reciente', color='lightgray')
plt.plot(range(n_hist, n_hist + future_target), xgb_preds_inv[0], label='Predicción', color='black')
plt.title('Zoom: Histórico reciente + 7 días predicción XGBoost')
plt.xlabel('Horas')
plt.ylabel('Precio')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()