
# Henry Hub Gas Price Forecast (Daily)

Forecast next-day Henry Hub spot gas to feed energy-price models.



## Workflow
1. Load daily Henry Hub series.
2. Engineer lagged features, rolling stats, and calendar terms.
3. Train XGBoost to predict next-day price; evaluate MAE/RMSE.
4. Generate a 14-day iterative forecast and export for downstream use.


In [1]:

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

try:
    import xgboost as xgb
except ImportError:
    import subprocess, sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'xgboost'])
    import xgboost as xgb

DATA_PATH = Path('data/daily_henryhub_naturalgasspotprice.csv')
EXPORT_PATH = Path('output_data/gas_forecast_daily.csv')


## Load and prepare daily gas data

In [2]:

gas = pd.read_csv(DATA_PATH, encoding='utf-8-sig')
gas = gas.rename(columns={'Henry Hub Natural Gas Spot Price (Dollars per Million Btu)': 'gas_price'})
gas['date'] = pd.to_datetime(gas['Date'])
gas['gas_price'] = pd.to_numeric(gas['gas_price'], errors='coerce')
gas = gas.dropna(subset=['gas_price']).copy()

gas = gas.sort_values('date').set_index('date')
gas['lag_1'] = gas['gas_price'].shift(1)
gas['lag_7'] = gas['gas_price'].shift(7)
gas['lag_30'] = gas['gas_price'].shift(30)
gas['rolling_7'] = gas['gas_price'].rolling(7).mean()
gas['rolling_30'] = gas['gas_price'].rolling(30).mean()
gas['vol_7'] = gas['gas_price'].rolling(7).std()
gas['vol_30'] = gas['gas_price'].rolling(30).std()
gas['dayofyear'] = gas.index.dayofyear
gas['doy_sin'] = np.sin(2 * np.pi * gas['dayofyear'] / 365.25)
gas['doy_cos'] = np.cos(2 * np.pi * gas['dayofyear'] / 365.25)
gas['month'] = gas.index.month

gas['target'] = gas['gas_price'].shift(-1)
gas = gas.dropna().copy()
feature_cols = ['gas_price', 'lag_1', 'lag_7', 'lag_30', 'rolling_7', 'rolling_30', 'vol_7', 'vol_30', 'doy_sin', 'doy_cos', 'month']
X = gas[feature_cols]
y = gas['target']

split_idx = int(len(gas) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

gas_model = xgb.XGBRegressor(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)
gas_model.fit(X_train, y_train)

train_pred = gas_model.predict(X_train)
test_pred = gas_model.predict(X_test)
metrics = {
    'train_mae': mean_absolute_error(y_train, train_pred),
    'train_rmse': mean_squared_error(y_train, train_pred, squared=False),
    'test_mae': mean_absolute_error(y_test, test_pred),
    'test_rmse': mean_squared_error(y_test, test_pred, squared=False)
}
metrics


  gas['date'] = pd.to_datetime(gas['Date'])


{'train_mae': np.float64(0.07384108209965402),
 'train_rmse': np.float64(0.0999768438087536),
 'test_mae': np.float64(0.23725477099336026),
 'test_rmse': np.float64(0.6638960674688891)}

## Persist metrics

In [3]:
import json
metrics_path = Path('output_data/gas_forecast_metrics.json')
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)
metrics_path

PosixPath('output_data/gas_forecast_metrics.json')

## 14-day iterative forecast

In [4]:

forecast_horizon = 14
last_date = gas.index.max()
forecast_dates = pd.date_range(last_date + pd.Timedelta(days=1), periods=forecast_horizon, freq='D')

history = gas[['gas_price']].copy()
predictions = []
for date in forecast_dates:
    features = {}
    features['gas_price'] = history.iloc[-1]['gas_price']
    features['lag_1'] = history.iloc[-1]['gas_price']
    features['lag_7'] = history.iloc[-7]['gas_price'] if len(history) >= 7 else history.iloc[-1]['gas_price']
    features['lag_30'] = history.iloc[-30]['gas_price'] if len(history) >= 30 else history.iloc[-1]['gas_price']
    features['rolling_7'] = history['gas_price'].tail(7).mean()
    features['rolling_30'] = history['gas_price'].tail(30).mean()
    features['vol_7'] = history['gas_price'].tail(7).std()
    features['vol_30'] = history['gas_price'].tail(30).std()
    dayofyear = date.timetuple().tm_yday
    features['doy_sin'] = np.sin(2 * np.pi * dayofyear / 365.25)
    features['doy_cos'] = np.cos(2 * np.pi * dayofyear / 365.25)
    features['month'] = date.month
    X_future = pd.DataFrame([features])[feature_cols]
    pred = float(gas_model.predict(X_future))
    predictions.append(pred)
    history.loc[date] = pred

forecast_df = pd.DataFrame({
    'date': forecast_dates,
    'predicted_gas_price': predictions
})
forecast_df.head()


  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))
  pred = float(gas_model.predict(X_future))


Unnamed: 0,date,predicted_gas_price
0,2025-11-01,3.543808
1,2025-11-02,3.559819
2,2025-11-03,3.571776
3,2025-11-04,3.580763
4,2025-11-05,3.600846


## Export forecast

In [5]:

forecast_df.to_csv(EXPORT_PATH, index=False)
EXPORT_PATH


PosixPath('output_data/gas_forecast_daily.csv')